def init(self, config, is_training, input_ids, input_ids2, input_mask,
         input_mask2, token_type_ids, segment_ids2, use_one_hot_embeddings):
    """Build two BERT towers and fuse them into sequence and pooled outputs.

    The first tower reads (input_ids, input_mask, token_type_ids) inside the
    ``dual_model_prefix1`` variable scope; the second reads (input_ids2,
    input_mask2, segment_ids2) inside ``dual_model_prefix2``.

    Sets:
        self.sequence_output: the two towers' sequence outputs concatenated
            along axis 2.
        self.pooled_output: a tanh Dense layer with ``config.hidden_size``
            units applied to the concatenation (axis 1) of each tower's
            position-0 vector.
    """
    # Keyword arguments that are identical for both towers.
    shared_kwargs = dict(
        config=config,
        is_training=is_training,
        use_one_hot_embeddings=use_one_hot_embeddings,
    )

    with tf.compat.v1.variable_scope(dual_model_prefix1):
        tower_a = BertModel(
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            **shared_kwargs
        )
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        tower_b = BertModel(
            input_ids=input_ids2,
            input_mask=input_mask2,
            token_type_ids=segment_ids2,
            **shared_kwargs
        )

    seq_a = tower_a.get_sequence_output()
    seq_b = tower_b.get_sequence_output()

    # Position 0 of each tower is used as that tower's summary vector.
    first_tokens = tf.concat([seq_a[:, 0, :], seq_b[:, 0, :]], axis=1)
    self.sequence_output = tf.concat([seq_a, seq_b], axis=2)

    projection = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    self.pooled_output = projection(first_tokens)
def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True,
             features=None, scope=None):
    """Pair a plain BERT tower with a DoubleLengthInputModel tower.

    The first tower consumes the positional inputs (input_ids, input_mask,
    token_type_ids). The second tower consumes the six tensors read from
    ``features`` under the keys ``input_ids1``/``input_mask1``/
    ``segment_ids1`` and ``input_ids2``/``input_mask2``/``segment_ids2``.
    Although ``features`` defaults to None, it is indexed unconditionally.

    Sets:
        self.sequence_output: the first tower's sequence output only.
        self.pooled_output: a tanh Dense layer with ``config.hidden_size``
            units over both towers' position-0 vectors concatenated on
            axis 1.
    """
    super(DualBertTwoInputWithDoubleInputLength, self).__init__()

    feature_keys = (
        "input_ids1", "input_mask1", "segment_ids1",
        "input_ids2", "input_mask2", "segment_ids2",
    )
    ids_a, mask_a, seg_a, ids_b, mask_b, seg_b = (
        features[key] for key in feature_keys
    )

    with tf.compat.v1.variable_scope(dual_model_prefix1):
        short_tower = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        long_tower = DoubleLengthInputModel(
            config,
            is_training,
            ids_a,
            mask_a,
            seg_a,
            ids_b,
            mask_b,
            seg_b,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )

    short_first = short_tower.get_sequence_output()[:, 0, :]
    long_first = long_tower.get_sequence_output()[:, 0, :]
    fused = tf.concat([short_first, long_first], axis=1)

    # Only the first tower's token-level output is exposed.
    self.sequence_output = short_tower.get_sequence_output()

    projection = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    self.pooled_output = projection(fused)
class ProjectedMaxPooling(BertModelInterface):
    """BERT wrapper whose pooled output averages the sequence output.

    NOTE(review): despite the class name, the visible code applies neither a
    projection nor a max: the Dense projection is disabled and the reduction
    is ``tf.reduce_mean`` over axis 1. Confirm whether this is intended.
    """

    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=True,
                 scope=None):
        """Deep-copy ``config`` and build the wrapped BertModel."""
        super().__init__()
        # Work on a private copy so the caller's config object is untouched.
        own_config = copy.deepcopy(config)
        self.config = own_config
        self.vector_size = own_config.vector_size
        self.bert_model = BertModel(own_config, is_training, input_ids,
                                    input_mask, token_type_ids,
                                    use_one_hot_embeddings, scope)

    def get_pooled_output(self):
        """Return the mean of the BERT sequence output over axis 1."""
        token_vectors = self.bert_model.get_sequence_output()
        # A tanh Dense projection to `self.vector_size` units (initialized
        # via create_initializer(self.config.initializer_range)) used to be
        # applied here; it is currently disabled, so the sequence output is
        # reduced as-is.
        return tf.reduce_mean(token_vectors, axis=1)
class SimpleSharingModel:
    """Run one BertModel over an LM batch and an NLI batch stacked on axis 0.

    The masked-LM inputs come first along axis 0 and the NLI inputs second;
    the accessors below slice the shared model's outputs back apart using
    the LM batch size.
    """

    def __init__(
            self,
            config,
            use_one_hot_embeddings,
            is_training,
            masked_input_ids,
            input_mask,
            segment_ids,
            nli_input_ids,
            nli_input_mask,
            nli_segment_ids,
    ):
        """Concatenate both input sets along axis 0 and build the model."""

        def stack(lm_part, nli_part):
            return tf.concat([lm_part, nli_part], axis=0)

        joint_ids = stack(masked_input_ids, nli_input_ids)
        joint_mask = stack(input_mask, nli_input_mask)
        joint_segments = stack(segment_ids, nli_segment_ids)

        # First dimension of the LM input marks where the NLI rows begin.
        lm_shape = get_shape_list2(masked_input_ids)
        self.batch_size, _ = lm_shape
        self.model = BertModel(config, is_training, joint_ids, joint_mask,
                               joint_segments, use_one_hot_embeddings)

    def lm_sequence_output(self):
        """Return the sequence output rows that belong to the LM batch."""
        full_sequence = self.model.get_sequence_output()
        return full_sequence[:self.batch_size]

    def get_embedding_table(self):
        """Return the shared model's embedding table."""
        return self.model.get_embedding_table()

    def get_tt_feature(self):
        """Return the pooled output rows that follow the LM batch."""
        full_pooled = self.model.get_pooled_output()
        return full_pooled[self.batch_size:]
class AddLayerSharingModel:
    """Shared BertModel with one extra ForwardLayer for the second task.

    LM inputs and "tt" inputs are stacked along axis 0 and encoded together.
    The rows after the LM batch are passed through an additional
    ``ForwardLayer`` and then ``mimic_pooling`` to produce ``tt_feature``.
    """

    def __init__(
            self,
            config,
            use_one_hot_embeddings,
            is_training,
            masked_input_ids,
            input_mask,
            segment_ids,
            tt_input_ids,
            tt_input_mask,
            tt_segment_ids,
    ):
        """Build the shared encoder and the extra layer for the tt rows."""
        joint_ids = tf.concat([masked_input_ids, tt_input_ids], axis=0)
        joint_mask = tf.concat([input_mask, tt_input_mask], axis=0)
        joint_segments = tf.concat([segment_ids, tt_segment_ids], axis=0)

        self.config = config
        lm_shape = get_shape_list2(masked_input_ids)
        self.lm_batch_size, _ = lm_shape
        self.model = BertModel(config, is_training, joint_ids, joint_mask,
                               joint_segments, use_one_hot_embeddings)

        self.tt_layer = ForwardLayer(
            config, base.create_initializer(config.initializer_range))
        self.tt_input_mask = tt_input_mask

        # Rows after the LM batch belong to the tt inputs.
        tt_sequence = self.model.get_sequence_output()[self.lm_batch_size:]
        tt_batch_size, seq_length = get_shape_list2(tt_input_ids)
        tt_attention_mask = create_attention_mask_from_input_mask2(
            tt_sequence, self.tt_input_mask)
        print('tt_attention_mask', tt_attention_mask.shape)
        print("seq_output", tt_sequence.shape)

        tt_sequence = self.tt_layer.apply_3d(
            tt_sequence, tt_batch_size, seq_length, tt_attention_mask)
        self.tt_feature = mimic_pooling(
            tt_sequence,
            self.config.hidden_size,
            self.config.initializer_range)

    def lm_sequence_output(self):
        """Return the sequence output rows that belong to the LM batch."""
        full_sequence = self.model.get_sequence_output()
        return full_sequence[:self.lm_batch_size]

    def get_embedding_table(self):
        """Return the shared model's embedding table."""
        return self.model.get_embedding_table()

    def get_tt_feature(self):
        """Return the feature computed for the tt rows in ``__init__``."""
        return self.tt_feature
def tlm2_raw_prob(bert_config, use_one_hot_embeddings, input_ids, input_mask,
                  segment_ids):
    """Score inputs with IndependentLossModel and also return raw probs.

    A BertModel is built with ``is_training=False`` and its sequence output
    is fed to ``IndependentLossModel.build_predictions``.

    Returns:
        A 3-tuple ``(output, prob1, prob2)`` where ``output`` is
        ``-(prob1 - prob2)`` and ``prob1``/``prob2`` are the attributes of
        the loss model.
    """
    encoder = BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
    )
    predictor = IndependentLossModel(bert_config)
    predictor.build_predictions(encoder.get_sequence_output())

    score = -(predictor.prob1 - predictor.prob2)
    return score, predictor.prob1, predictor.prob2
def tlm_prefer_hard(bert_config, use_one_hot_embeddings, features):
    """Return ``-prob1`` from IndependentLossModel as a sampling score.

    Reads ``input_ids``, ``input_mask`` and ``segment_ids`` from
    ``features``, encodes them with a BertModel built with
    ``is_training=False``, and builds the loss model's predictions on the
    sequence output.
    """
    encoder = BertModel(
        config=bert_config,
        is_training=False,
        input_ids=features["input_ids"],
        input_mask=features["input_mask"],
        token_type_ids=features["segment_ids"],
        use_one_hot_embeddings=use_one_hot_embeddings,
    )
    predictor = IndependentLossModel(bert_config)
    predictor.build_predictions(encoder.get_sequence_output())

    # A higher score means the item is sampled more often.
    return -predictor.prob1
def __init__(self, sero_config, config, is_training, input_ids,
             input_mask=None, token_type_ids=None,
             use_one_hot_embeddings=True, scope=None):
    """Combine a BERT tower with a SeroEpsilon tower over the same input.

    The BERT tower is built under ``dual_model_prefix1``. Under
    ``dual_model_prefix2``/"sero" a ``SeroEpsilon`` network is run on the
    same inputs after an extra axis is inserted at position 1, with an
    all-ones ``use_context`` tensor.

    Sets:
        self.pooled_output: a tanh Dense layer with ``config.hidden_size``
            units over both towers' position-0 vectors concatenated on
            axis 1.
    """
    super(DualSeroBertModel, self).__init__()

    with tf.compat.v1.variable_scope(dual_model_prefix1):
        bert_tower = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )

    with tf.compat.v1.variable_scope(dual_model_prefix2), \
            tf.compat.v1.variable_scope("sero"):
        sero_tower = SeroEpsilon(sero_config, is_training,
                                 use_one_hot_embeddings)

        batch_size, _ = get_shape_list(input_mask)
        use_context = tf.ones([batch_size, 1], tf.int32)
        # Insert a new axis at position 1 on every input before the call.
        stacked_ids = tf.expand_dims(input_ids, 1)
        stacked_mask = tf.expand_dims(input_mask, 1)
        stacked_segments = tf.expand_dims(token_type_ids, 1)
        sero_sequence = sero_tower.network_stacked(
            stacked_ids, stacked_mask, stacked_segments, use_context)

    bert_first = bert_tower.get_sequence_output()[:, 0, :]
    sero_first = sero_sequence[:, 0, :]
    fused = tf.concat([bert_first, sero_first], axis=1)

    projection = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    self.pooled_output = projection(fused)
def tlm2(bert_config, use_one_hot_embeddings, features):
    """Return ``-(prob1 - prob2)`` from IndependentLossModel for ``features``.

    Args:
        bert_config: configuration passed to both BertModel and
            IndependentLossModel.
        use_one_hot_embeddings: forwarded to BertModel.
        features: mapping providing ``input_ids``, ``input_mask`` and
            ``segment_ids``.

    Returns:
        The negated difference between the loss model's ``prob1`` and
        ``prob2`` attributes, computed on the sequence output of a BertModel
        built with ``is_training=False``.
    """
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]

    # The hyperparameter object, hard-coded vocabulary size and input-shape
    # lookup that used to be computed here were never read, so they have
    # been dropped.
    encode_model = BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
    )
    loss_model = IndependentLossModel(bert_config)
    loss_model.build_predictions(encode_model.get_sequence_output())

    output = -(loss_model.prob1 - loss_model.prob2)
    return output
class transformer_nli:
    """Placeholder-fed BERT classifier for NLI.

    Construction creates int64 placeholders for the three model inputs and
    the labels, builds a BertModel from the hyperparameters in ``hp``, and
    attaches a ``Classification`` task head. The resulting tensors are
    exposed as attributes (``x_list``, ``y``, ``logits``, ``sout``, ``pred``,
    ``loss``, ``acc``).
    """

    def __init__(self, hp, voca_size, method, is_training=True):
        """Build the graph; ``method`` is accepted but not used here."""
        bert_config = BertConfig(
            vocab_size=voca_size,
            hidden_size=hp.hidden_units,
            num_hidden_layers=hp.num_blocks,
            num_attention_heads=hp.num_heads,
            intermediate_size=hp.intermediate_size,
            type_vocab_size=hp.type_vocab_size,
        )
        max_len = hp.seq_max
        nli_task = Classification(data_generator.NLI.nli_info.num_classes)

        token_shape = [None, max_len]
        input_ids = placeholder(tf.int64, token_shape)
        input_mask = placeholder(tf.int64, token_shape)
        segment_ids = placeholder(tf.int64, token_shape)
        label_ids = placeholder(tf.int64, [None])

        self.x_list = [input_ids, input_mask, segment_ids]
        self.y = label_ids

        # One-hot embeddings follow the TPU setting, which is off here.
        self.model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=False)

        prediction, task_loss = nli_task.predict(
            self.model.get_sequence_output(), label_ids, True)
        self.logits = nli_task.logits
        self.sout = tf.nn.softmax(self.logits)
        self.pred = prediction
        self.loss = task_loss
        self.acc = nli_task.acc
def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True,
             features=None, scope=None):
    """Build three BERT towers; the third gates the second.

    Towers 2 and 3 take their inputs from ``features`` (keys
    ``input_ids2``/``input_mask2``/``segment_ids2`` and
    ``input_ids3``/``input_mask3``/``segment_ids3``). Tower 3's pooled
    output goes through a 2-way softmax Dense layer; column 1 of that
    distribution scales tower 2's position-0 vector before it is joined
    with tower 1's.

    Sets:
        self.rel_score: column 1 of the softmax output (kept 2-D by the
            ``1:2`` slice).
        self.sequence_output: sequence outputs of towers 1 and 2
            concatenated on axis 2.
        self.pooled_output: a tanh Dense layer with ``config.hidden_size``
            units over the joined position-0 vectors.
    """
    super(TripleBertMasking, self).__init__()

    ids_2, mask_2, seg_2, ids_3, mask_3, seg_3 = (
        features[key] for key in (
            "input_ids2", "input_mask2", "segment_ids2",
            "input_ids3", "input_mask3", "segment_ids3",
        )
    )

    shared_kwargs = dict(
        config=config,
        is_training=is_training,
        use_one_hot_embeddings=use_one_hot_embeddings,
    )
    with tf.compat.v1.variable_scope(triple_model_prefix1):
        tower_1 = BertModel(
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            **shared_kwargs
        )
    with tf.compat.v1.variable_scope(triple_model_prefix2):
        tower_2 = BertModel(
            input_ids=ids_2,
            input_mask=mask_2,
            token_type_ids=seg_2,
            **shared_kwargs
        )
    with tf.compat.v1.variable_scope(triple_model_prefix3):
        tower_3 = BertModel(
            input_ids=ids_3,
            input_mask=mask_3,
            token_type_ids=seg_3,
            **shared_kwargs
        )

    first_1 = tower_1.get_sequence_output()[:, 0, :]
    first_2 = tower_2.get_sequence_output()[:, 0, :]

    gate_layer = tf.keras.layers.Dense(
        2,
        activation=tf.keras.activations.softmax,
        kernel_initializer=create_initializer(config.initializer_range))
    gate_probs = gate_layer(tower_3.get_pooled_output())
    gate = gate_probs[:, 1:2]
    self.rel_score = gate

    # Scale tower 2's summary vector by the gate before joining.
    fused = tf.concat([first_1, gate * first_2], axis=1)
    self.sequence_output = tf.concat(
        [tower_1.get_sequence_output(), tower_2.get_sequence_output()],
        axis=2)

    projection = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    self.pooled_output = projection(fused)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Train a model to predict the masked-LM losses of two reference models.

    Two BertModel instances (variable scopes "MaybeBERT" and "MaybeBFN") are
    run on the same randomly masked input to obtain per-position masked-LM
    losses. A third model (``model_class``) sees the unmasked input, and
    ``IndependentLossModel.train_modeling`` is fitted against the two loss
    tensors, which are wrapped in ``tf.stop_gradient``.

    ``train_config``, ``bert_config`` and ``model_class`` are defined outside
    this function. ``train_config.second_init_checkpoint`` must hold two
    comma-separated checkpoint paths (one per reference model);
    ``train_config.init_checkpoint`` initializes the remaining variables.

    Returns:
        A TPUEstimatorSpec for TRAIN, EVAL or (any other mode) prediction.
    """
    log_features(features)

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    # Read but not used below; the lookup still requires the key to exist.
    next_sentence_labels = features["next_sentence_labels"]

    masking_result = random_masking(input_ids, input_mask,
                                    train_config.max_predictions_per_seq,
                                    MASK_ID)
    (masked_input_ids, masked_lm_positions,
     masked_lm_ids, masked_lm_weights) = masking_result

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    prefix1 = "MaybeBERT"
    prefix2 = "MaybeBFN"

    # Reference model 1: per-position LM loss on the masked input.
    with tf.compat.v1.variable_scope(prefix1):
        reference_1 = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=masked_input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        lm_out_1 = get_masked_lm_output(
            bert_config, reference_1.get_sequence_output(),
            reference_1.get_embedding_table(),
            masked_lm_positions, masked_lm_ids, masked_lm_weights)
        masked_lm_loss, example_loss_1, masked_lm_log_probs1 = lm_out_1
        example_loss_1 = tf.reshape(example_loss_1, masked_lm_ids.shape)

    # Reference model 2: same inputs, different variable scope.
    with tf.compat.v1.variable_scope(prefix2):
        reference_2 = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=masked_input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        lm_out_2 = get_masked_lm_output(
            bert_config, reference_2.get_sequence_output(),
            reference_2.get_embedding_table(),
            masked_lm_positions, masked_lm_ids, masked_lm_weights)
        masked_lm_loss, example_loss_2, masked_lm_log_probs2 = lm_out_2
        print(reference_2.get_sequence_output().shape)
        example_loss_2 = tf.reshape(example_loss_2, masked_lm_ids.shape)

    # The predictor sees the original (unmasked) token ids.
    predictor = model_class(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=train_config.use_one_hot_embeddings,
    )

    loss_model = IndependentLossModel(bert_config)
    loss_model.train_modeling(predictor.get_sequence_output(),
                              masked_lm_positions,
                              masked_lm_weights,
                              tf.stop_gradient(example_loss_1),
                              tf.stop_gradient(example_loss_2))

    total_loss = loss_model.total_loss
    loss1 = loss_model.loss1
    loss2 = loss_model.loss2
    per_example_loss1 = loss_model.per_example_loss1
    per_example_loss2 = loss_model.per_example_loss2
    losses1 = tf.reduce_sum(per_example_loss1, axis=1)
    losses2 = tf.reduce_sum(per_example_loss2, axis=1)
    prob1 = loss_model.prob1
    prob2 = loss_model.prob2

    # second_init_checkpoint carries one path per reference model.
    reference_ckpt_1, reference_ckpt_2 = \
        train_config.second_init_checkpoint.split(",")
    tvars = tf.compat.v1.trainable_variables()
    names_from_references, init_references = get_init_fn_for_two_checkpoints(
        train_config, tvars,
        reference_ckpt_1, prefix1,
        reference_ckpt_2, prefix2)
    predictor_map, names_from_predictor = get_bert_assignment_map(
        tvars, train_config.init_checkpoint)

    initialized_variable_names = {}
    for name_group in (names_from_references, names_from_predictor):
        initialized_variable_names.update(name_group)

    def init_fn():
        """Restore the two reference models, then the predictor."""
        init_references()
        tf.compat.v1.train.init_from_checkpoint(train_config.init_checkpoint,
                                                predictor_map)

    scaffold_fn = get_tpu_scaffold_or_init(init_fn, train_config.use_tpu)
    log_var_assignments(tvars, initialized_variable_names)

    spec_cls = tf.compat.v1.estimator.tpu.TPUEstimatorSpec
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer_from_config(total_loss,
                                                             train_config)
        return spec_cls(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn)

    if mode == tf.estimator.ModeKeys.EVAL:
        def metric_fn(per_example_loss1, per_example_loss2):
            """Report the streaming mean of each loss tensor."""
            mean_1 = tf.compat.v1.metrics.mean(values=per_example_loss1)
            mean_2 = tf.compat.v1.metrics.mean(values=per_example_loss2)
            return {
                "loss1": mean_1,
                "loss2": mean_2,
            }

        return spec_cls(
            mode=mode,
            loss=total_loss,
            eval_metrics=(metric_fn, [losses1, losses2]),
            scaffold_fn=scaffold_fn)

    predictions = {
        "prob1": prob1,
        "prob2": prob2,
        "per_example_loss1": per_example_loss1,
        "per_example_loss2": per_example_loss2,
        "input_ids": input_ids,
        "masked_lm_positions": masked_lm_positions,
    }
    return spec_cls(
        mode=mode,
        loss=total_loss,
        predictions=predictions,
        scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Builds a masked-LM BertModel whose input is the ``word`` tokens
    concatenated (axis 1) in front of the regular ``input_ids``. The word
    tokens get an input mask derived from ``word != 0`` and segment ids of
    all ones. In PREDICT mode the random seed is fixed to 0 so the masking
    is repeatable.

    ``train_config`` and ``config`` are defined outside this function.

    Returns:
        A TPUEstimatorSpec for TRAIN, EVAL or (any other mode) prediction.
    """
    # The docstring must be the first statement; it used to follow this
    # logging call, which left the function without a ``__doc__``.
    tf_logging.info("model_fn_apr_lm")
    log_features(features)

    raw_input_ids = features["input_ids"]  # [batch_size, seq_length]
    raw_input_mask = features["input_mask"]
    raw_segment_ids = features["segment_ids"]

    word_tokens = features["word"]
    # Token id 0 is treated as padding for the word tokens.
    word_input_mask = tf.cast(tf.not_equal(word_tokens, 0), tf.int32)
    word_segment_ids = tf.ones_like(word_tokens, tf.int32)

    if mode == tf.estimator.ModeKeys.PREDICT:
        tf.random.set_seed(0)
        seed = 0
    else:
        seed = None

    input_ids = tf.concat([word_tokens, raw_input_ids], axis=1)
    input_mask = tf.concat([word_input_mask, raw_input_mask], axis=1)
    segment_ids = tf.concat([word_segment_ids, raw_segment_ids], axis=1)

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    tf_logging.info("Using masked_input_ids")
    masked_input_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights \
        = random_masking(input_ids, input_mask,
                         train_config.max_predictions_per_seq, MASK_ID, seed)

    model = BertModel(
        config=config,
        is_training=is_training,
        input_ids=masked_input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=train_config.use_one_hot_embeddings,
    )

    (masked_lm_loss, masked_lm_example_loss,
     masked_lm_log_probs) = get_masked_lm_output(
         config, model.get_sequence_output(), model.get_embedding_table(),
         masked_lm_positions, masked_lm_ids, masked_lm_weights)

    loss = masked_lm_loss

    tvars = tf.compat.v1.trainable_variables()
    assignment_fn = tlm.training.assignment_map.get_bert_assignment_map
    initialized_variable_names, init_fn = get_init_fn(
        tvars, train_config.init_checkpoint, assignment_fn)
    scaffold_fn = get_tpu_scaffold_or_init(init_fn, train_config.use_tpu)
    log_var_assignments(tvars, initialized_variable_names)

    TPUEstimatorSpec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec
    if mode == tf.estimator.ModeKeys.TRAIN:
        tf_logging.info("Using single lr ")
        train_op = optimization.create_optimizer_from_config(
            loss, train_config)
        output_spec = TPUEstimatorSpec(mode=mode,
                                       loss=loss,
                                       train_op=train_op,
                                       scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics = (metric_fn_lm, [
            masked_lm_example_loss,
            masked_lm_log_probs,
            masked_lm_ids,
            masked_lm_weights,
        ])
        output_spec = TPUEstimatorSpec(mode=mode,
                                       loss=loss,
                                       eval_metrics=eval_metrics,
                                       scaffold_fn=scaffold_fn)
    else:
        predictions = {
            "input_ids": input_ids,
            "masked_input_ids": masked_input_ids,
            "masked_lm_ids": masked_lm_ids,
            "masked_lm_example_loss": masked_lm_example_loss,
            "masked_lm_positions": masked_lm_positions
        }
        output_spec = TPUEstimatorSpec(mode=mode,
                                       loss=loss,
                                       predictions=predictions,
                                       scaffold_fn=scaffold_fn)

    return output_spec
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    A classifier (scope "classification") scores the original inputs plus
    ``n_trial`` candidate variants produced by ``candidate_gen``; its logits
    are wrapped in ``tf.stop_gradient``. ``get_informative``, ``select_best``
    and ``get_mask`` turn those scores into a per-token signal label. A
    second BertModel (scope "explain") produces per-token, per-class logits
    that are trained against that signal with
    ``correlation_coefficient_loss``.

    ``train_config`` and ``bert_config`` are defined outside this function.

    Returns:
        A TPUEstimatorSpec for TRAIN and PREDICT; None for any other mode.
    """
    logging.info("*** Features ***")
    for feature_name in sorted(features.keys()):
        logging.info(" name = %s, shape = %s" %
                     (feature_name, features[feature_name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    # Read but not used below; the lookup still requires the key to exist.
    label_ids = features["label_ids"]
    batch_size, seq_len = get_shape_list2(input_ids)
    n_trial = 5

    logging.info("Doing All Masking")
    candidates = candidate_gen(input_ids, input_mask, segment_ids, n_trial)
    new_input_ids, new_segment_ids, new_input_mask, indice, length_arr = \
        candidates

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    prefix_cls = "classification"
    prefix_explain = "explain"

    # Originals first, candidates after, so they can be split by batch_size.
    all_input_ids = tf.concat([input_ids, new_input_ids], axis=0)
    all_segment_ids = tf.concat([segment_ids, new_segment_ids], axis=0)
    all_input_mask = tf.concat([input_mask, new_input_mask], axis=0)

    head_shape = [train_config.num_classes, bert_config.hidden_size]

    with tf.compat.v1.variable_scope(prefix_cls):
        classifier = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=all_input_ids,
            input_mask=all_input_mask,
            token_type_ids=all_segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        cls_weights = tf.compat.v1.get_variable(
            "output_weights", head_shape,
            initializer=tf.compat.v1.truncated_normal_initializer(
                stddev=0.02))
        cls_bias = tf.compat.v1.get_variable(
            "output_bias", [train_config.num_classes],
            initializer=tf.compat.v1.zeros_initializer())

        pooled = classifier.get_pooled_output()
        raw_logits = tf.matmul(pooled, cls_weights, transpose_b=True)
        # No gradient reaches the classifier through its logits.
        frozen_logits = tf.stop_gradient(raw_logits)
        cls_logits = tf.nn.bias_add(frozen_logits, cls_bias)
        cls_probs = tf.nn.softmax(cls_logits)

    orig_probs = cls_probs[:batch_size]
    new_probs = tf.reshape(cls_probs[batch_size:], [batch_size, n_trial, -1])

    best_run, informative = get_informative(new_probs, orig_probs)
    # informative.shape= [batch_size, num_clases]
    best_del_idx, best_del_len = select_best(best_run, indice, length_arr)
    signal_label = get_mask(best_del_idx, best_del_len, seq_len)

    with tf.compat.v1.variable_scope(prefix_explain):
        explainer = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        token_vectors = explainer.get_sequence_output()
        ex_weights = tf.compat.v1.get_variable(
            "output_weights", head_shape,
            initializer=tf.compat.v1.truncated_normal_initializer(
                stddev=0.02))
        ex_bias = tf.compat.v1.get_variable(
            "output_bias", [train_config.num_classes],
            initializer=tf.compat.v1.zeros_initializer())
        # Pre-bias token logits; this is the tensor exported as "logits".
        ex_logits_no_bias = tf.matmul(token_vectors, ex_weights,
                                      transpose_b=True)
        # [batch, seq_len, num_class]
        ex_logits = tf.nn.bias_add(ex_logits_no_bias, ex_bias)

    ex_logits_flat = tf.reshape(tf.transpose(ex_logits, [0, 2, 1]),
                                [-1, seq_len])
    signal_label_flat = tf.cast(tf.reshape(signal_label, [-1, seq_len]),
                                tf.float32)
    losses_per_clas_flat = correlation_coefficient_loss(
        signal_label_flat, ex_logits_flat)  # [batch_size, num_class]
    losses_per_clas = tf.reshape(losses_per_clas_flat, [batch_size, -1])
    losses_per_clas = losses_per_clas * tf.cast(informative, tf.float32)
    losses = tf.reduce_mean(losses_per_clas, axis=1)
    loss = tf.reduce_mean(losses)

    tvars = tf.compat.v1.trainable_variables()
    initialized_variable_names, init_fn = get_init_fn_for_two_checkpoints(
        train_config, tvars,
        train_config.init_checkpoint, prefix_explain,
        train_config.second_init_checkpoint, prefix_cls)

    scaffold_fn = None
    if train_config.use_tpu:
        def tpu_scaffold():
            """Run the checkpoint init and return a fresh Scaffold."""
            init_fn()
            return tf.compat.v1.train.Scaffold()

        scaffold_fn = tpu_scaffold
    else:
        init_fn()

    log_var_assignments(tvars, initialized_variable_names)

    spec_cls = tf.compat.v1.estimator.tpu.TPUEstimatorSpec
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer_from_config(loss,
                                                             train_config)
        return spec_cls(mode=mode,
                        loss=loss,
                        train_op=train_op,
                        scaffold_fn=scaffold_fn)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "input_ids": input_ids,
            "ex_logits": ex_logits,
            "logits": ex_logits_no_bias,
        }
        return spec_cls(mode=mode,
                        loss=None,
                        predictions=predictions,
                        scaffold_fn=scaffold_fn)

    # No spec is built for other modes (e.g. EVAL).
    return None
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    ``planned_masking`` produces ``n_trial`` masked variants of the input.
    Two BertModel instances (scopes "MaybeBERT" and "MaybeBFN") compute
    masked-LM example losses on them, and the positions and losses are
    regrouped by ``reform`` so the trial axis comes second.

    ``train_config`` and ``bert_config`` are defined outside this function.

    Returns:
        A TPUEstimatorSpec in PREDICT mode; None for any other mode.
    """
    logging.info("*** Features ***")
    for feature_name in sorted(features.keys()):
        logging.info(" name = %s, shape = %s" %
                     (feature_name, features[feature_name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    # Read but not used below; the lookup still requires the key to exist.
    next_sentence_labels = features["next_sentence_labels"]
    n_trial = 25

    logging.info("Doing All Masking")
    masking_result = planned_masking(input_ids, input_mask,
                                     train_config.max_predictions_per_seq,
                                     MASK_ID, n_trial)
    (masked_input_ids, masked_lm_positions,
     masked_lm_ids, masked_lm_weights) = masking_result

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # Mask and segment ids are tiled n_trial times along axis 0.
    repeat_input_mask = tf.tile(input_mask, [n_trial, 1])
    repeat_segment_ids = tf.tile(segment_ids, [n_trial, 1])

    prefix1 = "MaybeBERT"
    prefix2 = "MaybeBFN"

    with tf.compat.v1.variable_scope(prefix1):
        first_model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=masked_input_ids,
            input_mask=repeat_input_mask,
            token_type_ids=repeat_segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        lm_out_1 = get_masked_lm_output(
            bert_config, first_model.get_sequence_output(),
            first_model.get_embedding_table(),
            masked_lm_positions, masked_lm_ids, masked_lm_weights)
        masked_lm_loss, example_loss_1, masked_lm_log_probs2 = lm_out_1

    with tf.compat.v1.variable_scope(prefix2):
        second_model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=masked_input_ids,
            input_mask=repeat_input_mask,
            token_type_ids=repeat_segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        lm_out_2 = get_masked_lm_output(
            bert_config, second_model.get_sequence_output(),
            second_model.get_embedding_table(),
            masked_lm_positions, masked_lm_ids, masked_lm_weights)
        masked_lm_loss, example_loss_2, masked_lm_log_probs2 = lm_out_2

    n_mask = train_config.max_predictions_per_seq

    def reform(t):
        """Reshape to [n_trial, -1, n_mask], then swap the first two axes."""
        by_trial = tf.reshape(t, [n_trial, -1, n_mask])
        return tf.transpose(by_trial, [1, 0, 2])

    grouped_positions = reform(masked_lm_positions)
    grouped_loss1 = reform(example_loss_1)
    grouped_loss2 = reform(example_loss_2)

    tvars = tf.compat.v1.trainable_variables()
    initialized_variable_names, init_fn = get_init_fn_for_two_checkpoints(
        train_config, tvars,
        train_config.init_checkpoint, prefix1,
        train_config.second_init_checkpoint, prefix2)

    scaffold_fn = None
    if train_config.use_tpu:
        def tpu_scaffold():
            """Run the checkpoint init and return a fresh Scaffold."""
            init_fn()
            return tf.compat.v1.train.Scaffold()

        scaffold_fn = tpu_scaffold
    else:
        init_fn()

    log_var_assignments(tvars, initialized_variable_names)

    if mode != tf.estimator.ModeKeys.PREDICT:
        # Only prediction is supported; other modes get no spec.
        return None

    predictions = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        "grouped_positions": grouped_positions,
        "grouped_loss1": grouped_loss1,
        "grouped_loss2": grouped_loss2,
    }
    return tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=None,
        predictions=predictions,
        scaffold_fn=scaffold_fn)
def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True,
             features=None, scope=None):
    """Build two BERT towers, scaling the second by a configurable factor.

    The second tower's inputs come from ``features`` (``input_ids2``,
    ``input_mask2``, ``segment_ids2``). ``config.model_option`` selects the
    factor applied to the second tower's position-0 vector: "0" -> 0.0,
    "1" -> 1.0, "random" -> a scalar drawn with ``tf.random.uniform`` in
    [0, 1). Any other option raises KeyError.

    Sets:
        self.sequence_output: both towers' sequence outputs concatenated on
            axis 2 (the second tower is NOT scaled here).
        self.pooled_output: a tanh Dense layer with ``config.hidden_size``
            units over the joined position-0 vectors.
    """
    super(DualBertTwoInputModelEx, self).__init__()

    ids_b = features["input_ids2"]
    mask_b = features["input_mask2"]
    seg_b = features["segment_ids2"]
    modeling_option = config.model_option

    with tf.compat.v1.variable_scope(dual_model_prefix1):
        tower_a = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        tower_b = BertModel(
            config=config,
            is_training=is_training,
            input_ids=ids_b,
            input_mask=mask_b,
            token_type_ids=seg_b,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )

    first_a = tower_a.get_sequence_output()[:, 0, :]
    first_b = tower_b.get_sequence_output()[:, 0, :]
    print('model_2_first_token', first_b)

    # The table is built in full on purpose: the random op is created no
    # matter which option is selected, exactly as before.
    scale_by_option = {
        "0": 0.,
        "1": 1.,
        "random": tf.random.uniform(shape=[], minval=0., maxval=1.)
    }
    mask_scalar = scale_by_option[modeling_option]
    print("Mask_scalar:", mask_scalar)

    first_b = mask_scalar * first_b
    print('model_2_first_token', first_b)

    fused = tf.concat([first_a, first_b], axis=1)
    self.sequence_output = tf.concat(
        [tower_a.get_sequence_output(), tower_b.get_sequence_output()],
        axis=2)

    projection = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    self.pooled_output = projection(fused)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    A BertModel encodes the input and a Dense layer named "cls_dense" maps
    every token vector to ``train_config.num_classes`` logits. The loss is
    the mean absolute error between those logits and ``log(label + 1e-10)``,
    multiplied by ``segment_ids`` cast to float and then averaged.

    ``train_config`` and ``model_config`` are defined outside this function.

    Returns:
        A TPUEstimatorSpec for TRAIN, EVAL or (any other mode) prediction.
    """
    tf_logging.info("*** Features ***")
    for feature_name in sorted(features.keys()):
        tf_logging.info(" name = %s, shape = %s" %
                        (feature_name, features[feature_name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    batch_size, seq_length = get_shape_list2(input_ids)

    if "is_real_example" in features:
        is_real_example = tf.cast(features["is_real_example"],
                                  dtype=tf.float32)
    else:
        is_real_example = tf.ones([batch_size, 1], dtype=tf.float32)

    # One label vector per token.
    label_ids = tf.reshape(
        label_ids, [batch_size, seq_length, train_config.num_classes])

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    encoder = BertModel(
        config=model_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=train_config.use_one_hot_embeddings,
    )
    token_vectors = encoder.get_sequence_output()
    if is_training:
        token_vectors = dropout(token_vectors, 0.1)

    logits = tf.keras.layers.Dense(train_config.num_classes,
                                   name="cls_dense")(token_vectors)
    probs = tf.math.sigmoid(logits)

    # The regression target is the log of the label; eps avoids log(0).
    eps = 1e-10
    label_logs = tf.math.log(label_ids + eps)
    # segment_ids cast to float is used as the per-token loss weight.
    is_valid_mask = tf.cast(segment_ids, tf.float32)
    token_loss = tf.keras.losses.MAE(y_true=label_logs, y_pred=logits)
    token_loss = token_loss * is_valid_mask
    loss = tf.reduce_mean(input_tensor=token_loss)

    tvars = tf.compat.v1.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if train_config.init_checkpoint:
        initialized_variable_names, init_fn = get_init_fn(train_config, tvars)
        scaffold_fn = get_tpu_scaffold_or_init(init_fn, train_config.use_tpu)
    log_var_assignments(tvars, initialized_variable_names)

    spec_cls = tf.compat.v1.estimator.tpu.TPUEstimatorSpec

    def metric_fn(probs, label, is_real_example):
        """Binarize at exp(-10) and report precision/recall/rates/MAE."""
        cut = math.exp(-10)
        pred_binary = probs > cut
        label_binary_all = label > cut

        # Precision and recall are computed on channel 0 only.
        pred_binary = pred_binary[:, :, 0]
        label_binary_1 = label_binary_all[:, :, 1]
        label_binary_0 = label_binary_all[:, :, 0]

        metrics = {}
        metrics["precision"] = tf.compat.v1.metrics.precision(
            predictions=pred_binary, labels=label_binary_0)
        metrics["recall"] = tf.compat.v1.metrics.recall(
            predictions=pred_binary, labels=label_binary_0)
        metrics["true_rate_1"] = tf.compat.v1.metrics.mean(label_binary_1)
        metrics["true_rate_0"] = tf.compat.v1.metrics.mean(label_binary_0)
        metrics["mae"] = tf.compat.v1.metrics.mean_absolute_error(
            labels=label, predictions=probs, weights=is_real_example)
        return metrics

    if mode == tf.estimator.ModeKeys.TRAIN:
        # None is passed as the third argument (the variable list).
        train_op = optimization.create_optimizer_from_config(
            loss, train_config, None)
        return spec_cls(mode=mode,
                        loss=loss,
                        train_op=train_op,
                        scaffold_fn=scaffold_fn)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics = (metric_fn, [probs, label_ids, is_real_example])
        return spec_cls(mode=mode,
                        loss=loss,
                        eval_metrics=eval_metrics,
                        scaffold_fn=scaffold_fn)

    predictions = {
        "input_ids": input_ids,
        "logits": logits,
        "label_ids": label_ids,
    }
    if "data_id" in features:
        predictions['data_id'] = features['data_id']
    return spec_cls(mode=mode,
                    predictions=predictions,
                    scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Runs two BertModel instances (scopes "MaybeBERT" and "MaybeNLI") on the
    same unmasked input and compares their encoder layers element-wise. An
    element counts as "preserved" when the absolute difference is below
    ``threshold``. The prediction ``layer_count`` is the number of preserved
    elements along axis 2 for the layer at index 1.

    ``train_config`` and ``bert_config`` are defined outside this function.

    Returns:
        A TPUEstimatorSpec in PREDICT mode; None for any other mode.
    """
    logging.info("*** Features ***")
    for feature_name in sorted(features.keys()):
        logging.info(" name = %s, shape = %s" %
                     (feature_name, features[feature_name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    # Read but not used below; the lookup still requires the key to exist.
    next_sentence_labels = features["next_sentence_labels"]

    seed = 0
    threshold = 1e-2
    logging.info("Doing All Masking")
    masking_result = random_masking(input_ids, input_mask,
                                    train_config.max_predictions_per_seq,
                                    MASK_ID, seed)
    (masked_input_ids, masked_lm_positions,
     masked_lm_ids, masked_lm_weights) = masking_result

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    prefix1 = "MaybeBERT"
    prefix2 = "MaybeNLI"

    # Both models are fed the original input_ids, not the masked ones.
    with tf.compat.v1.variable_scope(prefix1):
        first_model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        lm_out = get_masked_lm_output(
            bert_config, first_model.get_sequence_output(),
            first_model.get_embedding_table(),
            masked_lm_positions, masked_lm_ids, masked_lm_weights)
        masked_lm_loss, masked_lm_example_loss1, masked_lm_log_probs2 = lm_out
        all_layers1 = first_model.get_all_encoder_layers()

    with tf.compat.v1.variable_scope(prefix2):
        second_model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        all_layers2 = second_model.get_all_encoder_layers()

    # One boolean tensor per layer pair: True where the two models agree
    # to within `threshold`.
    preserved_infos = [
        tf.less(tf.abs(layer_a - layer_b), threshold)
        for layer_a, layer_b in zip(all_layers1, all_layers2)
    ]

    # [batch_size, seq_len, dims]
    layer_1_flags = tf.cast(preserved_infos[1], dtype=tf.int32)
    layer_1_count = tf.reduce_sum(layer_1_flags, axis=2)

    tvars = tf.compat.v1.trainable_variables()
    initialized_variable_names, init_fn = get_init_fn_for_two_checkpoints(
        train_config, tvars,
        train_config.init_checkpoint, prefix1,
        train_config.second_init_checkpoint, prefix2)
    scaffold_fn = get_tpu_scaffold_or_init(init_fn, train_config.use_tpu)

    log_var_assignments(tvars, initialized_variable_names)

    if mode != tf.estimator.ModeKeys.PREDICT:
        # Only prediction is supported; other modes get no spec.
        return None

    predictions = {
        "input_ids": input_ids,
        "layer_count": layer_1_count
    }
    return tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=None,
        predictions=predictions,
        scaffold_fn=scaffold_fn)