def __init__(self, hp, voca_size, is_training=True):
    """Build a BERT classification graph (3 classes) with placeholder inputs.

    Exposes self.x_list / self.y placeholders and self.loss / self.logits /
    self.acc from the ClassificationB head.
    """
    bert_config = BertConfig(
        vocab_size=voca_size,
        hidden_size=hp.hidden_units,
        num_hidden_layers=hp.num_blocks,
        num_attention_heads=hp.num_heads,
        intermediate_size=hp.intermediate_size,
        type_vocab_size=hp.type_vocab_size,
    )
    max_seq = hp.seq_max
    # One-hot embedding lookup only pays off on TPU; disabled here.
    use_tpu = False

    input_ids = placeholder(tf.int64, [None, max_seq])
    input_mask = placeholder(tf.int64, [None, max_seq])
    segment_ids = placeholder(tf.int64, [None, max_seq])
    label_ids = placeholder(tf.int64, [None])
    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids

    self.model = BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_tpu,
    )
    task = ClassificationB(is_training, hp.hidden_units, 3)
    task.call(self.model.get_pooled_output(), label_ids)
    self.loss = task.loss
    self.logits = task.logits
    self.acc = task.acc
class SimpleSharingModel:
    """One shared BERT over a batch-concatenation of LM and NLI inputs.

    The first `batch_size` rows of every model output belong to the
    masked-LM inputs; the remaining rows belong to the NLI inputs.
    """

    def __init__(self, config, use_one_hot_embeddings, is_training,
                 masked_input_ids, input_mask, segment_ids,
                 nli_input_ids, nli_input_mask, nli_segment_ids):
        joined_ids = tf.concat([masked_input_ids, nli_input_ids], axis=0)
        joined_mask = tf.concat([input_mask, nli_input_mask], axis=0)
        joined_segments = tf.concat([segment_ids, nli_segment_ids], axis=0)
        # Boundary between the LM half and the NLI half of the joined batch.
        self.batch_size, _ = get_shape_list2(masked_input_ids)
        self.model = BertModel(config, is_training, joined_ids, joined_mask,
                               joined_segments, use_one_hot_embeddings)

    def lm_sequence_output(self):
        # Sequence output restricted to the masked-LM rows.
        return self.model.get_sequence_output()[:self.batch_size]

    def get_embedding_table(self):
        return self.model.get_embedding_table()

    def get_tt_feature(self):
        # Pooled output restricted to the NLI (target-task) rows.
        return self.model.get_pooled_output()[self.batch_size:]
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator (pairwise ranking)."""
    tf_logging.info("model_fn_ranking")
    log_features(features)

    input_ids, input_mask, segment_ids = combine_paired_input_features(features)
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    model = BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=train_config.use_one_hot_embeddings,
    )
    pooled = model.get_pooled_output()
    if is_training:
        pooled = dropout(pooled, 0.1)
    loss, losses, y_pred = apply_loss_modeling(modeling_opt, pooled, features)

    scaffold_fn = checkpoint_init(assignment_map.get_bert_assignment_map,
                                  train_config)

    def optimizer_factory(x):
        return create_optimizer_from_config(x, train_config)

    # Echo the raw pair inputs so predictions can be joined back to examples.
    prediction = {
        "input_ids1": tf.identity(features["input_ids1"]),
        "input_ids2": tf.identity(features["input_ids2"]),
    }
    return ranking_estimator_spec(mode, loss, losses, y_pred,
                                  scaffold_fn, optimizer_factory, prediction)
def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True,
             features=None, scope=None):
    """Dual tower: plain BERT on the positional inputs plus a
    DoubleLengthInputModel on the paired inputs from `features`;
    pooled output is a tanh projection of the concatenated [CLS] vectors.
    """
    super(DualBertTwoInputWithDoubleInputLength, self).__init__()
    # Second-tower inputs come from the features dict, not the positional args.
    ids_a = features["input_ids1"]
    mask_a = features["input_mask1"]
    seg_a = features["segment_ids1"]
    ids_b = features["input_ids2"]
    mask_b = features["input_mask2"]
    seg_b = features["segment_ids2"]
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        model_1 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        model_2 = DoubleLengthInputModel(
            config, is_training,
            ids_a, mask_a, seg_a,
            ids_b, mask_b, seg_b,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    # First-token ([CLS]) vector from each tower, joined on the hidden axis.
    cls_1 = model_1.get_sequence_output()[:, 0, :]
    cls_2 = model_2.get_sequence_output()[:, 0, :]
    rep = tf.concat([cls_1, cls_2], axis=1)
    # Only tower 1's sequence output is exposed.
    self.sequence_output = model_1.get_sequence_output()
    pooler = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    self.pooled_output = pooler(rep)
def tlm2_raw_prob(bert_config, use_one_hot_embeddings, input_ids, input_mask, segment_ids):
    """Score inputs with a frozen BERT and the two-headed loss model.

    Returns:
        (prob2 - prob1, prob1, prob2) — the first element is higher where
        the second head assigns more probability than the first.
    """
    encoder = BertModel(
        config=bert_config,
        is_training=False,  # scoring only; no dropout
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
    )
    loss_model = IndependentLossModel(bert_config)
    loss_model.build_predictions(encoder.get_sequence_output())
    score = -(loss_model.prob1 - loss_model.prob2)
    return score, loss_model.prob1, loss_model.prob2
def init(self, config, is_training, input_ids, input_ids2, input_mask, input_mask2,
         token_type_ids, segment_ids2, use_one_hot_embeddings):
    """Two independently-scoped BERT towers over the two input sets.

    Sets:
        self.sequence_output: both towers' sequence outputs concatenated
            along the hidden dimension (axis=2).
        self.pooled_output: tanh-dense projection of the concatenated
            [CLS] vectors.
    """
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        tower_a = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        tower_b = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids2,
            input_mask=input_mask2,
            token_type_ids=segment_ids2,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    cls_a = tower_a.get_sequence_output()[:, 0, :]
    cls_b = tower_b.get_sequence_output()[:, 0, :]
    rep = tf.concat([cls_a, cls_b], axis=1)
    # Per-token states are joined on the feature axis (not the sequence axis).
    self.sequence_output = tf.concat(
        [tower_a.get_sequence_output(), tower_b.get_sequence_output()], axis=2)
    pooler = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    self.pooled_output = pooler(rep)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator (sero classification, predict-oriented).

    Builds a BERT classifier head, restores BERT weights from the
    checkpoint, and returns a rank-prediction estimator spec. `loss` is a
    constant 0 because this path is used for prediction, not training.
    """
    tf_logging.info("model_fn_sero_classification")
    log_features(features)
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=train_config.use_one_hot_embeddings,
    )
    pooled_output = model.get_pooled_output()
    if is_training:
        pooled_output = dropout(pooled_output, 0.1)
    logits = get_prediction_structure(modeling_opt, pooled_output)
    loss = 0
    tvars = tf.compat.v1.trainable_variables()
    assignment_fn = assignment_map.get_bert_assignment_map
    initialized_variable_names, init_fn = get_init_fn(tvars, train_config.init_checkpoint, assignment_fn)
    scaffold_fn = get_tpu_scaffold_or_init(init_fn, train_config.use_tpu)
    log_var_assignments(tvars, initialized_variable_names)
    # FIX: the original branched on `modeling_opt == "multi_label_hinge"` but
    # both branches built byte-identical dicts; collapsed to one construction.
    predictions = {
        "input_ids": input_ids,
        "logits": logits,
    }
    # Pass through auxiliary identifier fields when the input provides them.
    useful_inputs = ["data_id", "input_ids2", "data_ids"]
    for input_name in useful_inputs:
        if input_name in features:
            predictions[input_name] = features[input_name]
    output_spec = rank_predict_estimator_spec(logits, mode, scaffold_fn, predictions)
    return output_spec
class ProjectedMaxPooling(BertModelInterface):
    """BERT wrapper whose pooled output is the mean over sequence positions.

    NOTE(review): despite the name, the projection layer is disabled
    (previously commented out) and the pooling is reduce_mean, not max —
    confirm this is intended.
    """

    def __init__(self, config, is_training, input_ids, input_mask=None,
                 token_type_ids=None, use_one_hot_embeddings=True, scope=None):
        super(ProjectedMaxPooling, self).__init__()
        # Deep-copy so local config mutations cannot leak back to the caller.
        config = copy.deepcopy(config)
        self.config = config
        self.vector_size = config.vector_size
        self.bert_model = BertModel(config, is_training, input_ids, input_mask,
                                    token_type_ids, use_one_hot_embeddings, scope)

    def get_pooled_output(self):
        # Projection to vector_size is disabled; pool the raw hidden states.
        hidden_states = self.bert_model.get_sequence_output()
        return tf.reduce_mean(hidden_states, axis=1)
def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True, scope=None):
    """Wrap a BertModel over a privately deep-copied config."""
    super(ProjectedMaxPooling, self).__init__()
    # Copy first so downstream mutations never touch the caller's config.
    own_config = copy.deepcopy(config)
    self.config = own_config
    self.vector_size = own_config.vector_size
    self.bert_model = BertModel(own_config, is_training, input_ids, input_mask,
                                token_type_ids, use_one_hot_embeddings, scope)
class AddLayerSharingModel:
    """Shared BERT over [LM batch ; target-task batch], with one extra
    transformer layer applied only to the target-task rows.

    The target-task feature is produced by `mimic_pooling` over the extra
    layer's output.
    """

    def __init__(self, config, use_one_hot_embeddings, is_training,
                 masked_input_ids, input_mask, segment_ids,
                 tt_input_ids, tt_input_mask, tt_segment_ids):
        joined_ids = tf.concat([masked_input_ids, tt_input_ids], axis=0)
        joined_mask = tf.concat([input_mask, tt_input_mask], axis=0)
        joined_segments = tf.concat([segment_ids, tt_segment_ids], axis=0)
        self.config = config
        # Boundary between LM rows and target-task rows in the joined batch.
        self.lm_batch_size, _ = get_shape_list2(masked_input_ids)
        self.model = BertModel(config, is_training, joined_ids, joined_mask,
                               joined_segments, use_one_hot_embeddings)
        initializer = base.create_initializer(config.initializer_range)
        self.tt_layer = ForwardLayer(config, initializer)
        self.tt_input_mask = tt_input_mask

        # Rows past the LM batch belong to the target task.
        seq_output = self.model.get_sequence_output()[self.lm_batch_size:]
        tt_batch_size, seq_length = get_shape_list2(tt_input_ids)
        tt_attention_mask = create_attention_mask_from_input_mask2(
            seq_output, self.tt_input_mask)
        print('tt_attention_mask', tt_attention_mask.shape)
        print("seq_output", seq_output.shape)
        seq_output = self.tt_layer.apply_3d(seq_output, tt_batch_size,
                                            seq_length, tt_attention_mask)
        self.tt_feature = mimic_pooling(seq_output, self.config.hidden_size,
                                        self.config.initializer_range)

    def lm_sequence_output(self):
        # Shared-encoder sequence output for the LM rows only.
        return self.model.get_sequence_output()[:self.lm_batch_size]

    def get_embedding_table(self):
        return self.model.get_embedding_table()

    def get_tt_feature(self):
        return self.tt_feature
def __init__(self,
             config,  # NOTE: an extended config, not the plain BERT config
             is_training,
             input_ids,
             input_mask,
             token_type_ids,
             use_one_hot_embeddings,
             features,
             ):
    """Encode the main input paired with each of `max_context` context
    windows through one BERT, then average over the first `num_context`
    context encodings to produce self.pooled_output.
    """
    super(MultiContextEncoder, self).__init__()
    self.config = config
    if not is_training:
        # Disable dropout outside training.
        config.set_attrib("hidden_dropout_prob", 0.0)
        config.set_attrib("attention_probs_dropout_prob", 0.0)

    batch_size, _ = get_shape_list(input_ids)

    def reform_context(context):
        return tf.reshape(context, [-1, config.max_context, config.max_context_length])

    def combine(main, per_context):
        # Tile the main sequence once per context window, append that
        # window's tokens, and flatten to [batch * max_context, seq].
        tiled = tf.tile(tf.expand_dims(main, 1), [1, config.max_context, 1])
        paired = tf.concat([tiled, reform_context(per_context)], 2)
        return tf.reshape(paired, [batch_size * config.max_context, -1])

    context_input_ids = features["context_input_ids"]
    context_input_mask = features["context_input_mask"]
    # Context tokens are given their own segment id (2).
    context_segment_ids = tf.ones_like(features["context_segment_ids"], tf.int32) * 2
    self.module = BertModel(config=config,
                            is_training=is_training,
                            input_ids=combine(input_ids, context_input_ids),
                            input_mask=combine(input_mask, context_input_mask),
                            token_type_ids=combine(token_type_ids, context_segment_ids),
                            use_one_hot_embeddings=use_one_hot_embeddings,
                            )
    # The same dense layer is applied both before and after the mean
    # (shared weights) — matches the original wiring.
    dense_layer_setup = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    h1 = self.module.get_pooled_output()
    h2 = tf.reshape(dense_layer_setup(h1), [batch_size, config.max_context, -1])
    h3 = tf.reduce_mean(h2[:, :config.num_context], axis=1)
    self.pooled_output = dense_layer_setup(h3)
def tlm_prefer_hard(bert_config, use_one_hot_embeddings, features):
    """Sampling score that prefers hard examples.

    Scores each example as -prob1 under a frozen BERT, so lower-probability
    (harder) examples get a higher score and are sampled more often.
    """
    encoder = BertModel(
        config=bert_config,
        is_training=False,  # scoring only
        input_ids=features["input_ids"],
        input_mask=features["input_mask"],
        token_type_ids=features["segment_ids"],
        use_one_hot_embeddings=use_one_hot_embeddings,
    )
    loss_model = IndependentLossModel(bert_config)
    loss_model.build_predictions(encoder.get_sequence_output())
    # Higher score => sampled more often; hence the negation.
    return -loss_model.prob1
def __init__(self, sero_config, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True, scope=None):
    """Dual tower: a standard BERT plus a SeroEpsilon encoder over the same
    inputs; pooled output is a tanh projection of the two towers'
    first-token vectors concatenated.

    NOTE(review): the exact variable-scope nesting of the sero calls was
    ambiguous in the original formatting — confirm variable names against
    the checkpoint assignment map.
    """
    super(DualSeroBertModel, self).__init__()
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        model_1 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        with tf.compat.v1.variable_scope("sero"):
            model = SeroEpsilon(sero_config, is_training, use_one_hot_embeddings)
            batch_size, _ = get_shape_list(input_mask)
            use_context = tf.ones([batch_size, 1], tf.int32)
            # SeroEpsilon consumes [batch, num_segment, seq]-shaped inputs,
            # so add a singleton segment axis.
            input_ids = tf.expand_dims(input_ids, 1)
            input_mask = tf.expand_dims(input_mask, 1)
            segment_ids = tf.expand_dims(token_type_ids, 1)
            sequence_output2 = model.network_stacked(
                input_ids, input_mask, segment_ids, use_context)
    # First-token representation from each tower.
    model_1_first_token = model_1.get_sequence_output()[:, 0, :]
    model_2_first_token = sequence_output2[:, 0, :]
    rep = tf.concat([model_1_first_token, model_2_first_token], axis=1)
    dense_layer = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    pooled_output = dense_layer(rep)
    self.pooled_output = pooled_output
def __init__(self, config, use_one_hot_embeddings, is_training,
             masked_input_ids, input_mask, segment_ids,
             nli_input_ids, nli_input_mask, nli_segment_ids):
    """Share one BERT across the masked-LM batch and the NLI batch by
    concatenating them along the batch axis."""
    joined_ids = tf.concat([masked_input_ids, nli_input_ids], axis=0)
    joined_mask = tf.concat([input_mask, nli_input_mask], axis=0)
    joined_segments = tf.concat([segment_ids, nli_segment_ids], axis=0)
    # Rows [0, batch_size) of the model outputs belong to the LM inputs.
    self.batch_size, _ = get_shape_list2(masked_input_ids)
    self.model = BertModel(config, is_training, joined_ids, joined_mask,
                           joined_segments, use_one_hot_embeddings)
def tlm2(bert_config, use_one_hot_embeddings, features):
    """Score each example as prob2 - prob1 under a frozen BERT.

    Higher output where the second head assigns more probability than the
    first. (Removed dead locals `hp`, `voca_size`, `sequence_shape` from the
    original — none were used.)
    """
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    encode_model = BertModel(
        config=bert_config,
        is_training=False,  # scoring only; no dropout
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
    )
    loss_model = IndependentLossModel(bert_config)
    loss_model.build_predictions(encode_model.get_sequence_output())
    output = -(loss_model.prob1 - loss_model.prob2)
    return output
def __init__(self, hp, voca_size, method, is_training=True):
    """NLI classifier over the BERT sequence output.

    `method` is accepted for interface compatibility but is not read in
    this constructor.
    """
    bert_config = BertConfig(
        vocab_size=voca_size,
        hidden_size=hp.hidden_units,
        num_hidden_layers=hp.num_blocks,
        num_attention_heads=hp.num_heads,
        intermediate_size=hp.intermediate_size,
        type_vocab_size=hp.type_vocab_size,
    )
    max_seq = hp.seq_max
    use_tpu = False  # one-hot embedding lookup only pays off on TPU
    task = Classification(data_generator.NLI.nli_info.num_classes)

    input_ids = placeholder(tf.int64, [None, max_seq])
    input_mask = placeholder(tf.int64, [None, max_seq])
    segment_ids = placeholder(tf.int64, [None, max_seq])
    label_ids = placeholder(tf.int64, [None])
    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids

    self.model = BertModel(config=bert_config,
                           is_training=is_training,
                           input_ids=input_ids,
                           input_mask=input_mask,
                           token_type_ids=segment_ids,
                           use_one_hot_embeddings=use_tpu)
    pred, loss = task.predict(self.model.get_sequence_output(), label_ids, True)
    self.logits = task.logits
    self.sout = tf.nn.softmax(self.logits)
    self.pred = pred
    self.loss = loss
    self.acc = task.acc
def __init__(self, config, is_training, use_one_hot_embeddings=True, features=None, scope=None):
    """MES_pad: zero-pad inputs from the data sequence length up to the
    trained sequence length, then classify with BERT.

    NOTE(review): two BertModels are built over the same padded inputs —
    the first (under dual_model_prefix1) produces `logits_2d`, which is
    unused; only the second model's logits feed self.logits / self.loss.
    The scope for the second model is commented out, so its variables live
    outside dual_model_prefix2 — confirm against the checkpoint mapping.
    """
    super(MES_pad, self).__init__()
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    # Pad from the dataset's sequence length to the length the model was
    # trained with.
    trained_l = config.trained_seq_length
    data_l = config.data_seq_length
    batch_size, _ = get_shape_list2(input_ids)
    add_len = trained_l - data_l
    zero_pad = tf.zeros([batch_size, add_len], tf.int32)
    input_ids = tf.concat([input_ids, zero_pad], axis=1)
    input_mask = tf.concat([input_mask, zero_pad], axis=1)
    segment_ids = tf.concat([segment_ids, zero_pad], axis=1)
    # [Batch, unit_seq_length]
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        model = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
        pooled = model.get_pooled_output()
        # Unused downstream; presumably kept so prefix1 variables exist for
        # checkpoint restoration — TODO confirm.
        logits_2d = tf.keras.layers.Dense(2, name="cls_dense")(pooled)
    # with tf.compat.v1.variable_scope(dual_model_prefix2):
    model = BertModel(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
    )
    logits = tf.keras.layers.Dense(2, name="cls_dense")(
        model.get_pooled_output())
    self.logits = logits
    label_ids = tf.reshape(label_ids, [-1])
    loss_arr = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label_ids)
    layer2_loss = tf.reduce_mean(loss_arr)
    self.loss = layer2_loss
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Domain-adversarial classification: a class head plus a domain head fed
    through a gradient-reversal layer; total loss is
    pred_loss + alpha * domain_loss.
    """
    tf_logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf_logging.info(" name = %s, shape = %s" % (name, features[name].shape))
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    if mode == tf.estimator.ModeKeys.PREDICT:
        # No labels at predict time; use dummy ones so downstream ops build.
        label_ids = tf.ones([input_ids.shape[0]], dtype=tf.int32)
    else:
        label_ids = features["label_ids"]
    label_ids = tf.reshape(label_ids, [-1])
    if "is_real_example" in features:
        is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
        is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
    domain_ids = features["domain_ids"]
    domain_ids = tf.reshape(domain_ids, [-1])
    # Per-example flag: whether the class label should contribute to loss.
    is_valid_label = features["is_valid_label"]
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model_1 = BertModel(
        config=model_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=train_config.use_one_hot_embeddings,
    )
    pooled = model_1.get_pooled_output()
    if is_training:
        pooled = dropout(pooled, 0.1)
    logits = tf.keras.layers.Dense(train_config.num_classes, name="cls_dense")(pooled)
    pred_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label_ids)
    num_domain = 2
    # Gradient reversal makes the encoder adversarial to the domain head.
    pooled_for_domain = grad_reverse(pooled)
    domain_logits = tf.keras.layers.Dense(
        num_domain, name="domain_dense")(pooled_for_domain)
    domain_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=domain_logits, labels=domain_ids)
    # Class loss only counts examples whose label is valid.
    pred_loss = tf.reduce_mean(pred_losses * tf.cast(is_valid_label, tf.float32))
    domain_loss = tf.reduce_mean(domain_losses)
    tf.compat.v1.summary.scalar('domain_loss', domain_loss)
    tf.compat.v1.summary.scalar('pred_loss', pred_loss)
    alpha = model_config.alpha
    loss = pred_loss + alpha * domain_loss
    tvars = tf.compat.v1.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if train_config.init_checkpoint:
        initialized_variable_names, init_fn = get_init_fn(
            train_config, tvars)
        scaffold_fn = get_tpu_scaffold_or_init(init_fn, train_config.use_tpu)
    log_var_assignments(tvars, initialized_variable_names)
    TPUEstimatorSpec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec
    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Passing tvars=None lets the optimizer pick up all trainables.
        tvars = None
        train_op = optimization.create_optimizer_from_config(
            loss, train_config, tvars)
        output_spec = TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op,
                                       scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics = (classification_metric_fn,
                        [logits, label_ids, is_real_example])
        output_spec = TPUEstimatorSpec(mode=mode, loss=loss,
                                       eval_metrics=eval_metrics,
                                       scaffold_fn=scaffold_fn)
    else:
        predictions = {
            "input_ids": input_ids,
            "logits": logits,
        }
        if "data_id" in features:
            predictions['data_id'] = features['data_id']
        output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    return output_spec
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Regression head over BERT pooled output, trained with MAE against
    scaled labels, with per-example weight |scaled label|.
    """
    tf_logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf_logging.info(" name = %s, shape = %s" % (name, features[name].shape))
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    if mode == tf.estimator.ModeKeys.PREDICT:
        # Dummy float labels at predict time so the loss graph still builds.
        label_ids = tf.ones([input_ids.shape[0]], dtype=tf.float32)
    else:
        label_ids = features["label_ids"]
    label_ids = tf.reshape(label_ids, [-1])
    if "is_real_example" in features:
        is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
        is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = BertModel(
        config=model_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=train_config.use_one_hot_embeddings,
    )
    pooled = model.get_pooled_output()
    if is_training:
        pooled = dropout(pooled, 0.1)
    logits = tf.keras.layers.Dense(train_config.num_classes, name="cls_dense")(pooled)
    # Scale labels, and weight each example by the magnitude of its
    # (scaled) label so larger targets contribute more.
    scale = model_config.scale
    label_ids = scale * label_ids
    weight = tf.abs(label_ids)
    loss_arr = tf.keras.losses.MAE(y_true=label_ids, y_pred=logits)
    loss_arr = loss_arr * weight
    loss = tf.reduce_mean(input_tensor=loss_arr)
    tvars = tf.compat.v1.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if train_config.init_checkpoint:
        initialized_variable_names, init_fn = get_init_fn(train_config, tvars)
        scaffold_fn = get_tpu_scaffold_or_init(init_fn, train_config.use_tpu)
    log_var_assignments(tvars, initialized_variable_names)
    TPUEstimatorSpec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec

    def metric_fn(logits, label, is_real_example):
        """Eval metric: MAE weighted by the is_real_example mask."""
        mae = tf.compat.v1.metrics.mean_absolute_error(
            labels=label, predictions=logits, weights=is_real_example)
        return {
            "mae": mae
        }

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # tvars=None lets the optimizer use all trainable variables.
        tvars = None
        train_op = optimization.create_optimizer_from_config(loss, train_config, tvars)
        output_spec = TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op,
                                       scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics = (metric_fn, [
            logits, label_ids, is_real_example
        ])
        output_spec = TPUEstimatorSpec(mode=mode, loss=loss,
                                       eval_metrics=eval_metrics,
                                       scaffold_fn=scaffold_fn)
    else:
        predictions = {
            "input_ids": input_ids,
            "logits": logits,
        }
        if "data_id" in features:
            predictions['data_id'] = features['data_id']
        output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    return output_spec
def __init__(self, config, is_training, use_one_hot_embeddings=True, features=None, scope=None):
    """MES_pred_with_layer1: two-stage multi-evidence selection model.

    Stage 1 scores every document window with a BERT classifier
    (dual_model_prefix1); stage 2 re-reads only the highest-scoring valid
    window with a second BERT (dual_model_prefix2). Total loss is
    alpha * layer1_loss + layer2_loss.
    """
    super(MES_pred_with_layer1, self).__init__()
    alpha = config.alpha
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    unit_length = config.max_seq_length
    d_seq_length = config.max_d_seq_length
    num_window = int(d_seq_length / unit_length)
    batch_size, _ = get_shape_list2(input_ids)
    # [Batch, num_window, unit_seq_length]
    stacked_input_ids, stacked_input_mask, stacked_segment_ids = split_input(
        input_ids, input_mask, segment_ids, d_seq_length, unit_length)
    # A window is valid if it is the first window or carries enough
    # document tokens (segment-2 count > 10) to be worth scoring.
    is_first_window = tf.concat([
        tf.ones([batch_size, 1], tf.bool),
        tf.zeros([batch_size, num_window - 1], tf.bool)
    ], axis=1)
    num_content_tokens = tf.reduce_sum(stacked_segment_ids, 2)
    has_enough_evidence = tf.less(10, num_content_tokens)
    is_valid_window = tf.logical_or(is_first_window, has_enough_evidence)
    is_valid_window_mask = tf.cast(is_valid_window, tf.float32)
    self.is_first_window = is_first_window
    self.num_content_tokens = num_content_tokens
    self.has_enough_evidence = has_enough_evidence
    self.is_valid_window = is_valid_window
    self.is_valid_window_mask = is_valid_window_mask
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        # All windows are scored in one flattened batch.
        model = BertModel(
            config=config,
            is_training=is_training,
            input_ids=r3to2(stacked_input_ids),
            input_mask=r3to2(stacked_input_mask),
            token_type_ids=r3to2(stacked_segment_ids),
            use_one_hot_embeddings=use_one_hot_embeddings,
        )

        def r2to3(arr):
            # Undo the flattening: [batch * num_window, d] -> [batch, num_window, d]
            return tf.reshape(arr, [batch_size, num_window, -1])

        # [Batch, num_window, window_length, hidden_size]
        pooled = model.get_pooled_output()
        logits_2d = tf.keras.layers.Dense(2, name="cls_dense")(pooled)
        logits_3d = r2to3(logits_2d)
    # Repeat the label across windows; assumes label_ids is [batch, 1]
    # — TODO confirm against the input pipeline.
    label_ids_repeat = tf.tile(label_ids, [1, num_window])
    loss_arr = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_3d, labels=label_ids_repeat)
    # Only valid windows contribute to the stage-1 loss.
    loss_arr = loss_arr * is_valid_window_mask
    layer1_loss = tf.reduce_mean(loss_arr)
    probs = tf.nn.softmax(logits_3d)[:, :, 1]  # [batch_size, num_window]
    # Exposed logits are the per-window stage-1 logits.
    self.logits = logits_3d
    # Probabilistic selection

    def select_seg(stacked_input_ids, indices):
        # indices : [batch_size, 1]
        return tf.gather(stacked_input_ids, indices, axis=1, batch_dims=1)

    # Pick the most confident valid window per example.
    valid_probs = probs * is_valid_window_mask
    max_seg = tf.argmax(valid_probs, axis=1)
    input_ids = select_seg(stacked_input_ids, max_seg)
    input_mask = select_seg(stacked_input_mask, max_seg)
    segment_ids = select_seg(stacked_segment_ids, max_seg)
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        # Stage 2: re-encode only the selected window.
        model = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
        logits = tf.keras.layers.Dense(2, name="cls_dense")(
            model.get_pooled_output())
    label_ids = tf.reshape(label_ids, [-1])
    loss_arr = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label_ids)
    layer2_loss = tf.reduce_mean(loss_arr)
    loss = alpha * layer1_loss + layer2_loss
    self.loss = loss
def __init__(self, config, is_training, use_one_hot_embeddings=True, features=None, scope=None):
    """MES_single: score every document window with one BERT and train
    with a soft-max-weighted per-window loss; exposed logits come from the
    highest-probability window.
    """
    super(MES_single, self).__init__()
    # NOTE(review): alpha is read from config but never used in this block
    # — confirm whether it was meant to weight the loss.
    alpha = config.alpha
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    unit_length = config.max_seq_length
    d_seq_length = config.max_d_seq_length
    num_window = int(d_seq_length / unit_length)
    batch_size, _ = get_shape_list2(input_ids)
    # [Batch, num_window, unit_seq_length]
    stacked_input_ids, stacked_input_mask, stacked_segment_ids = split_input(
        input_ids, input_mask, segment_ids, d_seq_length, unit_length)
    # Ignore the window if
    # 1. The window is not first window and
    #   1.1 All input_mask is 0
    #   1.2 Content is too short, number of document tokens (other than query tokens) < 10
    # [Batch, num_window]
    is_first_window = tf.concat([
        tf.ones([batch_size, 1], tf.bool),
        tf.zeros([batch_size, num_window - 1], tf.bool)
    ], axis=1)
    num_content_tokens = tf.reduce_sum(stacked_segment_ids, 2)
    has_enough_evidence = tf.less(10, num_content_tokens)
    is_valid_window = tf.logical_or(is_first_window, has_enough_evidence)
    is_valid_window_mask = tf.cast(is_valid_window, tf.float32)  # [batch, num_window]
    self.is_first_window = is_first_window
    self.num_content_tokens = num_content_tokens
    self.has_enough_evidence = has_enough_evidence
    self.is_valid_window = is_valid_window
    self.is_valid_window_mask = is_valid_window_mask
    # All windows are encoded in one flattened batch.
    model = BertModel(
        config=config,
        is_training=is_training,
        input_ids=r3to2(stacked_input_ids),
        input_mask=r3to2(stacked_input_mask),
        token_type_ids=r3to2(stacked_segment_ids),
        use_one_hot_embeddings=use_one_hot_embeddings,
    )

    def r2to3(arr):
        # Undo the flattening: [batch * num_window, d] -> [batch, num_window, d]
        return tf.reshape(arr, [batch_size, num_window, -1])

    # [Batch, num_window, window_length, hidden_size]
    pooled = model.get_pooled_output()
    logits_2d = tf.keras.layers.Dense(2, name="cls_dense")(pooled)
    logits_3d = r2to3(logits_2d)
    # Repeat the label across windows; assumes label_ids is [batch, 1]
    # — TODO confirm against the input pipeline.
    label_ids_repeat = tf.tile(label_ids, [1, num_window])  # [batch, num_window]
    loss_arr = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_3d, labels=label_ids_repeat)
    # Invalid windows contribute no loss.
    loss_arr = loss_arr * is_valid_window_mask
    probs = tf.nn.softmax(logits_3d)[:, :, 1]  # [batch_size, num_window]
    max_prob_window = tf.argmax(probs, axis=1)
    # Sharply concentrate the loss weight on the most confident valid
    # window (soft approximation of "apply loss only at the max").
    beta = 10
    loss_weight = tf.nn.softmax(probs * is_valid_window_mask * beta)
    loss_weight = loss_weight * is_valid_window_mask
    # apply loss if it is max
    loss = tf.reduce_mean(loss_arr * loss_weight)
    # Exposed logits: the logits of the highest-probability window.
    logits = tf.gather(logits_3d, max_prob_window, axis=1, batch_dims=1)
    self.logits = logits
    self.loss = loss
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Masked-LM training over either an APR model or plain BERT (selected by
    `model_name`), using precomputed (fixed) mask positions from the input.
    """
    log_features(features)

    def reform_a_input(raw_input):
        # Reshape flat features to [inner_batch_size, -1] (main stream).
        return tf.reshape(raw_input, [dict_run_config.inner_batch_size, -1])

    def reform_b_input(raw_input):
        # Reshape flat features to [def_per_batch, -1] (definition stream).
        return tf.reshape(raw_input, [dict_run_config.def_per_batch, -1])

    input_ids = reform_a_input(features["input_ids"])
    input_mask = reform_a_input(features["input_mask"])
    segment_ids = reform_a_input(features["segment_ids"])
    tf_logging.info("input_ids, input_mask")
    # input_ids = features["input_ids"]
    # input_mask = features["input_mask"]
    # segment_ids = features["segment_ids"]
    if mode == tf.estimator.ModeKeys.PREDICT:
        # Deterministic predictions.
        tf.random.set_seed(0)
        seed = 0
    else:
        seed = None
    # Dynamic masking is disabled; the mask comes precomputed in features.
    # tf_logging.info("Doing dynamic masking (random)")
    # masked_input_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights \
    #     = random_masking(input_ids, input_mask, train_config.max_predictions_per_seq, MASK_ID, seed)
    # if dict_run_config.prediction_op == "loss_fixed_mask" or train_config.fixed_mask:
    masked_input_ids = input_ids
    masked_lm_positions = reform_a_input(features["masked_lm_positions"])
    masked_lm_ids = reform_a_input(features["masked_lm_ids"])
    masked_lm_weights = reform_a_input(features["masked_lm_weights"])
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    if model_name == "APR":
        model = APR(
            masked_input_ids,
            input_mask,
            segment_ids,
            is_training,
            train_config.use_one_hot_embeddings,
            bert_config,
            ssdr_config,
            dict_run_config.def_per_batch,
            dict_run_config.inner_batch_size,
            dict_run_config.max_def_length,
        )
    elif model_name == "BERT":
        model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=masked_input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
    else:
        # Unknown model_name is a programming error.
        assert False
    masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs \
        = get_masked_lm_output(bert_config, model.get_sequence_output(),
                               model.get_embedding_table(),
                               masked_lm_positions, masked_lm_ids, masked_lm_weights)
    loss = masked_lm_loss
    tvars = tf.compat.v1.trainable_variables()
    assignment_fn = dict_model_fn.get_bert_assignment_map_for_dict
    initialized_variable_names, init_fn = align_checkpoint_twice(
        tvars, train_config.init_checkpoint, assignment_fn)
    scaffold_fn = get_tpu_scaffold_or_init(init_fn, train_config.use_tpu)
    log_var_assignments(tvars, initialized_variable_names)
    TPUEstimatorSpec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec
    if mode == tf.estimator.ModeKeys.TRAIN:
        if ssdr_config.compare_attrib_value_safe("use_two_lr", True):
            # Separate learning rates for the two parameter groups.
            tf_logging.info("Using two lr for each parts")
            train_op = create_optimizer_with_separate_lr(
                loss, train_config)
        else:
            tf_logging.info("Using single lr ")
            train_op = optimization.create_optimizer_from_config(
                loss, train_config)
        output_spec = TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op,
                                       training_hooks=[OomReportingHook()],
                                       scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics = (metric_fn_lm, [
            masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
            masked_lm_weights,
        ])
        output_spec = TPUEstimatorSpec(mode=mode, loss=loss,
                                       eval_metrics=eval_metrics,
                                       scaffold_fn=scaffold_fn)
    else:
        predictions = {
            "input_ids": input_ids,
            "masked_input_ids": masked_input_ids,
            "masked_lm_ids": masked_lm_ids,
            "masked_lm_example_loss": masked_lm_example_loss,
            "masked_lm_positions": masked_lm_positions,
        }
        output_spec = TPUEstimatorSpec(mode=mode, loss=loss,
                                       predictions=predictions,
                                       scaffold_fn=scaffold_fn)
    return output_spec
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Runs the same (un-masked) input through two separately-initialized BERT
    encoders ("MaybeBERT" / "MaybeNLI") and, per token, counts how many
    hidden units of encoder layer index 1 differ by less than `threshold`
    between the two models. Only PREDICT mode produces an EstimatorSpec.
    """
    logging.info("*** Features ***")
    for name in sorted(features.keys()):
        logging.info(" name = %s, shape = %s" % (name, features[name].shape))
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    next_sentence_labels = features["next_sentence_labels"]
    seed = 0
    threshold = 1e-2  # max per-unit abs difference counted as "preserved"
    logging.info("Doing All Masking")
    masked_input_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights \
        = random_masking(input_ids, input_mask, train_config.max_predictions_per_seq, MASK_ID, seed)
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    prefix1 = "MaybeBERT"
    prefix2 = "MaybeNLI"
    with tf.compat.v1.variable_scope(prefix1):
        # NOTE(review): feeds the original `input_ids`, not `masked_input_ids`;
        # the masking outputs above are only consumed by the LM loss below,
        # whose result is never used — confirm this is intentional.
        model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        (masked_lm_loss, masked_lm_example_loss1,
         masked_lm_log_probs2) = get_masked_lm_output(
             bert_config, model.get_sequence_output(),
             model.get_embedding_table(),
             masked_lm_positions, masked_lm_ids, masked_lm_weights)
        all_layers1 = model.get_all_encoder_layers()
    with tf.compat.v1.variable_scope(prefix2):
        model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        all_layers2 = model.get_all_encoder_layers()
    # Elementwise comparison of corresponding encoder layers.
    preserved_infos = []
    for a_layer, b_layer in zip(all_layers1, all_layers2):
        layer_diff = a_layer - b_layer
        is_preserved = tf.less(tf.abs(layer_diff), threshold)
        preserved_infos.append(is_preserved)
    t = tf.cast(preserved_infos[1], dtype=tf.int32)  # [batch_size, seq_len, dims]
    # Per-token count of "preserved" hidden units at encoder layer index 1.
    layer_1_count = tf.reduce_sum(t, axis=2)
    tvars = tf.compat.v1.trainable_variables()
    # Each prefix is restored from its own checkpoint.
    initialized_variable_names, init_fn = get_init_fn_for_two_checkpoints(
        train_config, tvars,
        train_config.init_checkpoint, prefix1,
        train_config.second_init_checkpoint, prefix2)
    scaffold_fn = get_tpu_scaffold_or_init(init_fn, train_config.use_tpu)
    log_var_assignments(tvars, initialized_variable_names)
    output_spec = None
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "input_ids": input_ids,
            "layer_count": layer_1_count
        }
        output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=None,
            predictions=predictions,
            scaffold_fn=scaffold_fn)
    # NOTE(review): TRAIN/EVAL fall through and return None — confirm this
    # model_fn is only ever used for prediction.
    return output_spec
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Trains an "explain" model to locate token spans whose deletion changes a
    (gradient-frozen) classifier's prediction. The classifier scores the
    original input plus `n_trial` perturbed copies in one forward pass; the
    best deletion per example becomes a binary token-level signal, and the
    explain model is trained to correlate its per-token logits with that
    signal (correlation-coefficient loss), masked per class by whether the
    perturbation was informative.
    """
    logging.info("*** Features ***")
    for name in sorted(features.keys()):
        logging.info(" name = %s, shape = %s" % (name, features[name].shape))
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    batch_size, seq_len = get_shape_list2(input_ids)
    n_trial = 5  # number of perturbed candidates generated per example
    logging.info("Doing All Masking")
    new_input_ids, new_segment_ids, new_input_mask, indice, length_arr = \
        candidate_gen(input_ids, input_mask, segment_ids, n_trial)
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    prefix_cls = "classification"
    prefix_explain = "explain"
    # Score originals and their candidates in a single batch.
    all_input_ids = tf.concat([input_ids, new_input_ids], axis=0)
    all_segment_ids = tf.concat([segment_ids, new_segment_ids], axis=0)
    all_input_mask = tf.concat([input_mask, new_input_mask], axis=0)
    with tf.compat.v1.variable_scope(prefix_cls):
        model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=all_input_ids,
            input_mask=all_input_mask,
            token_type_ids=all_segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        output_weights = tf.compat.v1.get_variable(
            "output_weights",
            [train_config.num_classes, bert_config.hidden_size],
            initializer=tf.compat.v1.truncated_normal_initializer(
                stddev=0.02))
        output_bias = tf.compat.v1.get_variable(
            "output_bias", [train_config.num_classes],
            initializer=tf.compat.v1.zeros_initializer())
        pooled = model.get_pooled_output()
        raw_logits = tf.matmul(pooled, output_weights, transpose_b=True)
        # The classifier acts as a fixed scorer: no gradient flows into it.
        logits = tf.stop_gradient(raw_logits)
        cls_logits = tf.nn.bias_add(logits, output_bias)
        cls_probs = tf.nn.softmax(cls_logits)
    orig_probs = cls_probs[:batch_size]
    new_probs = tf.reshape(cls_probs[batch_size:], [batch_size, n_trial, -1])
    best_run, informative = get_informative(new_probs, orig_probs)
    # informative.shape= [batch_size, num_clases]
    best_del_idx, best_del_len = select_best(best_run, indice, length_arr)
    # Binary per-token target: 1 inside the best deletion span.
    signal_label = get_mask(best_del_idx, best_del_len, seq_len)
    with tf.compat.v1.variable_scope(prefix_explain):
        model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        seq = model.get_sequence_output()
        output_weights = tf.compat.v1.get_variable(
            "output_weights",
            [train_config.num_classes, bert_config.hidden_size],
            initializer=tf.compat.v1.truncated_normal_initializer(
                stddev=0.02))
        output_bias = tf.compat.v1.get_variable(
            "output_bias", [train_config.num_classes],
            initializer=tf.compat.v1.zeros_initializer())
        logits = tf.matmul(seq, output_weights, transpose_b=True)
        ex_logits = tf.nn.bias_add(
            logits, output_bias)  # [batch, seq_len, num_class]
    # Flatten to [batch * num_class, seq_len] so each class row is scored
    # against the same token-level signal.
    ex_logits_flat = tf.reshape(tf.transpose(ex_logits, [0, 2, 1]),
                                [-1, seq_len])
    signal_label_flat = tf.cast(tf.reshape(signal_label, [-1, seq_len]),
                                tf.float32)
    losses_per_clas_flat = correlation_coefficient_loss(
        signal_label_flat, ex_logits_flat)  # [batch_size, num_class]
    losses_per_clas = tf.reshape(losses_per_clas_flat, [batch_size, -1])
    # Zero out classes for which the perturbation was not informative.
    losses_per_clas = losses_per_clas * tf.cast(informative, tf.float32)
    losses = tf.reduce_mean(losses_per_clas, axis=1)
    loss = tf.reduce_mean(losses)
    tvars = tf.compat.v1.trainable_variables()
    scaffold_fn = None
    # Explain and classifier towers restore from separate checkpoints.
    initialized_variable_names, init_fn = get_init_fn_for_two_checkpoints(
        train_config, tvars,
        train_config.init_checkpoint, prefix_explain,
        train_config.second_init_checkpoint, prefix_cls)
    if train_config.use_tpu:
        def tpu_scaffold():
            init_fn()
            return tf.compat.v1.train.Scaffold()
        scaffold_fn = tpu_scaffold
    else:
        init_fn()
    log_var_assignments(tvars, initialized_variable_names)
    output_spec = None
    TPUEstimatorSpec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer_from_config(
            loss, train_config)
        output_spec = TPUEstimatorSpec(mode=mode,
                                       loss=loss,
                                       train_op=train_op,
                                       scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "input_ids": input_ids,
            "ex_logits": ex_logits,
            "logits": logits,
        }
        output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=None,
            predictions=predictions,
            scaffold_fn=scaffold_fn)
    # NOTE(review): EVAL mode falls through and returns None — confirm that
    # is acceptable for every estimator configuration using this fn.
    return output_spec
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Compares per-position masked-LM losses of two separately-initialized BERT
    models ("MaybeBERT" / "MaybeBFN") under `n_trial` planned maskings of the
    same batch, and emits the losses grouped per example as predictions.
    Only PREDICT mode produces an EstimatorSpec.
    """
    logging.info("*** Features ***")
    for name in sorted(features.keys()):
        logging.info(" name = %s, shape = %s" % (name, features[name].shape))
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    next_sentence_labels = features["next_sentence_labels"]
    n_trial = 25  # number of masking plans applied to each example
    logging.info("Doing All Masking")
    masked_input_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights \
        = planned_masking(input_ids, input_mask, train_config.max_predictions_per_seq, MASK_ID, n_trial)
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    # planned_masking repeats the batch n_trial times; tile mask/segments
    # to match the [n_trial * batch, seq] layout of masked_input_ids.
    repeat_input_mask = tf.tile(input_mask, [n_trial, 1])
    repeat_segment_ids = tf.tile(segment_ids, [n_trial, 1])
    prefix1 = "MaybeBERT"
    prefix2 = "MaybeBFN"
    with tf.compat.v1.variable_scope(prefix1):
        model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=masked_input_ids,
            input_mask=repeat_input_mask,
            token_type_ids=repeat_segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        (masked_lm_loss, masked_lm_example_loss1,
         masked_lm_log_probs2) = get_masked_lm_output(
             bert_config, model.get_sequence_output(),
             model.get_embedding_table(),
             masked_lm_positions, masked_lm_ids, masked_lm_weights)
    with tf.compat.v1.variable_scope(prefix2):
        model = BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=masked_input_ids,
            input_mask=repeat_input_mask,
            token_type_ids=repeat_segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        # NOTE(review): this rebinds `masked_lm_log_probs2` (same name as the
        # first model's log-probs) — harmless since neither is used later,
        # but likely a copy-paste slip.
        (masked_lm_loss, masked_lm_example_loss2,
         masked_lm_log_probs2) = get_masked_lm_output(
             bert_config, model.get_sequence_output(),
             model.get_embedding_table(),
             masked_lm_positions, masked_lm_ids, masked_lm_weights)
    n_mask = train_config.max_predictions_per_seq

    def reform(t):
        # [n_trial * batch, n_mask] -> [batch, n_trial, n_mask]
        t = tf.reshape(t, [n_trial, -1, n_mask])
        t = tf.transpose(t, [1, 0, 2])
        return t

    grouped_positions = reform(masked_lm_positions)
    grouped_loss1 = reform(masked_lm_example_loss1)
    grouped_loss2 = reform(masked_lm_example_loss2)
    tvars = tf.compat.v1.trainable_variables()
    scaffold_fn = None
    # Each prefix restores from its own checkpoint.
    initialized_variable_names, init_fn = get_init_fn_for_two_checkpoints(
        train_config, tvars,
        train_config.init_checkpoint, prefix1,
        train_config.second_init_checkpoint, prefix2)
    if train_config.use_tpu:
        def tpu_scaffold():
            init_fn()
            return tf.compat.v1.train.Scaffold()
        scaffold_fn = tpu_scaffold
    else:
        init_fn()
    log_var_assignments(tvars, initialized_variable_names)
    output_spec = None
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids,
            "grouped_positions": grouped_positions,
            "grouped_loss1": grouped_loss1,
            "grouped_loss2": grouped_loss2,
        }
        output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=None,
            predictions=predictions,
            scaffold_fn=scaffold_fn)
    # NOTE(review): TRAIN/EVAL fall through and return None — confirm this
    # model_fn is only ever used for prediction.
    return output_spec
def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True,
             features=None, scope=None):
    """Dual-BERT two-input model with an optional mask on tower 2.

    Encodes the positional inputs with tower 1 and the ``features``-supplied
    second inputs (``input_ids2`` / ``input_mask2`` / ``segment_ids2``) with
    tower 2. Tower 2's [CLS] vector is scaled by a scalar chosen via
    ``config.model_option`` ("0" drops it, "1" keeps it, "random" scales by a
    uniform random scalar), then concatenated with tower 1's [CLS] vector and
    projected through a tanh dense layer into ``self.pooled_output``.
    ``self.sequence_output`` is the feature-wise concat of both towers'
    sequence outputs.

    Fix: removed leftover debug ``print`` statements.
    """
    super(DualBertTwoInputModelEx, self).__init__()
    # Second tower's input arrives through the features dict.
    input_ids2 = features["input_ids2"]
    input_mask2 = features["input_mask2"]
    segment_ids2 = features["segment_ids2"]
    modeling_option = config.model_option
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        model_1 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        model_2 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids2,
            input_mask=input_mask2,
            token_type_ids=segment_ids2,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    model_1_first_token = model_1.get_sequence_output()[:, 0, :]
    model_2_first_token = model_2.get_sequence_output()[:, 0, :]
    # Scale factor for tower 2's contribution, keyed by the modeling option.
    mask_scalar = {
        "0": 0.,
        "1": 1.,
        "random": tf.random.uniform(shape=[], minval=0., maxval=1.)
    }[modeling_option]
    model_2_first_token = mask_scalar * model_2_first_token
    rep = tf.concat([model_1_first_token, model_2_first_token], axis=1)
    self.sequence_output = tf.concat(
        [model_1.get_sequence_output(), model_2.get_sequence_output()],
        axis=2)
    dense_layer = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    pooled_output = dense_layer(rep)
    self.pooled_output = pooled_output
def __init__(self, config, is_training, use_one_hot_embeddings=True,
             features=None, scope=None):
    """Two-stage long-document classifier (MES_sel).

    Stage 1 splits the document into ``num_window`` fixed-length segments and
    scores each with a shared BERT tower; ``layer1_loss`` trains that tower
    against the document label repeated per window. Stage 2 re-encodes only
    the segment with the highest stage-1 positive probability and classifies
    it (``layer2_loss``). Final: ``self.loss = alpha * layer1_loss +
    layer2_loss``; ``self.logits`` holds the stage-2 logits.

    Bug fixed: the reshape producing ``logits_3d`` had been commented out,
    leaving ``logits_3d`` undefined (NameError) at the stage-1 loss; the
    reshape is restored.
    """
    super(MES_sel, self).__init__()
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    unit_length = config.max_seq_length
    d_seq_length = config.max_d_seq_length
    num_window = int(d_seq_length / unit_length)
    batch_size, _ = get_shape_list2(input_ids)
    # [Batch, num_window, unit_seq_length]
    stacked_input_ids, stacked_input_mask, stacked_segment_ids = split_input(
        input_ids, input_mask, segment_ids, d_seq_length, unit_length)
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        model = BertModel(
            config=config,
            is_training=is_training,
            input_ids=r3to2(stacked_input_ids),
            input_mask=r3to2(stacked_input_mask),
            token_type_ids=r3to2(stacked_segment_ids),
            use_one_hot_embeddings=use_one_hot_embeddings,
        )

        def r2to3(arr):
            # [batch * num_window, X] -> [batch, num_window, X]
            return tf.reshape(arr, [batch_size, num_window, -1])

        # [Batch, num_window, window_length, hidden_size]
        pooled = model.get_pooled_output()
        logits_2d = tf.keras.layers.Dense(2, name="cls_dense")(pooled)
        # BUGFIX: this reshape was commented out, so `logits_3d` below was
        # undefined and graph construction raised NameError.
        logits_3d = r2to3(logits_2d)
        # assumes label_ids is [batch, 1] — tiling yields one label per
        # window; TODO confirm against the input pipeline.
        label_ids_repeat = tf.tile(label_ids, [1, num_window])
        loss_arr = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits_3d, labels=label_ids_repeat)
        layer1_loss = tf.reduce_mean(loss_arr)
        probs = tf.nn.softmax(logits_3d)[:, :, 1]  # [batch_size, num_window]

    # Probabilistic selection
    def select_seg(stacked_input_ids, indices):
        # indices : [batch_size, 1]
        return tf.gather(stacked_input_ids, indices, axis=1, batch_dims=1)

    # Pick, per example, the window with the highest positive probability.
    max_seg = tf.argmax(probs, axis=1)
    input_ids = select_seg(stacked_input_ids, max_seg)
    input_mask = select_seg(stacked_input_mask, max_seg)
    segment_ids = select_seg(stacked_segment_ids, max_seg)
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        model = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
        logits = tf.keras.layers.Dense(2, name="cls_dense")(
            model.get_pooled_output())
    self.logits = logits
    label_ids = tf.reshape(label_ids, [-1])
    loss_arr = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label_ids)
    layer2_loss = tf.reduce_mean(loss_arr)
    alpha = 0.1  # weight of the stage-1 (per-window) loss
    loss = alpha * layer1_loss + layer2_loss
    self.loss = loss
def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True,
             features=None, scope=None):
    """Triple-BERT mixture-of-predictions.

    Tower 3 emits a two-way softmax gate. ``self.pooled_output`` is the
    gate-weighted mixture of tower 1's prediction (first two columns of a
    three-way softmax) and tower 2's two-way softmax prediction. The gate's
    second column is exposed as ``self.rel_score``.
    """
    super(TripleBertWeighted, self).__init__()
    # Towers 2 and 3 read their inputs from the features dict.
    second_ids = features["input_ids2"]
    second_mask = features["input_mask2"]
    second_seg = features["segment_ids2"]
    third_ids = features["input_ids3"]
    third_mask = features["input_mask3"]
    third_seg = features["segment_ids3"]

    def apply_binary_dense(vector):
        # Two-way softmax head (fresh weights per variable scope).
        head = tf.keras.layers.Dense(
            2,
            activation=tf.keras.activations.softmax,
            name="cls_dense",
            kernel_initializer=create_initializer(config.initializer_range))
        return head(vector)

    with tf.compat.v1.variable_scope(triple_model_prefix1):
        model_1 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
        ternary_head = tf.keras.layers.Dense(
            3,
            activation=tf.keras.activations.softmax,
            name="cls_dense",
            kernel_initializer=create_initializer(config.initializer_range))
        # Keep only the first two of the three softmax columns.
        model_1_pred = ternary_head(model_1.get_pooled_output())[:, :2]
    with tf.compat.v1.variable_scope(triple_model_prefix2):
        model_2 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=second_ids,
            input_mask=second_mask,
            token_type_ids=second_seg,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
        model_2_pred = apply_binary_dense(model_2.get_pooled_output())
    with tf.compat.v1.variable_scope(triple_model_prefix3):
        model_3 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=third_ids,
            input_mask=third_mask,
            token_type_ids=third_seg,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
        model_3_pred = apply_binary_dense(model_3.get_pooled_output())
    # Option : initialize dense
    weight_a = model_3_pred[:, 0:1]
    weight_b = model_3_pred[:, 1:2]
    self.rel_score = weight_b
    self.pooled_output = model_1_pred * weight_a + model_2_pred * weight_b
def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True,
             features=None, scope=None):
    """Triple-BERT with a learned soft mask on the second tower.

    Towers 1 and 2 encode the two paired inputs; tower 3 encodes a third
    input and emits a two-way softmax whose second column scales tower 2's
    [CLS] vector before the two [CLS] vectors are concatenated and projected
    (tanh dense) into ``self.pooled_output``. The scaling factor is kept as
    ``self.rel_score``; ``self.sequence_output`` is the feature-wise concat
    of towers 1 and 2.
    """
    super(TripleBertMasking, self).__init__()
    # Towers 2 and 3 read their inputs from the features dict.
    second_ids = features["input_ids2"]
    second_mask = features["input_mask2"]
    second_seg = features["segment_ids2"]
    third_ids = features["input_ids3"]
    third_mask = features["input_mask3"]
    third_seg = features["segment_ids3"]
    with tf.compat.v1.variable_scope(triple_model_prefix1):
        model_1 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(triple_model_prefix2):
        model_2 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=second_ids,
            input_mask=second_mask,
            token_type_ids=second_seg,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(triple_model_prefix3):
        model_3 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=third_ids,
            input_mask=third_mask,
            token_type_ids=third_seg,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    cls_1 = model_1.get_sequence_output()[:, 0, :]
    cls_2 = model_2.get_sequence_output()[:, 0, :]
    # Gate from tower 3's pooled output: column 1 is the relevance weight.
    gate_probs = tf.keras.layers.Dense(
        2,
        activation=tf.keras.activations.softmax,
        kernel_initializer=create_initializer(config.initializer_range))(
            model_3.get_pooled_output())
    mask_scalar = gate_probs[:, 1:2]
    self.rel_score = mask_scalar
    gated_cls_2 = mask_scalar * cls_2
    fused = tf.concat([cls_1, gated_cls_2], axis=1)
    self.sequence_output = tf.concat(
        [model_1.get_sequence_output(), model_2.get_sequence_output()],
        axis=2)
    projection = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    self.pooled_output = projection(fused)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Dual-BERT confidence-weighted classifier: tower 1 predicts the class
    logits; tower 2 predicts a confidence (softmax column 1). Per-example
    loss is ``cls_loss * confidence + (1 - confidence) * k``, then
    class-weighted by `alpha` via `apply_weighted_loss`.
    """
    tf_logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf_logging.info(" name = %s, shape = %s" % (name, features[name].shape))
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    if mode == tf.estimator.ModeKeys.PREDICT:
        # No labels at predict time; dummy ones keep the graph buildable.
        label_ids = tf.ones([input_ids.shape[0]], dtype=tf.int32)
    else:
        label_ids = features["label_ids"]
        label_ids = tf.reshape(label_ids, [-1])
    if "is_real_example" in features:
        is_real_example = tf.cast(features["is_real_example"],
                                  dtype=tf.float32)
    else:
        is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    input_ids2 = features["input_ids2"]
    input_mask2 = features["input_mask2"]
    segment_ids2 = features["segment_ids2"]
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        model_1 = BertModel(
            config=model_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        pooled = model_1.get_pooled_output()
        if is_training:
            pooled = dropout(pooled, 0.1)
        logits = tf.keras.layers.Dense(train_config.num_classes,
                                       name="cls_dense")(pooled)
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        model_2 = BertModel(
            config=model_config,
            is_training=is_training,
            input_ids=input_ids2,
            input_mask=input_mask2,
            token_type_ids=segment_ids2,
            use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        )
        pooled = model_2.get_pooled_output()
        if is_training:
            pooled = dropout(pooled, 0.1)
        conf_probs = tf.keras.layers.Dense(
            train_config.num_classes,
            name="cls_dense",
            activation=tf.keras.activations.softmax)(pooled)
        # Column 1 of the softmax is taken as the confidence score.
        confidence = conf_probs[:, 1]
    confidence_loss = 1 - confidence
    cls_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=label_ids)
    k = model_config.k
    alpha = model_config.alpha
    # Low confidence discounts the classification loss but pays a fixed
    # penalty k; apply_weighted_loss then reweights per label with alpha.
    loss_arr = cls_loss * confidence + confidence_loss * k
    loss_arr = apply_weighted_loss(loss_arr, label_ids, alpha)
    loss = tf.reduce_mean(input_tensor=loss_arr)
    tvars = tf.compat.v1.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if train_config.init_checkpoint:
        initialized_variable_names, init_fn = get_init_fn(
            train_config, tvars)
        scaffold_fn = get_tpu_scaffold_or_init(init_fn,
                                               train_config.use_tpu)
    log_var_assignments(tvars, initialized_variable_names)
    TPUEstimatorSpec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec

    def metric_fn(log_probs, label, is_real_example, confidence):
        # NOTE(review): `log_probs` actually receives raw logits (see EVAL
        # branch) — confirm classification_metric_fn only needs
        # argmax-compatible scores.
        r = classification_metric_fn(log_probs, label, is_real_example)
        r['confidence'] = tf.compat.v1.metrics.mean(confidence)
        return r

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # NOTE(review): tvars is reset to None so the optimizer falls back to
        # all trainable variables — confirm create_optimizer_from_config
        # treats None that way.
        tvars = None
        train_op = optimization.create_optimizer_from_config(
            loss, train_config, tvars)
        output_spec = TPUEstimatorSpec(mode=mode,
                                       loss=loss,
                                       train_op=train_op,
                                       scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics = (metric_fn,
                        [logits, label_ids, is_real_example, confidence])
        output_spec = TPUEstimatorSpec(mode=mode,
                                       loss=loss,
                                       eval_metrics=eval_metrics,
                                       scaffold_fn=scaffold_fn)
    else:
        predictions = {
            "input_ids": input_ids,
            "logits": logits,
            "confidence": confidence,
        }
        if "data_id" in features:
            predictions['data_id'] = features['data_id']
        output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            predictions=predictions,
            scaffold_fn=scaffold_fn)
    return output_spec