def __init__(self,
             config: bert.BertConfig,
             is_training,
             num_units,
             inputs,
             segments,
             inputs_length=None,
             answers=None,
             answers_length=None,
             layers=None):
    # Without explicit lengths, treat every position as valid.
    if inputs_length is None:
        inputs_mask = tf.ones(tf.shape(inputs), dtype=tf.bool)
    else:
        inputs_mask = tf.sequence_mask(inputs_length)
    if answers is None:
        assert answers_length is None
    # Default to attaching an answer head on the top encoder layer only.
    if layers is None:
        layers = [config.num_hidden_layers - 1]
    self.layers = layers
    self.bertmodel = bert.BertModel(config=config,
                                    is_training=is_training,
                                    input_ids=inputs,
                                    input_mask=inputs_mask,
                                    token_type_ids=segments,
                                    scope='bert')
    self._num_units = num_units
    outputs = self.bertmodel.get_all_encoder_layers()
    self.outputs = {i: outputs[i] for i in self.layers}
    # with tf.device("cpu:0"):
    # One answer-finding head per requested encoder layer.
    self.answers_finders = {
        i: AnswerFinder(num_units, name="answer_layer_" + str(i))
        for i in self.layers
    }
    self._logprobs = []
    for i, answer_finder in self.answers_finders.items():
        x = answer_finder(inputs=self.outputs[i],
                          mask=tf.cast(segments, tf.bool))
        self._logprobs.append((i, x))
    self._logprobs = dict(self._logprobs)
    self._predicts = {
        i: predict(logprob) for i, logprob in self._logprobs.items()
    }
    if answers is not None:
        if answers_length is None:
            answers_mask = tf.ones(tf.shape(answers)[0:2], dtype=tf.bool)
        else:
            answers_mask = tf.sequence_mask(answers_length)
        self._losses = {
            i: loss(self._logprobs[i], answers, answers_mask)
            for i in self.layers
        }
        self._accuracy = {
            i: accuracy(self._predicts[i], answers, answers_mask)
            for i in self.layers
        }
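# A minimal training-objective sketch (not from the original source): it assumes
# `model` is an instance of the class above, built with `answers` supplied, and
# that its per-layer losses are exposed as the `_losses` dict shown there
# (encoder-layer index -> scalar loss tensor). Averaging them trains an answer
# head on every requested layer jointly; picking a single key recovers
# single-layer training.
import tensorflow as tf


def combined_loss(model):
    # Average the per-layer losses into one scalar training objective.
    per_layer_losses = list(model._losses.values())
    return tf.add_n(per_layer_losses) / float(len(per_layer_losses))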
def _f():
    model_2 = bert.BertModel(config=config, trainable=True, name=name)
    inputs = tf.placeholder(shape=[None, None], dtype=tf.int32)
    mask = tf.placeholder(shape=[None, None], dtype=tf.int32)
    y = model_2(inputs, input_mask=mask)
    assigns = []
    variables = model_2.variables
    # Pair this model's variables with the official BERT variables by
    # lowercased name, then emit one assign op per matched pair.
    transformer_variables = sorted(
        zip((var.name.lower() for var in variables), variables),
        key=lambda t: t[0])
    off_bert_pairs = sorted(
        zip((var.name.lower() for var in gb), official_bert_variables),
        key=lambda t: t[0])
    for i in range(len(transformer_variables)):
        assigns.append(
            tf.assign(transformer_variables[i][1], off_bert_pairs[i][1]))
    return model_2, assigns
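# A minimal usage sketch (not from the original source): it assumes `_f` above
# has been called in a graph where `gb` / `official_bert_variables` already hold
# the pretrained variables, and simply runs the returned assign ops so the new
# model starts from the official BERT weights.
import tensorflow as tf

model_2, assigns = _f()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assigns)  # copy the pretrained values into model_2's variables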
def _disambiguation_layer(self, seqs):
    with tf.variable_scope('disambiguation'):
        word_embeddings = self._make_word_embeddings(seqs)
        model = bert.BertModel(self._disambiguation_bert_config,
                               self._training, word_embeddings, self._padding)
        # (batch_size, sentence_len, embedding_size)
        reps = model.get_output()
        # (batch_size, sentence_len, n_senses)
        sense_probs = self._calculate_sense_probs(seqs, reps)
        # (batch_size, sentence_len, embedding_size)
        disambiguated_reps = self._make_word_embeddings(
            seqs, sense_weights=sense_probs)
        return disambiguated_reps, sense_probs
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model."""
    model = bert.BertModel(config=bert_config,
                           is_training=is_training,
                           input_ids=input_ids,
                           input_mask=input_mask,
                           token_type_ids=segment_ids,
                           use_one_hot_embeddings=use_one_hot_embeddings)

    # In the demo, we are doing a simple classification task on the entire
    # segment.
    #
    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.
    output_layer = model.get_pooled_output()

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    # with tf.variable_scope("loss"):
    if is_training:
        # I.e., 0.1 dropout
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

    return loss, per_example_loss, logits, probabilities, predictions
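# A minimal usage sketch (not from the original source): it assumes the
# create_model function above is in scope and that `bert_config_file`,
# `max_seq_length`, and `num_labels` are supplied elsewhere. It just builds
# placeholder inputs of the shapes create_model expects and wires them through.
import tensorflow as tf
import bert

bert_config = bert.BertConfig.from_json_file(bert_config_file)
input_ids = tf.placeholder(tf.int32, [None, max_seq_length])
input_mask = tf.placeholder(tf.int32, [None, max_seq_length])
segment_ids = tf.placeholder(tf.int32, [None, max_seq_length])
labels = tf.placeholder(tf.int32, [None])

loss, per_example_loss, logits, probabilities, predictions = create_model(
    bert_config,
    is_training=True,
    input_ids=input_ids,
    input_mask=input_mask,
    segment_ids=segment_ids,
    labels=labels,
    num_labels=num_labels,
    use_one_hot_embeddings=False)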
def _f():
    config.num_hidden_layers = len(corresponding_blocks)
    model_2 = bert.BertModel(config=config, trainable=True, name=name)
    inputs = tf.placeholder(shape=[None, None], dtype=tf.int32)
    mask = tf.placeholder(shape=[None, None], dtype=tf.int32)
    y = model_2(inputs, input_mask=mask)
    assigns = []
    variables = model_2.variables

    def atoi(text):
        return int(text) if text.isdigit() else text

    # Natural sort (layer_2 before layer_10) so both variable lists line up
    # block by block.
    transformer_variables = sorted(
        zip((var.name.lower() for var in variables), variables),
        key=lambda t: [atoi(c) for c in re.split(r'(\d+)', t[0])])
    off_bert_pairs = sorted(
        zip((var.name.lower() for var in gb), official_bert_variables),
        key=lambda t: [atoi(c) for c in re.split(r'(\d+)', t[0])])

    embedding_variables = 5
    layer_variables = 16
    pooling_variables = 2

    off_bert_pairs_by_block = [off_bert_pairs[0:pooling_variables]]
    for j in range(num_blocks):
        off_bert_pairs_by_block += [
            off_bert_pairs[pooling_variables + j * layer_variables:
                           pooling_variables + (j + 1) * layer_variables]
        ]
    off_bert_pairs_by_block += [off_bert_pairs[-embedding_variables:]]

    transformer_variables_by_block = [
        transformer_variables[0:pooling_variables]
    ]
    for j in range(len(corresponding_blocks)):
        transformer_variables_by_block += [
            transformer_variables[pooling_variables + j * layer_variables:
                                  pooling_variables + (j + 1) * layer_variables]
        ]
    transformer_variables_by_block += [
        transformer_variables[-embedding_variables:]
    ]

    for j in range(len(corresponding_blocks) + 2):
        if j == 0:
            for k in range(pooling_variables):
                assigns.append(
                    tf.assign(transformer_variables_by_block[j][k][1],
                              off_bert_pairs_by_block[j][k][1]))
        elif j == len(corresponding_blocks) + 2 - 1:
            for k in range(embedding_variables):
                assigns.append(
                    tf.assign(transformer_variables_by_block[j][k][1],
                              off_bert_pairs_by_block[j][k][1]))
        else:
            for k in range(layer_variables):
                assigns.append(
                    tf.assign(
                        transformer_variables_by_block[j][k][1],
                        off_bert_pairs_by_block[corresponding_blocks[j]][k][1]))
    return model_2, assigns
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    # MTF setup.
    graph = mtf.Graph()
    # mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
    # layout_rules = mtf.convert_to_layout_rules(FLAGS.layout)

    if FLAGS.mode == "auto_parallel":
        mesh_shape_map = {
            1: [("processor_rows", 1)],
            2: [("processor_rows", 2)],
            4: [("processor_rows", 2), ("processor_cols", 2)],
            8: [("processor_rows", 2), ("processor_cols", 4)]
        }
    elif FLAGS.mode == "data_parallel":
        mesh_shape_map = {
            1: [("processor_rows", 1)],
            2: [("processor_rows", 2)],
            4: [("processor_rows", 4)],
            8: [("processor_rows", 8)]
        }
    else:
        raise ValueError

    mesh_shape = mesh_shape_map[FLAGS.gpu_num]
    devices = [f"gpu:{i}" for i in range(FLAGS.gpu_num)]
    var_placer = None
    mesh = mtf.Mesh(graph, "bert_mesh", var_placer)

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    masked_lm_positions = features["masked_lm_positions"]
    masked_lm_ids = features["masked_lm_ids"]
    masked_lm_weights = features["masked_lm_weights"]
    next_sentence_labels = tf.squeeze(features["next_sentence_labels"], 1)

    batch_size = input_ids.get_shape()[0].value
    batch_dim = mtf.Dimension("batch", batch_size)
    seq_length = input_ids.get_shape()[1].value
    seq_dim = mtf.Dimension("seq", seq_length)
    max_predictions_per_seq = masked_lm_positions.get_shape()[1].value
    max_predictions_per_seq_dim = mtf.Dimension("max_pred_seq",
                                                max_predictions_per_seq)

    mtf_input_ids = mtf.import_tf_tensor(mesh, input_ids, [batch_dim, seq_dim])
    mtf_input_mask = mtf.import_tf_tensor(mesh, input_mask,
                                          [batch_dim, seq_dim])
    mtf_segment_ids = mtf.import_tf_tensor(mesh, segment_ids,
                                           [batch_dim, seq_dim])
    mtf_masked_lm_positions = mtf.import_tf_tensor(
        mesh, masked_lm_positions, [batch_dim, max_predictions_per_seq_dim])
    mtf_masked_lm_ids = mtf.import_tf_tensor(
        mesh, masked_lm_ids, [batch_dim, max_predictions_per_seq_dim])
    mtf_masked_lm_weights = mtf.import_tf_tensor(
        mesh, masked_lm_weights, [batch_dim, max_predictions_per_seq_dim])
    mtf_next_sentence_labels = mtf.import_tf_tensor(
        mesh, next_sentence_labels, [batch_dim])

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = bert_lib.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=mtf_input_ids,
                               input_mask=mtf_input_mask,
                               token_type_ids=mtf_segment_ids,
                               mesh_shape=mesh_shape)

    (masked_lm_loss, masked_lm_example_loss,
     masked_lm_logits) = model.get_masked_lm_output(mtf_masked_lm_positions,
                                                    mtf_masked_lm_ids,
                                                    mtf_masked_lm_weights)
    (next_sentence_loss, next_sentence_example_loss,
     next_sentence_logits) = model.get_next_sentence_output(
         mtf_next_sentence_labels)

    extra_loss = model.get_extra_loss()
    total_loss = masked_lm_loss + next_sentence_loss
    total_loss = mtf.anonymize(total_loss)
    masked_lm_example_loss = mtf.anonymize(masked_lm_example_loss)
    masked_lm_logits = mtf.anonymize(masked_lm_logits)
    next_sentence_example_loss = mtf.anonymize(next_sentence_example_loss)
    next_sentence_logits = mtf.anonymize(next_sentence_logits)

    outputs = [total_loss]
    if FLAGS.mode == "auto_parallel":
        layout_rules = mtf.auto_mtf.layout(graph, mesh_shape, outputs)
    elif FLAGS.mode == "data_parallel":
        layout_rules = [('batch', 'processor_rows')]
    else:
        raise ValueError

    variables = graph._all_variables
    for v in variables:
        tf.logging.info("[parameter] (name,shape,dtype): ({},{},{})".format(
            v.name, v.shape, v.dtype.master_dtype))
    tf.logging.info("layout rules: {}".format(layout_rules))

    mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
        mesh_shape, layout_rules, devices)

    # TRAIN mode
    if mode == tf.estimator.ModeKeys.TRAIN:
        _, update_ops = optimization_lib.create_optimizer(
            total_loss + extra_loss,
            learning_rate,
            num_train_steps,
            num_warmup_steps,
            optimizer=FLAGS.optimizer,
            clip_gradients=FLAGS.clip_gradients)

    lowering = mtf.Lowering(graph, {mesh: mesh_impl})
    tf_loss = tf.to_float(lowering.export_to_tf_tensor(total_loss))

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_global_step()
        tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
        tf_update_ops.append(tf.assign_add(global_step, 1))
        tf.logging.info("tf_update_ops: {}".format(tf_update_ops))
        train_op = tf.group(tf_update_ops)

    with mtf.utils.outside_all_rewrites():
        # Copy master variables to slices. Must be called first.
        restore_hook = mtf.MtfRestoreHook(lowering)
        if mode == tf.estimator.ModeKeys.TRAIN:
            saver = tf.train.Saver(tf.global_variables(),
                                   sharded=True,
                                   max_to_keep=10,
                                   keep_checkpoint_every_n_hours=2,
                                   defer_build=False,
                                   save_relative_paths=True)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            # saver_listener = mtf.MtfCheckpointSaverListener(lowering)
            # saver_hook = tf.train.CheckpointSaverHook(
            #     FLAGS.output_dir,
            #     save_steps=1000,
            #     saver=saver,
            #     listeners=[saver_listener])
            return tf.estimator.EstimatorSpec(
                tf.estimator.ModeKeys.TRAIN,
                loss=tf_loss,
                train_op=train_op,
                training_hooks=[restore_hook])
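# A minimal usage sketch (not from the original source): the model_fn above
# returns a plain tf.estimator.EstimatorSpec, so for training it can be handed
# to tf.estimator.Estimator with an input_fn that yields the feature dict it
# reads. `output_dir`, `pretrain_input_fn`, and `num_train_steps` are assumed
# names supplied by the surrounding script.
import tensorflow as tf

estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=output_dir)
estimator.train(input_fn=pretrain_input_fn, max_steps=num_train_steps)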
print("------------------------------------") print("Prepared data...") print("Number of labels: ", len(labels)) print("Number of training examples: ", len(training_examples)) print("Number of training steps: ", num_training_steps) print("Number of evaluation examples: ", len(evaluation_examples)) print("Number of evaluation steps: ", num_evaluation_steps) print("Number of test examples: ", len(test_examples)) print("Number of test steps: ", num_test_steps) # Define BERT Model print("------------------------------------") print("Define BERT Model...") model = bert.BertModel(config=bert_config, is_training=True, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=False) # In the demo, we are doing a simple classification task on the entire segment. # If you want to use the token-level output, use model.get_sequence_output() instead. output_layer = model.get_pooled_output() hidden_size = output_layer.shape[-1].value output_weights = tf.get_variable( "output_weights", [len(labels), hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable("output_bias", [len(labels)], initializer=tf.zeros_initializer())
def _prediction_layer(self, reps):
    with tf.variable_scope('prediction'):
        model = bert.BertModel(self._prediction_bert_config, self._training,
                               reps, self._padding)
        return model.get_output()
def create_model(bert_config, is_training, use_pcnn, input_ids, input_mask,
                 head_ids, tail_ids, num_labels, use_one_hot_embeddings,
                 segment_mask, position1, position2):
    model = bert.BertModel(config=bert_config,
                           is_training=is_training,
                           input_ids=input_ids,
                           input_mask=input_mask,
                           head_ids=head_ids,
                           tail_ids=tail_ids,
                           use_one_hot_embeddings=use_one_hot_embeddings)

    output_layer = model.get_sequence_output()
    head_embedding, neg_head_embedding = model.get_head_embedding()
    tail_embedding, neg_tail_embedding = model.get_tail_embedding()

    pos_embedding = network.pos_embedding(
        position1,
        position2,
        pos_embedding_dim=FLAGS.pos_embedding_dim,
        max_length=FLAGS.max_seq_length)
    output_layer = tf.concat([output_layer, pos_embedding], -1)

    # [batch_size, hidden_size]
    sentence_embedding = tf.layers.conv1d(
        inputs=output_layer,
        filters=bert_config.hidden_size,
        kernel_size=3,
        strides=1,
        padding="same",
        kernel_initializer=tf.contrib.layers.xavier_initializer())

    if use_pcnn:
        mask_embedding = tf.constant(
            [[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
        mask = tf.nn.embedding_lookup(mask_embedding, segment_mask)
        sentence_embedding = tf.reduce_max(
            tf.expand_dims(mask * 100, 2) + tf.expand_dims(sentence_embedding, 3),
            axis=1) - 100
        return tf.reshape(sentence_embedding,
                          [-1, bert_config.hidden_size * 3])
    else:
        sentence_embedding = tf.reduce_max(sentence_embedding, axis=-2)

    # sentence_embedding = network.encoder(output_layer, segment_mask,
    #                                      bert_config.hidden_size, use_pcnn)

    output_weights = tf.get_variable(
        "output_weights", [num_labels, bert_config.hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            sentence_embedding = tf.nn.dropout(sentence_embedding, keep_prob=0.9)

        positive = tf.add(head_embedding, tail_embedding)
        positive = abs(tf.add(positive, -sentence_embedding))
        positive = tf.reduce_sum(positive, axis=1, keep_dims=True)
        negative = tf.add(neg_head_embedding, neg_tail_embedding)
        negative = abs(tf.add(negative, -sentence_embedding))
        negative = tf.reduce_sum(negative, axis=1, keep_dims=True)
        per_trans_loss = tf.maximum(positive - negative + FLAGS.margin, 0)
        total_trans_loss = tf.reduce_mean(per_trans_loss)

        logits = tf.matmul(sentence_embedding, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits)

        return per_trans_loss, total_trans_loss, logits, probabilities
import tensorflow as tf

import bert
import HP

if __name__ == "__main__":
    bert_config = bert.BertConfig.from_json_file(HP.bert_config)
    # Token ids must be integers for the embedding lookup inside BertModel.
    x = tf.placeholder(dtype=tf.int32, shape=[1, 1])
    model = bert.BertModel(bert_config, False, x, scope='bert')

    config = tf.ConfigProto(device_count={'GPU': 0})
    sess = tf.Session(config=config)

    saver1 = tf.train.Saver()
    saver2 = tf.train.Saver({v.name: v for v in tf.global_variables()})
    saver1.restore(sess, HP.start1_checkpoint)
    saver2.save(sess, HP.start1_checkpoint)