def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 use_one_hot_embeddings):
  """Creates a span-prediction (SQuAD-style) model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  final_hidden = model.get_sequence_output()

  final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
  batch_size = final_hidden_shape[0]
  seq_length = final_hidden_shape[1]
  hidden_size = final_hidden_shape[2]

  output_weights = tf.get_variable(
      "cls/squad/output_weights", [2, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "cls/squad/output_bias", [2], initializer=tf.zeros_initializer())

  final_hidden_matrix = tf.reshape(final_hidden,
                                   [batch_size * seq_length, hidden_size])
  logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)
  logits = tf.nn.bias_add(logits, output_bias)

  logits = tf.reshape(logits, [batch_size, seq_length, 2])
  logits = tf.transpose(logits, [2, 0, 1])

  unstacked_logits = tf.unstack(logits, axis=0)
  (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])

  return (start_logits, end_logits)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  """Creates a token-level classification (sequence labeling) model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  output_layer = model.get_sequence_output()
  hidden_size = output_layer.shape[-1].value

  output_weight = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
    output_layer = tf.reshape(output_layer, [-1, hidden_size])
    logits = tf.matmul(output_layer, output_weight, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    # Reshape back to [batch, seq_len, num_labels]; use num_labels rather than
    # a hard-coded label count so this matches the projection above.
    logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels])
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_sum(per_example_loss)
    probabilities = tf.nn.softmax(logits, axis=-1)
    predict = tf.argmax(probabilities, axis=-1)
    return (loss, per_example_loss, logits, predict)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings, pos_weight=None):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # In the demo, we are doing a simple classification task on the entire
  # segment.
  #
  # If you want to use the token-level output, use model.get_sequence_output()
  # instead.
  output_layer = model.get_pooled_output()

  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)
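# A minimal usage sketch for the create_model() above. It assumes TensorFlow
# 1.x (`tf`) and the BERT repo's modeling.py are imported as in the
# surrounding snippets; the config path, sequence length, number of labels,
# and the helper name `build_classifier_graph` are illustrative assumptions,
# not values from the original source.
def build_classifier_graph(config_file="bert_config.json", max_seq_length=128,
                           num_labels=2):
  bert_config = modeling.BertConfig.from_json_file(config_file)
  # Placeholders for a padded batch of tokenized examples.
  input_ids = tf.placeholder(tf.int32, [None, max_seq_length], name="input_ids")
  input_mask = tf.placeholder(tf.int32, [None, max_seq_length], name="input_mask")
  segment_ids = tf.placeholder(tf.int32, [None, max_seq_length], name="segment_ids")
  label_ids = tf.placeholder(tf.int32, [None], name="label_ids")
  # Returns (loss, per_example_loss, logits, probabilities).
  return create_model(bert_config, is_training=True, input_ids=input_ids,
                      input_mask=input_mask, segment_ids=segment_ids,
                      labels=label_ids, num_labels=num_labels,
                      use_one_hot_embeddings=False)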
def init(max_sequence_length, bert_config_file, model_path, vocab_file):
  sess = tf.Session()
  tokenizer = tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=True)
  bert_config = modeling.BertConfig.from_json_file(bert_config_file)

  input_ids = tf.placeholder(
      tf.int32, shape=[None, max_sequence_length], name='input_ids')
  input_mask = tf.placeholder(
      tf.int32, shape=[None, max_sequence_length], name='input_mask')
  segment_ids = tf.placeholder(
      tf.int32, shape=[None, max_sequence_length], name='segment_ids')

  with sess.as_default():
    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=False)

    output_layer = model.get_pooled_output()

    with tf.variable_scope("cls/seq_relationship"):
      output_weights = tf.get_variable(
          "output_weights",
          shape=[2, bert_config.hidden_size],
          initializer=modeling.create_initializer(
              bert_config.initializer_range))
      output_bias = tf.get_variable(
          "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probs = tf.nn.softmax(logits, axis=-1, name='probs')

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, model_path)

  return sess, tokenizer
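# A hedged sketch of how the init() above might be used for next-sentence
# scoring. Only the tensor names ('input_ids', 'input_mask', 'segment_ids',
# 'probs') come from the graph built in init(); the helper name, the
# "[CLS]"/"[SEP]" feature preparation and the padding logic are the usual BERT
# conventions and are assumptions here (truncation handling is omitted).
def score_next_sentence(sess, tokenizer, text_a, text_b, max_sequence_length=128):
  tokens_a = tokenizer.tokenize(text_a)
  tokens_b = tokenizer.tokenize(text_b)
  tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
  segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  input_mask = [1] * len(input_ids)
  # Zero-pad up to the fixed length expected by the placeholders.
  pad = max_sequence_length - len(input_ids)
  input_ids += [0] * pad
  input_mask += [0] * pad
  segment_ids += [0] * pad
  probs = sess.run('probs:0', feed_dict={
      'input_ids:0': [input_ids],
      'input_mask:0': [input_mask],
      'segment_ids:0': [segment_ids]})
  # Under the standard BERT convention, index 0 is "is next sentence".
  return probs[0]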
def model_fn(features, labels, mode, params): unique_id = features["unique_id"] input_ids = features["input_ids"] input_mask = features["input_mask"] input_type_ids = features["input_type_ids"] tokens = features["tokens"] model = modeling.BertModel(config=bert_config, is_training=False, input_ids=input_ids, input_mask=input_mask, token_type_ids=input_type_ids, use_one_hot_embeddings=False) if mode != tf.estimator.ModeKeys.PREDICT: raise ValueError("Only PREDICT modes are supported: %s" % (mode)) tvars = tf.trainable_variables() scaffold_fn = None (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) tf.train.init_from_checkpoint(init_checkpoint, assignment_map) all_layers = model.get_all_encoder_layers() predictions = { "unique_id": unique_id, "tokens": tokens, } for (i, layer_index) in enumerate(layer_indexes): predictions["layer_output_%d" % i] = all_layers[layer_index] output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) return output_spec
def Convert2BertPb(args):
  output_op_names = args.output_ops.split(',')
  pathname = os.path.join(args.dir, "bert_model.ckpt")  # path to the checkpoint
  bert_config = modeling.BertConfig.from_json_file(
      os.path.join(args.dir, "bert_config.json"))  # path to the config file

  configsession = tf.ConfigProto()
  configsession.gpu_options.allow_growth = True
  sess = tf.Session(config=configsession)

  input_ids = tf.placeholder(
      shape=[None, args.seq_len], dtype=tf.int32, name="input_ids")
  input_mask = tf.placeholder(
      shape=[None, args.seq_len], dtype=tf.int32, name="input_mask")
  segment_ids = tf.placeholder(
      shape=[None, args.seq_len], dtype=tf.int32, name="segment_ids")

  with sess.as_default():
    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=True)
    saver = tf.train.Saver()
    # Note: initialize the variables first, then restore the checkpoint;
    # otherwise the restored BERT weights would be overwritten by the
    # re-initialization. This is the difference from demo1.
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, pathname)
    # frozen_graph = freeze_session(sess, output_names=['bert/encoder/Reshape_3'])
    frozen_graph = freeze_session(sess, output_names=output_op_names)
    # Save
    tf.train.write_graph(frozen_graph, ".", args.out_file, as_text=False)
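# A hedged invocation sketch for Convert2BertPb(): it only assumes the
# attributes referenced above (dir, seq_len, output_ops, out_file) arrive via
# an argparse-style namespace. The directory, output op name, and file name
# below are illustrative placeholders, not values from the original source.
from argparse import Namespace

args = Namespace(
    dir="chinese_L-12_H-768_A-12",        # folder with bert_model.ckpt / bert_config.json
    seq_len=128,                          # fixed sequence length of the exported graph
    output_ops="bert/pooler/dense/Tanh",  # comma-separated output op names to freeze
    out_file="bert_frozen.pb")            # GraphDef file written relative to "."
Convert2BertPb(args)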
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 num_tags, osentences_len):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids)

  # In the demo, we are doing a simple classification task on the entire
  # segment.
  #
  # If you want to use the token-level output, use model.get_sequence_output()
  # instead.
  # output_layer = model.get_pooled_output()
  output_layer = model.get_sequence_output()

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    _, sentence_len, _ = output_layer.shape.as_list()
    # Ignore the [CLS] token at the head of the sentence.
    output_layer = output_layer[:, 1:, :]

    # FC layer
    logits = tf.layers.dense(output_layer, num_tags)
    # CRF layer
    crf_params = tf.get_variable(
        name='crf', shape=[num_tags, num_tags], dtype=tf.float32)
    pred_id, _ = tf.contrib.crf.crf_decode(logits, crf_params, osentences_len)

  return logits, crf_params, pred_id, sentence_len
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # Classification task on the entire segment.
  output_layer = model.get_pooled_output()

  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probabilities = tf.nn.sigmoid(logits)

    sigmoid_cross_entropy_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.cast(labels, tf.float32), logits=logits)
    per_example_loss = tf.reduce_sum(sigmoid_cross_entropy_loss, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)
def create_model(bert_config, is_training, input_ids, mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  output_layer = model.get_sequence_output()
  '''
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.contrib.layers.xavier_initializer)
  '''
  # output_layer shape is [batch_size, seq_length, hidden_size]
  if is_training:
    output_layer = tf.keras.layers.Dropout(rate=0.1)(output_layer)

  logits = hidden2tag(output_layer, num_labels)
  # TODO: test shape
  logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels])
  '''
  logits = tf.matmul(output_layer, output_weights, transpose_b=True)
  logits = tf.nn.bias_add(logits, output_bias)
  '''
  if FLAGS.crf:
    mask2len = tf.reduce_sum(mask, axis=1)
    loss, trans = crf_loss(logits, labels, mask, num_labels, mask2len)
    predict, viterbi_score = tf.contrib.crf.crf_decode(logits, trans, mask2len)
    return (loss, logits, predict)
  else:
    loss, predict = softmax_layer(logits, labels, num_labels, mask)
    return (loss, logits, predict)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

def create_int_feature(values):
  feature = tf.train.Feature(
      int64_list=tf.train.Int64List(value=list(values)))
  return feature

print(input_ids)

bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
model = modeling.BertModel(
    config=bert_config,  # presumably BERT is built according to this config
    is_training=False,
    input_ids=tf.constant([input_ids, input_ids]),
    input_mask=tf.constant([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]),
    token_type_ids=tf.constant([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]),
    use_one_hot_embeddings=True)

final_hidden = model.get_sequence_output()
eout = model.get_embedding_output()
pout = model.get_pooled_output()
etab = model.get_embedding_table()

# tf.initialize_all_variables() is deprecated; use the global variant.
init = tf.global_variables_initializer()
with tf.Session() as sess:
  sess.run(init)
  # [works] result = sess.run(final_hidden)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  # Example values of the arguments:
  #   features: {'input_ids': <tf.Tensor 'IteratorGetNext:0' shape=(8, 128) dtype=int32>,
  #              'input_mask': <tf.Tensor 'IteratorGetNext:1' shape=(8, 128) dtype=int32>,
  #              'masked_lm_ids': <tf.Tensor 'IteratorGetNext:2' shape=(8, 20) dtype=int32>,
  #              'masked_lm_positions': <tf.Tensor 'IteratorGetNext:3' shape=(8, 20) dtype=int32>,
  #              'masked_lm_weights': <tf.Tensor 'IteratorGetNext:4' shape=(8, 20) dtype=float32>,
  #              'next_sentence_labels': <tf.Tensor 'IteratorGetNext:5' shape=(8, 1) dtype=int32>,
  #              'segment_ids': <tf.Tensor 'IteratorGetNext:6' shape=(8, 128) dtype=int32>}
  #   labels: None
  #   mode: eval
  #   params: {'batch_size': 8, 'use_tpu': False}
  tf.logging.info("*** Features ***")
  for name in sorted(features.keys()):
    tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

  # input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  segment_ids = features["segment_ids"]
  masked_lm_positions = features["masked_lm_positions"]
  masked_lm_ids = features["masked_lm_ids"]
  masked_lm_weights = features["masked_lm_weights"]
  next_sentence_labels = features["next_sentence_labels"]
  hist_len = features["hist_len"]
  input_ids = features["input_ids"]
  next_item = features['next_item']

  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings,
      hist_len=hist_len,
      next_item=next_item)

  (masked_lm_loss, masked_lm_example_loss,
   masked_lm_log_probs) = get_masked_lm_output(
       bert_config, model.get_sequence_output(), model.get_embedding_table(),
       masked_lm_positions, masked_lm_ids, masked_lm_weights)

  # Changed on 3/1.
  (next_sentence_loss, next_sentence_example_loss,
   next_sentence_log_probs) = get_next_item_output(
       bert_config, model.get_decoder_layer(), next_sentence_labels, hist_len)

  total_loss = masked_lm_loss

  tvars = tf.trainable_variables()
  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    (assignment_map, initialized_variable_names
    ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    if use_tpu:

      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  tf.logging.info("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                    init_string)

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op = optimization.create_optimizer(total_loss, learning_rate,
                                             num_train_steps, num_warmup_steps,
                                             use_tpu)
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn)
  elif mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                  masked_lm_weights, next_sentence_example_loss,
                  next_sentence_log_probs, next_sentence_labels):
      """Computes the loss and accuracy of the model."""
      masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                       [-1, masked_lm_log_probs.shape[-1]])
      masked_lm_predictions = tf.argmax(
          masked_lm_log_probs, axis=-1, output_type=tf.int32)
      masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
      masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
      masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
      masked_lm_accuracy = tf.metrics.accuracy(
          labels=masked_lm_ids,
          predictions=masked_lm_predictions,
          weights=masked_lm_weights)
      masked_lm_mean_loss = tf.metrics.mean(
          values=masked_lm_example_loss, weights=masked_lm_weights)

      next_sentence_log_probs = tf.reshape(
          next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
      next_sentence_predictions = tf.argmax(
          next_sentence_log_probs, axis=-1, output_type=tf.int32)
      next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
      next_sentence_accuracy = tf.metrics.accuracy(
          labels=next_sentence_labels, predictions=next_sentence_predictions)
      next_sentence_mean_loss = tf.metrics.mean(
          values=next_sentence_example_loss)

      return {
          "masked_lm_accuracy": masked_lm_accuracy,
          "masked_lm_loss": masked_lm_mean_loss,
          "next_sentence_accuracy": next_sentence_accuracy,
          "next_sentence_loss": next_sentence_mean_loss,
      }

    eval_metrics = (metric_fn, [
        masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
        masked_lm_weights, next_sentence_example_loss,
        next_sentence_log_probs, next_sentence_labels
    ])
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn)
  else:
    raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

  return output_spec
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] word_ids = features["word_ids"] mention_ids = features["mention_id"] random_mask = tf.random_uniform(input_ids.shape) masked_lm_positions = tf.cast(random_mask < FLAGS.mask_lm_rate, tf.int32) masked_lm_positions *= word_ids masked_lm_input_ids = masked_lm_positions * FLAGS.mask_word_id + ( 1 - masked_lm_positions) * input_ids masked_lm_weights = masked_lm_positions masked_lm_ids = input_ids is_training = (mode == tf.estimator.ModeKeys.TRAIN) model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=masked_lm_input_ids, input_mask=input_mask, token_type_ids=segment_ids, mention_ids=mention_ids, use_one_hot_embeddings=use_one_hot_embeddings) (masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( bert_config, model.get_sequence_output(), model.get_embedding_table(), masked_lm_positions, input_ids, masked_lm_weights) total_loss = masked_lm_loss tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = bert.modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) # if use_tpu: # def tpu_scaffold(): # tf.train.init_from_checkpoint(init_checkpoint, assignment_map) # return tf.train.Scaffold() # scaffold_fn = tpu_scaffold # else: # tf.train.init_from_checkpoint(init_checkpoint, assignment_map) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) if FLAGS.max_seq_length > 1024: pass tf.train.init_from_checkpoint( init_checkpoint, { "bert/embeddings/position_embeddings": "bert/embeddings/position_embeddings_first" }) tf.train.init_from_checkpoint( init_checkpoint, { "bert/embeddings/position_embeddings": "bert/embeddings/position_embeddings_second" }) tf.train.init_from_checkpoint( init_checkpoint, { "bert/embeddings/position_embeddings": "bert/embeddings/position_embeddings_third" }) elif FLAGS.max_seq_length > 512: pass tf.train.init_from_checkpoint( init_checkpoint, { "bert/embeddings/position_embeddings": "bert/embeddings/position_embeddings_former" }) tf.train.init_from_checkpoint( init_checkpoint, { "bert/embeddings/position_embeddings": "bert/embeddings/position_embeddings_latter" }) else: pass tf.train.init_from_checkpoint( init_checkpoint, { "bert/embeddings/position_embeddings": "bert/embeddings/position_embeddings" }) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) if FLAGS.max_seq_length > 1024: pass tf.train.init_from_checkpoint( init_checkpoint, { "bert/embeddings/position_embeddings": "bert/embeddings/position_embeddings_first" }) tf.train.init_from_checkpoint( init_checkpoint, { "bert/embeddings/position_embeddings": "bert/embeddings/position_embeddings_second" }) tf.train.init_from_checkpoint( init_checkpoint, { "bert/embeddings/position_embeddings": "bert/embeddings/position_embeddings_third" }) if FLAGS.max_seq_length > 512: pass tf.train.init_from_checkpoint( init_checkpoint, { "bert/embeddings/position_embeddings": "bert/embeddings/position_embeddings_former" }) tf.train.init_from_checkpoint( init_checkpoint, { 
"bert/embeddings/position_embeddings": "bert/embeddings/position_embeddings_latter" }) else: pass tf.train.init_from_checkpoint( init_checkpoint, { "bert/embeddings/position_embeddings": "bert/embeddings/position_embeddings" }) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights): """Computes the loss and accuracy of the model.""" masked_lm_log_probs = tf.reshape( masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]]) masked_lm_predictions = tf.argmax(masked_lm_log_probs, axis=-1, output_type=tf.int32) masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) masked_lm_accuracy = tf.metrics.accuracy( labels=masked_lm_ids, predictions=masked_lm_predictions, weights=masked_lm_weights) masked_lm_mean_loss = tf.metrics.mean( values=masked_lm_example_loss, weights=masked_lm_weights) return { "masked_lm_accuracy": masked_lm_accuracy, "masked_lm_loss": masked_lm_mean_loss, } eval_metrics = (metric_fn, [ masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights ]) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn) else: raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) return output_spec
def main(): processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, } if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_print_test: raise ValueError("At least one of `do_train` or `do_eval` " "or `do_print_test` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) if not os.path.isdir(FLAGS.output_dir): os.makedirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) train_examples = None num_train_steps = None num_warmup_steps = None # TODO: use special Adam from "optimization.py" if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) bert = modeling.BertModel(config=bert_config) model = modeling.BertClassifier(bert, num_labels=len(label_list)) chainer.serializers.load_npz(FLAGS.init_checkpoint, model, ignore_names=['output/W', 'output/b']) if FLAGS.gpu >= 0: chainer.backends.cuda.get_device_from_id(FLAGS.gpu).use() model.to_gpu() if FLAGS.do_train: # TODO: use special Adam from "optimization.py" optimizer = chainer.optimizers.Adam(alpha=FLAGS.learning_rate) optimizer.setup(model) train_iter = chainer.iterators.SerialIterator(train_examples, FLAGS.train_batch_size) converter = Converter(label_list, FLAGS.max_seq_length, tokenizer) updater = training.updaters.StandardUpdater(train_iter, optimizer, converter=converter, device=FLAGS.gpu) trainer = training.Trainer(updater, (num_train_steps, 'iteration'), out=FLAGS.output_dir) trainer.extend(extensions.snapshot(), trigger=(num_train_steps, 'iteration')) trainer.extend(extensions.LogReport(trigger=(50, 'iteration'))) trainer.extend( extensions.PrintReport( ['iteration', 'main/loss', 'main/accuracy', 'elapsed_time'])) trainer.extend(extensions.ProgressBar(update_interval=10)) trainer.run() if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) test_iter = chainer.iterators.SerialIterator(eval_examples, FLAGS.train_batch_size * 2, repeat=False, shuffle=False) converter = Converter(label_list, FLAGS.max_seq_length, tokenizer) evaluator = extensions.Evaluator(test_iter, model, converter=converter, device=FLAGS.gpu) results = evaluator() print(results) # if you wanna see some output arrays for debugging if FLAGS.do_print_test: short_eval_examples = processor.get_dev_examples(FLAGS.data_dir)[:3] short_eval_examples = short_eval_examples[:FLAGS.eval_batch_size] short_test_iter = chainer.iterators.SerialIterator( short_eval_examples, FLAGS.eval_batch_size, repeat=False, shuffle=False) converter = Converter(label_list, FLAGS.max_seq_length, tokenizer) evaluator = extensions.Evaluator(test_iter, model, converter=converter, device=FLAGS.gpu) with chainer.using_config('train', False): with chainer.no_backprop_mode(): data = short_test_iter.__next__() out = model.bert.get_pooled_output( *converter(data, FLAGS.gpu)[:-1]) print(out) print(out.shape) print(converter(data, -1))
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info('*** Features ***') for name in sorted(features.keys()): tf.logging.info(' name = %s, shape = %s' % (name, features[name].shape)) input_ids = features['input_ids'] input_mask = features['input_mask'] segment_ids = features['segment_ids'] masked_lm_positions = features['masked_lm_positions'] masked_lm_ids = features['masked_lm_ids'] masked_lm_weights = features['masked_lm_weights'] next_sentence_labels = features['next_sentence_labels'] is_training = mode == tf.estimator.ModeKeys.TRAIN model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, ) ( masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs, ) = get_masked_lm_output( bert_config, model.get_sequence_output(), model.get_embedding_table(), masked_lm_positions, masked_lm_ids, masked_lm_weights, ) ( next_sentence_loss, next_sentence_example_loss, next_sentence_log_probs, ) = get_next_sentence_output(bert_config, model.get_pooled_output(), next_sentence_labels) total_loss = masked_lm_loss + next_sentence_loss tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: ( assignment_map, initialized_variable_names, ) = modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info('**** Trainable Variables ****') for var in tvars: init_string = '' if var.name in initialized_variable_names: init_string = ', *INIT_FROM_CKPT*' tf.logging.info(' name = %s, shape = %s%s', var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: # train_op = custom_optimization.create_optimizer( # total_loss, learning_rate, num_train_steps, num_warmup_steps # ) train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu, ) else: train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu, ) if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, scaffold=scaffold_fn, ) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn, ) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn( masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights, next_sentence_example_loss, next_sentence_log_probs, next_sentence_labels, ): """Computes the loss and accuracy of the model.""" masked_lm_log_probs = tf.reshape( masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]]) masked_lm_predictions = tf.argmax(masked_lm_log_probs, axis=-1, output_type=tf.int32) masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) masked_lm_accuracy = tf.metrics.accuracy( labels=masked_lm_ids, predictions=masked_lm_predictions, weights=masked_lm_weights, ) masked_lm_mean_loss = tf.metrics.mean( values=masked_lm_example_loss, weights=masked_lm_weights) next_sentence_log_probs = 
tf.reshape( next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]], ) next_sentence_predictions = tf.argmax(next_sentence_log_probs, axis=-1, output_type=tf.int32) next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) next_sentence_accuracy = tf.metrics.accuracy( labels=next_sentence_labels, predictions=next_sentence_predictions, ) next_sentence_mean_loss = tf.metrics.mean( values=next_sentence_example_loss) return { 'masked_lm_accuracy': masked_lm_accuracy, 'masked_lm_loss': masked_lm_mean_loss, 'next_sentence_accuracy': next_sentence_accuracy, 'next_sentence_loss': next_sentence_mean_loss, } eval_metrics = ( metric_fn, [ masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights, next_sentence_example_loss, next_sentence_log_probs, next_sentence_labels, ], ) if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, eval_metrics=eval_metrics, scaffold=scaffold_fn, ) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn, ) else: raise ValueError('Only TRAIN and EVAL modes are supported: %s' % (mode)) return output_spec
def optimize_graph(logger=None, verbose=False):
  if not logger:
    logger = set_logger(colored('BERT_VEC', 'yellow'), verbose)
  try:
    # we don't need GPU for optimizing the graph
    from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference
    tf.gfile.MakeDirs(args.output_dir)

    config_fp = args.config_name
    logger.info('model config: %s' % config_fp)

    # Load the BERT config file.
    with tf.gfile.GFile(config_fp, 'r') as f:
      bert_config = modeling.BertConfig.from_dict(json.load(f))

    logger.info('build graph...')
    # input placeholders, not sure if they are friendly to XLA
    input_ids = tf.placeholder(tf.int32, (None, args.max_seq_len), 'input_ids')
    input_mask = tf.placeholder(tf.int32, (None, args.max_seq_len), 'input_mask')
    input_type_ids = tf.placeholder(tf.int32, (None, args.max_seq_len),
                                    'input_type_ids')

    jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
    with jit_scope():
      input_tensors = [input_ids, input_mask, input_type_ids]
      model = modeling.BertModel(
          config=bert_config,
          is_training=False,
          input_ids=input_ids,
          input_mask=input_mask,
          token_type_ids=input_type_ids,
          use_one_hot_embeddings=False)

      # Collect all trainable variables.
      tvars = tf.trainable_variables()
      init_checkpoint = args.ckpt_name
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

      # Pool over the selected encoder layers.
      with tf.variable_scope("pooling"):
        # If only one layer is requested, take just that layer's output.
        if len(args.layer_indexes) == 1:
          encoder_layer = model.all_encoder_layers[args.layer_indexes[0]]
        else:
          # Otherwise gather all requested layers and concatenate them along
          # the last axis; the resulting width is 768 * number of layers.
          all_layers = [
              model.all_encoder_layers[l] for l in args.layer_indexes
          ]
          encoder_layer = tf.concat(all_layers, -1)

        mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
        masked_reduce_mean = lambda x, m: tf.reduce_sum(
            mul_mask(x, m), axis=1) / (tf.reduce_sum(
                m, axis=1, keepdims=True) + 1e-10)

        input_mask = tf.cast(input_mask, tf.float32)
        # The sentence vector below is a masked mean over the token vectors:
        # weight each token by input_mask, sum over the sequence, and divide
        # by the number of real tokens.
        pooled = masked_reduce_mean(encoder_layer, input_mask)
        pooled = tf.identity(pooled, 'final_encodes')

      output_tensors = [pooled]
      tmp_g = tf.get_default_graph().as_graph_def()

    # allow_soft_placement: automatically choose the device to run on.
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
      logger.info('load parameters from checkpoint...')
      sess.run(tf.global_variables_initializer())
      logger.info('freeze...')
      tmp_g = tf.graph_util.convert_variables_to_constants(
          sess, tmp_g, [n.name[:-2] for n in output_tensors])
      dtypes = [n.dtype for n in input_tensors]
      logger.info('optimize...')
      tmp_g = optimize_for_inference(
          tmp_g, [n.name[:-2] for n in input_tensors],
          [n.name[:-2] for n in output_tensors],
          [dtype.as_datatype_enum for dtype in dtypes], False)
    tmp_file = tempfile.NamedTemporaryFile(
        'w', delete=False, dir=args.output_dir).name
    logger.info('write graph to a tmp file: %s' % tmp_file)
    with tf.gfile.GFile(tmp_file, 'wb') as f:
      f.write(tmp_g.SerializeToString())
    return tmp_file
  except Exception as e:
    logger.error('fail to optimize the graph!')
    logger.error(e)
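# A hedged sketch of consuming the frozen graph written by optimize_graph().
# It relies only on the tensor names created above ('input_ids', 'input_mask',
# 'input_type_ids', 'final_encodes'); the helper name and the assumption that
# the inputs are already padded int32 batches of shape [batch, max_seq_len]
# are illustrative.
def encode_with_frozen_graph(pb_path, input_ids, input_mask, input_type_ids):
  graph_def = tf.GraphDef()
  with tf.gfile.GFile(pb_path, 'rb') as f:
    graph_def.ParseFromString(f.read())
  with tf.Graph().as_default() as g:
    tf.import_graph_def(graph_def, name='')
    with tf.Session(graph=g) as sess:
      # Returns the pooled sentence vectors produced by 'final_encodes'.
      return sess.run('final_encodes:0', feed_dict={
          'input_ids:0': input_ids,
          'input_mask:0': input_mask,
          'input_type_ids:0': input_type_ids})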
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] masked_lm_positions = features["masked_lm_positions"] masked_lm_ids = features["masked_lm_ids"] masked_lm_weights = features["masked_lm_weights"] next_sentence_labels = features["next_sentence_labels"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float16 if FLAGS.manual_fp16 else tf.float32) (masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( bert_config, model.get_sequence_output(), model.get_embedding_table(), masked_lm_positions, masked_lm_ids, masked_lm_weights) (next_sentence_loss, next_sentence_example_loss, next_sentence_log_probs) = get_next_sentence_output( bert_config, model.get_pooled_output(), next_sentence_labels) masked_lm_loss = tf.identity(masked_lm_loss, name="mlm_loss") next_sentence_loss = tf.identity(next_sentence_loss, name="nsp_loss") total_loss = masked_lm_loss + next_sentence_loss total_loss = tf.identity(total_loss, name='total_loss') tvars = tf.trainable_variables() initialized_variable_names = {} if init_checkpoint and (hvd is None or hvd.rank() == 0): (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" %d :: name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, FLAGS.manual_fp16, FLAGS.use_fp16) output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, train_op=train_op) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights, next_sentence_example_loss, next_sentence_log_probs, next_sentence_labels): """Computes the loss and accuracy of the model.""" masked_lm_log_probs = tf.reshape(masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]]) masked_lm_predictions = tf.argmax( masked_lm_log_probs, axis=-1, output_type=tf.int32) masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) masked_lm_accuracy = tf.metrics.accuracy( labels=masked_lm_ids, predictions=masked_lm_predictions, weights=masked_lm_weights) masked_lm_mean_loss = tf.metrics.mean( values=masked_lm_example_loss, weights=masked_lm_weights) next_sentence_log_probs = tf.reshape( next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) next_sentence_predictions = tf.argmax( next_sentence_log_probs, axis=-1, output_type=tf.int32) next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) next_sentence_accuracy = tf.metrics.accuracy( labels=next_sentence_labels, 
predictions=next_sentence_predictions) next_sentence_mean_loss = tf.metrics.mean( values=next_sentence_example_loss) return { "masked_lm_accuracy": masked_lm_accuracy, "masked_lm_loss": masked_lm_mean_loss, "next_sentence_accuracy": next_sentence_accuracy, "next_sentence_loss": next_sentence_mean_loss, } eval_metric_ops = metric_fn( masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights, next_sentence_example_loss, next_sentence_log_probs, next_sentence_labels ) output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, eval_metric_ops=eval_metric_ops) else: raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) return output_spec
def create_model(bert_config, is_training, slot_list, features, num_class_labels, use_one_hot_embeddings): """Creates a classification model.""" input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] model = modeling.BertModel(config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) # In the demo, we are doing a simple classification task on the entire # segment. # # If you want to use the token-level output, use model.get_sequence_output() # instead. class_output_layer = model.get_pooled_output() token_output_layer = model.get_sequence_output() token_output_shape = modeling.get_shape_list(token_output_layer, expected_rank=3) batch_size = token_output_shape[0] seq_length = token_output_shape[1] hidden_size = token_output_shape[2] # Define prediction variables class_proj_layer_dim = [hidden_size] for idx in range(FLAGS.num_class_hidden_layer): class_proj_layer_dim.append(64) class_proj_layer_dim.append(num_class_labels) token_proj_layer_dim = [hidden_size] for idx in range(FLAGS.num_token_hidden_layer): token_proj_layer_dim.append(64) token_proj_layer_dim.append(2) if is_training: # I.e., 0.1 dropout class_output_layer = tf.nn.dropout(class_output_layer, keep_prob=(1 - FLAGS.dropout_rate)) token_output_layer = tf.nn.dropout(token_output_layer, keep_prob=(1 - FLAGS.dropout_rate)) total_loss = 0 per_slot_per_example_loss = {} per_slot_class_logits = {} per_slot_start_logits = {} per_slot_end_logits = {} for slot in slot_list: start_pos = features["start_pos_%s" % slot] end_pos = features["end_pos_%s" % slot] class_label_id = features["class_label_id_%s" % slot] slot_scope_name = "slot_%s" % slot if slot == 'price range': slot_scope_name = "slot_price" with tf.variable_scope(slot_scope_name): class_list_output_weights = [] class_list_output_bias = [] for l_idx in range(len(class_proj_layer_dim) - 1): dim_in = class_proj_layer_dim[l_idx] dim_out = class_proj_layer_dim[l_idx + 1] class_list_output_weights.append( tf.get_variable( "class/output_weights_%d" % l_idx, [dim_in, dim_out], initializer=tf.truncated_normal_initializer( stddev=0.02))) class_list_output_bias.append( tf.get_variable("class/output_bias_%d" % l_idx, [dim_out], initializer=tf.zeros_initializer())) token_list_output_weights = [] token_list_output_bias = [] for l_idx in range(len(token_proj_layer_dim) - 1): dim_in = token_proj_layer_dim[l_idx] dim_out = token_proj_layer_dim[l_idx + 1] token_list_output_weights.append( tf.get_variable( "token/output_weights_%d" % l_idx, [dim_in, dim_out], initializer=tf.truncated_normal_initializer( stddev=0.02))) token_list_output_bias.append( tf.get_variable("token/output_bias_%d" % l_idx, [dim_out], initializer=tf.zeros_initializer())) with tf.variable_scope("loss"): class_logits = util.fully_connect_layers( class_output_layer, class_list_output_weights, class_list_output_bias) one_hot_class_labels = tf.one_hot(class_label_id, depth=num_class_labels, dtype=tf.float32) class_loss = tf.losses.softmax_cross_entropy( one_hot_class_labels, class_logits, reduction=tf.losses.Reduction.NONE) token_is_pointable = tf.cast(tf.equal(class_label_id, 2), dtype=tf.float32) token_output_layer = tf.reshape( token_output_layer, [batch_size * seq_length, hidden_size]) token_logits = util.fully_connect_layers( token_output_layer, token_list_output_weights, token_list_output_bias) token_logits = tf.reshape(token_logits, [batch_size, seq_length, 2]) 
token_logits = tf.transpose(token_logits, [2, 0, 1]) unstacked_token_logits = tf.unstack(token_logits, axis=0) (start_logits, end_logits) = (unstacked_token_logits[0], unstacked_token_logits[1]) def compute_loss(logits, positions): one_hot_positions = tf.one_hot(positions, depth=seq_length, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=1) loss = -tf.reduce_sum(one_hot_positions * log_probs, axis=1) return loss token_loss = ( compute_loss(start_logits, start_pos) + compute_loss(end_logits, end_pos)) / 2.0 # per example if not FLAGS.location_loss_for_nonpointable: token_loss *= token_is_pointable per_example_loss = FLAGS.class_loss_ratio * class_loss + ( 1 - FLAGS.class_loss_ratio) * token_loss total_loss += tf.reduce_sum(per_example_loss) per_slot_per_example_loss[slot] = per_example_loss per_slot_class_logits[slot] = class_logits per_slot_start_logits[slot] = start_logits per_slot_end_logits[slot] = end_logits return (total_loss, per_slot_per_example_loss, per_slot_class_logits, per_slot_start_logits, per_slot_end_logits)
def create_ner_model(bert_config, is_training, input_ids, input_mask,
                     segment_ids, num_token_labels, num_predicate_labels,
                     max_seq_length):
  """
  :param bert_config:
  :param is_training:
  :param input_ids:
  :param input_mask:
  :param segment_ids:
  :param num_token_labels:
  :param num_predicate_labels:
  :param max_seq_length:
  :return:
  """
  # import tensorflow as tf
  # import modeling

  # Build the representation from the input data.
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
  )

  # We "pool" the model by simply taking the hidden state corresponding to the
  # first token: float Tensor of shape [batch_size, hidden_size].
  predicate_output_layer = model.get_pooled_output()
  intent_hidden_size = predicate_output_layer.shape[-1].value

  predicate_output_weights = tf.get_variable(
      "predicate_output_weights", [num_predicate_labels, intent_hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  predicate_output_bias = tf.get_variable(
      "predicate_output_bias", [num_predicate_labels],
      initializer=tf.zeros_initializer())

  with tf.variable_scope("predicate_loss"):
    if is_training:
      # I.e., 0.1 dropout
      predicate_output_layer = tf.nn.dropout(
          predicate_output_layer, keep_prob=0.9)
    predicate_logits = tf.matmul(
        predicate_output_layer, predicate_output_weights, transpose_b=True)
    predicate_logits = tf.nn.bias_add(predicate_logits, predicate_output_bias)
    predicate_probabilities = tf.nn.softmax(predicate_logits, axis=-1)
    predicate_prediction = tf.argmax(
        predicate_probabilities, axis=-1, output_type=tf.int32)
    # predicate_labels = tf.one_hot(predicate_label_id, depth=num_predicate_labels, dtype=tf.float32)
    # predicate_per_example_loss = tf.reduce_sum(
    #     tf.nn.sigmoid_cross_entropy_with_logits(logits=predicate_logits, labels=predicate_labels), -1)
    # predicate_loss = tf.reduce_mean(predicate_per_example_loss)

  # Final hidden layer of the encoder:
  # float Tensor of shape [batch_size, seq_length, hidden_size].
  token_label_output_layer = model.get_sequence_output()
  token_label_hidden_size = token_label_output_layer.shape[-1].value

  token_label_output_weight = tf.get_variable(
      "token_label_output_weights",
      [num_token_labels, token_label_hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  token_label_output_bias = tf.get_variable(
      "token_label_output_bias", [num_token_labels],
      initializer=tf.zeros_initializer())

  with tf.variable_scope("token_label_loss"):
    if is_training:
      token_label_output_layer = tf.nn.dropout(
          token_label_output_layer, keep_prob=0.9)
    token_label_output_layer = tf.reshape(
        token_label_output_layer, [-1, token_label_hidden_size])
    token_label_logits = tf.matmul(
        token_label_output_layer, token_label_output_weight, transpose_b=True)
    token_label_logits = tf.nn.bias_add(
        token_label_logits, token_label_output_bias)
    token_label_logits = tf.reshape(
        token_label_logits, [-1, max_seq_length, num_token_labels])
    # token_label_log_probs = tf.nn.log_softmax(token_label_logits, axis=-1)
    # token_label_one_hot_labels = tf.one_hot(token_label_ids, depth=num_token_labels, dtype=tf.float32)
    # token_label_per_example_loss = -tf.reduce_sum(token_label_one_hot_labels * token_label_log_probs, axis=-1)
    # token_label_loss = tf.reduce_sum(token_label_per_example_loss)
    token_label_probabilities = tf.nn.softmax(token_label_logits, axis=-1)
    # token_label_predictions = tf.argmax(token_label_probabilities, axis=-1)
    # return (token_label_loss, token_label_per_example_loss, token_label_logits, token_label_predict)

  # loss = 0.5 * predicate_loss + token_label_loss
  # return (loss,
  #         predicate_loss, predicate_per_example_loss, predicate_probabilities, predicate_prediction,
  #         token_label_loss, token_label_per_example_loss, token_label_logits, token_label_predictions)
  return (predicate_probabilities, token_label_probabilities)
# token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])
# token_type_ids = tf.constant(np.random.randint(0, 2, [2, 3]))
token_type_ids = tf.placeholder(
    shape=[2, 3], dtype=tf.int32, name='token_type_ids')

config = modeling.BertConfig(
    vocab_size=32000,
    hidden_size=768,
    num_hidden_layers=8,
    num_attention_heads=6,
    intermediate_size=1024)

model = modeling.BertModel(
    config=config,
    is_training=True,
    input_ids=input_ids,
    input_mask=input_mask,
    token_type_ids=token_type_ids)

label_embeddings = tf.get_variable(
    name="word_embeddings",
    shape=[768, 12],
    initializer=tf.truncated_normal_initializer(0.02))
pooled_output = model.get_pooled_output()
logits = tf.matmul(pooled_output, label_embeddings)

with tf.compat.v1.Session() as sess:
  sess.run(tf.global_variables_initializer())
  rand_array = np.random.randint(0, 1, [2, 3])
  # The feed_dict below is an assumed completion: the original call breaks off
  # after `sess.run(logits,`, and token_type_ids is the only placeholder
  # defined in this excerpt.
  print(sess.run(logits, feed_dict={token_type_ids: rand_array}))
def create_classification_model(bert_config, is_training, input_ids,
                                input_mask, segment_ids, labels, num_labels):
  """
  :param bert_config:
  :param is_training:
  :param input_ids:
  :param input_mask:
  :param segment_ids:
  :param labels:
  :param num_labels:
  :return:
  """
  # import tensorflow as tf
  # import modeling

  # Build the representation from the input data.
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
  )

  embedding_layer = model.get_sequence_output()
  output_layer = model.get_pooled_output()
  hidden_size = output_layer.shape[-1].value

  # predict = CNN_Classification(embedding_chars=embedding_layer,
  #                              labels=labels,
  #                              num_tags=num_labels,
  #                              sequence_length=FLAGS.max_seq_length,
  #                              embedding_dims=embedding_layer.shape[-1].value,
  #                              vocab_size=0,
  #                              filter_sizes=[3, 4, 5],
  #                              num_filters=3,
  #                              dropout_keep_prob=FLAGS.dropout_keep_prob,
  #                              l2_reg_lambda=0.001)
  # loss, predictions, probabilities = predict.add_cnn_layer()

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probabilities = tf.sigmoid(logits)
    if labels is not None:
      label_ids = tf.cast(labels, tf.float32)
      per_example_loss = tf.reduce_sum(
          tf.nn.sigmoid_cross_entropy_with_logits(
              logits=logits, labels=label_ids), axis=-1)
      loss = tf.reduce_mean(per_example_loss)
    else:
      loss, per_example_loss = None, None

  return (loss, per_example_loss, logits, probabilities)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` to be returned.

  :param features: input dict
  :param labels: labels
  :param mode: mode, training or eval
  :param params:
  :return: the output spec
  """
  # Log feature names and shapes.
  tf.logging.info("*** Features ***")
  for name in sorted(features.keys()):
    tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

  # Fetch the individual inputs.
  input_ids = features["input_ids"]  # input id tensor, [batch_size, seq_len]
  input_mask = features["input_mask"]  # input mask tensor, [batch_size, seq_len]
  segment_ids = features["segment_ids"]  # first vs. second sentence, [batch_size, seq_len]
  masked_lm_positions = features["masked_lm_positions"]  # masked positions, [batch_size, masked_len]
  masked_lm_ids = features["masked_lm_ids"]  # masked LM labels, [batch_size, masked_len]
  masked_lm_weights = features["masked_lm_weights"]  # weights of the masked labels, [batch_size, masked_len]
  next_sentence_labels = features["next_sentence_labels"]  # next-sentence labels, [batch_size]

  is_training = (mode == tf.estimator.ModeKeys.TRAIN)  # whether the model is being trained

  # Build the BERT model; see modeling.py for details.
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # Downstream task: masked language model head.
  (masked_lm_loss, masked_lm_example_loss,
   masked_lm_log_probs) = get_masked_lm_output(
       bert_config, model.get_sequence_output(), model.get_embedding_table(),
       masked_lm_positions, masked_lm_ids, masked_lm_weights)

  # Downstream task: next-sentence prediction head.
  (next_sentence_loss, next_sentence_example_loss,
   next_sentence_log_probs) = get_next_sentence_output(
       bert_config, model.get_pooled_output(), next_sentence_labels)

  total_loss = masked_lm_loss + next_sentence_loss  # combined loss of the two tasks

  tvars = tf.trainable_variables()  # trainable parameters of the model
  initialized_variable_names = {}  # names of variables initialized from the checkpoint
  scaffold_fn = None
  # Initialize parameters from an existing checkpoint.
  if init_checkpoint:
    (assignment_map, initialized_variable_names
    ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    if use_tpu:
      # TPU-specific variable initialization.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      # Non-TPU variable initialization.
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  # Log the name and shape of each variable.
  tf.logging.info("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                    init_string)

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:  # training mode
    # Create the optimizer.
    train_op = optimization.create_optimizer(total_loss, learning_rate,
                                             num_train_steps, num_warmup_steps,
                                             use_tpu)
    # Output spec for training.
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn)
  elif mode == tf.estimator.ModeKeys.EVAL:  # evaluation mode

    def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                  masked_lm_weights, next_sentence_example_loss,
                  next_sentence_log_probs, next_sentence_labels):
      """Metric function: computes the model's loss and accuracy.

      :param masked_lm_example_loss: per-example masked LM loss, [batch_size, masked_len]
      :param masked_lm_log_probs: masked LM log-probabilities, [batch_size*masked_len, vocab_size]
      :param masked_lm_ids: masked LM label ids, [batch_size, masked_len]
      :param masked_lm_weights: masked LM label weights, [batch_size, masked_len]
      :param next_sentence_example_loss: per-example next-sentence loss, [batch_size]
      :param next_sentence_log_probs: next-sentence log-probabilities, [batch_size, 2]
      :param next_sentence_labels: next-sentence labels, [batch_size]
      :return:
      """
      # Flatten all but the last dimension, [batch_size*masked_len, vocab_size].
      masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                       [-1, masked_lm_log_probs.shape[-1]])
      # Argmax over the vocabulary gives the predictions, [batch_size*masked_len].
      masked_lm_predictions = tf.argmax(
          masked_lm_log_probs, axis=-1, output_type=tf.int32)
      # Flatten the per-example loss, [batch_size*masked_len].
      masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
      masked_lm_ids = tf.reshape(masked_lm_ids, [-1])  # flatten, [batch_size*masked_len]
      masked_lm_weights = tf.reshape(masked_lm_weights, [-1])  # flatten, [batch_size*masked_len]
      # Accuracy from the labels and predictions.
      masked_lm_accuracy = tf.metrics.accuracy(
          labels=masked_lm_ids,
          predictions=masked_lm_predictions,
          weights=masked_lm_weights)
      # Overall loss from the per-example losses and weights.
      masked_lm_mean_loss = tf.metrics.mean(
          values=masked_lm_example_loss, weights=masked_lm_weights)

      # Flatten all but the last dimension, [batch_size, 2].
      next_sentence_log_probs = tf.reshape(
          next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
      # Argmax gives the predictions, [batch_size].
      next_sentence_predictions = tf.argmax(
          next_sentence_log_probs, axis=-1, output_type=tf.int32)
      next_sentence_labels = tf.reshape(next_sentence_labels, [-1])  # flatten, [batch_size]
      # Accuracy from the labels and predictions.
      next_sentence_accuracy = tf.metrics.accuracy(
          labels=next_sentence_labels, predictions=next_sentence_predictions)
      # Mean loss of the next-sentence prediction task.
      next_sentence_mean_loss = tf.metrics.mean(
          values=next_sentence_example_loss)

      return {
          "masked_lm_accuracy": masked_lm_accuracy,
          "masked_lm_loss": masked_lm_mean_loss,
          "next_sentence_accuracy": next_sentence_accuracy,
          "next_sentence_loss": next_sentence_mean_loss,
      }

    # Metric function and its arguments for evaluation.
    eval_metrics = (metric_fn, [
        masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
        masked_lm_weights, next_sentence_example_loss,
        next_sentence_log_probs, next_sentence_labels
    ])
    # Output spec for evaluation.
    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn)
  else:
    raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

  return output_spec  # return the output spec
def main(_): tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) processors = { "cola": ColaProcessor, "mnlim": MnliMProcessor, "mnlimm": MnliMMProcessor, "mrpc": MrpcProcessor, "qnli": QnliProcessor, "qqp": QqpProcessor, "rte": RteProcessor, "sst2": Sst2Processor, "stsb": StsbProcessor, "wnli": WnliProcessor, "ax": AxProcessor, "mnlimdevastest": MnliMDevAsTestProcessor } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_eval: raise ValueError("At least 'do_eval' must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.io.gfile.makedirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() print("Current task", task_name) if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() # special handling for mnlimdevastest if task_name == 'mnlimdevastest': task_name = 'mnlim' label_list = processor.get_labels() print("Label list of current task", label_list) tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) print("num_actual_eval_examples", num_actual_eval_examples) batch_size = FLAGS.eval_batch_size embed_dim = FLAGS.hidden_size # hidden size, 768 for BERT-base, 512 for BERT-small seq_length = FLAGS.max_seq_length num_labels = len(label_list) # Define some placeholders for the input input_ids_ph = tf.compat.v1.placeholder(tf.int32, shape=[None, seq_length], name='input_ids') input_mask_ph = tf.compat.v1.placeholder(tf.int32, shape=[None, seq_length], name='input_mask') segment_ids_ph = tf.compat.v1.placeholder(tf.int32, shape=[None, seq_length], name='segment_ids') label_ids_ph = tf.compat.v1.placeholder(tf.int32, shape=[ None, ], name='label_ids') tf.compat.v1.logging.info( "Running single-head masking out and direct evaluation!") # we want to mask out the individual head and then evaluate. So there are 12 layers * 12 heads results. 
n_layers = 12 n_heads = 12 folder = FLAGS.output_dir save_file = 'single_head_mask.pickle' output = np.zeros((n_layers, n_heads)) # two placeholders for the head coordinates, layer, head head_mask_ph = tf.compat.v1.placeholder(tf.int32, shape=[ None, ], name='head_mask') layer_mask_ph = tf.compat.v1.placeholder(tf.int32, shape=[ None, ], name='layer_mask') model = modeling.BertModel( config=bert_config, is_training=False, input_ids=input_ids_ph, # input_ids, input_mask=input_mask_ph, # input_mask, token_type_ids=segment_ids_ph, # segment_ids, use_one_hot_embeddings=False, head_mask=head_mask_ph, layer_mask=layer_mask_ph) output_layer = model.get_pooled_output() output_weights = tf.get_variable( "output_weights", [num_labels, embed_dim], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable("output_bias", [num_labels], initializer=tf.zeros_initializer()) logits = tf.matmul(output_layer, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) with tf.compat.v1.variable_scope("loss"): # for stsb if num_labels == 1: logits = tf.squeeze(logits, [-1]) per_example_loss = tf.square(logits - label_ids_ph) loss = tf.reduce_mean(per_example_loss) else: log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids_ph, depth=num_labels, dtype=tf.float32) per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) loss = tf.reduce_mean(per_example_loss) predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) # metric and summary # metric is tf.metric object, (val, op) metric = metric_fn(per_example_loss, label_ids_ph, logits, num_labels, task_name) metric_name = list(metric.keys()) metric_val = [m[0] for m in metric.values()] metric_op = [m[1] for m in metric.values()] init_checkpoint = FLAGS.init_checkpoint tvars = tf.compat.v1.trainable_variables() saver_init = tf.train.Saver(tvars) # Isolate the variables stored behind the scenes by the metric operation var_metric = [] for key in metric.keys(): var_metric.extend( tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope=key)) # Define initializer to initialize/reset running variables metric_vars_initializer = tf.variables_initializer(var_list=var_metric) config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True with tf.compat.v1.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) saver_init.restore(sess, init_checkpoint) # if number of eval examples < 1000, just load it directly, or load by batch. if num_actual_eval_examples <= 1000: eval_input_ids, eval_input_mask, eval_segment_ids, \ eval_label_ids, eval_is_real_example = generate_ph_input(batch_size=num_actual_eval_examples, seq_length=seq_length, examples=eval_examples, label_list=label_list, tokenizer=tokenizer) # loop over layers, then loop over heads for l in range(n_layers): for h in range(n_heads): cur_l, cur_h = l, h head_mask = [h] layer_mask = [l] # if number of eval examples < 1000, just load it directly, or load by batch. 
if num_actual_eval_examples <= 1000: sess.run(metric_vars_initializer) sess.run(metric_op, feed_dict={ input_ids_ph: eval_input_ids, input_mask_ph: eval_input_mask, segment_ids_ph: eval_segment_ids, label_ids_ph: eval_label_ids, head_mask_ph: head_mask, layer_mask_ph: layer_mask }) eval_metric_val = sess.run(metric_val) else: num_batch_eval = num_actual_eval_examples // batch_size \ if num_actual_eval_examples % batch_size == 0 \ else num_actual_eval_examples // batch_size + 1 id_eval = 0 sess.run(metric_vars_initializer) for _ in range(num_batch_eval): eval_input_ids, eval_input_mask, eval_segment_ids, \ eval_label_ids, eval_is_real_example = generate_ph_input(batch_size=batch_size, seq_length=seq_length, examples=eval_examples, label_list=label_list, tokenizer=tokenizer, train_idx_offset=id_eval) id_eval += batch_size sess.run(metric_op, feed_dict={ input_ids_ph: eval_input_ids, input_mask_ph: eval_input_mask, segment_ids_ph: eval_segment_ids, label_ids_ph: eval_label_ids, head_mask_ph: head_mask, layer_mask_ph: layer_mask }) eval_metric_val = sess.run(metric_val) for name, val in zip(metric_name, eval_metric_val): if name == 'accuracy': output[cur_l][cur_h] = val print( "Mask out the head in (Layer {}, Head {}) | {}: {}" .format(cur_l, cur_h, name, val)) joblib.dump(output, folder + save_file)
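A brief usage sketch for the result file: the loop above saves a 12 x 12 NumPy array of dev accuracies indexed by (layer, head). Assuming matplotlib is available, and using a hypothetical output path, the matrix can be inspected like this:

import joblib
import numpy as np
import matplotlib.pyplot as plt

# Load the (n_layers, n_heads) accuracy matrix written by joblib.dump above.
acc = joblib.load("output/single_head_mask.pickle")  # hypothetical path

# The head whose removal hurts dev accuracy the most.
layer_idx, head_idx = np.unravel_index(np.argmin(acc), acc.shape)
print("Most damaging mask: layer %d, head %d, accuracy %.4f"
      % (layer_idx, head_idx, acc[layer_idx, head_idx]))

# Heatmap of accuracy after masking each single head.
plt.imshow(acc, cmap="viridis")
plt.xlabel("head")
plt.ylabel("layer")
plt.colorbar(label="dev accuracy with this head masked")
plt.savefig("output/single_head_mask.png")  # hypothetical path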
def main(_): mode = tf.estimator.ModeKeys.TRAIN use_one_hot_embeddings = FLAGS.use_tpu tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) tf.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(","): input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info("*** Input Files ***") for input_file in input_files: tf.logging.info(" %s" % input_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) # model_fn = model_fn_builder( # bert_config=bert_config, # init_checkpoint=FLAGS.init_checkpoint, # learning_rate=FLAGS.learning_rate, # num_train_steps=FLAGS.num_train_steps, # num_warmup_steps=FLAGS.num_warmup_steps, # use_tpu=FLAGS.use_tpu, # use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. # estimator = tf.contrib.tpu.TPUEstimator( # use_tpu=FLAGS.use_tpu, # model_fn=model_fn, # config=run_config, # train_batch_size=FLAGS.train_batch_size, # eval_batch_size=FLAGS.eval_batch_size) if FLAGS.do_train: tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) n_gpus = 4 batch_size = FLAGS.train_batch_size d = input_fn(input_files, FLAGS.train_batch_size * n_gpus, FLAGS.max_seq_length, FLAGS.max_predictions_per_seq, True) features, iterator = parse_input_fn_result(d) # train_input_fn = input_fn_builder( # input_files=input_files, # max_seq_length=FLAGS.max_seq_length, # max_predictions_per_seq=FLAGS.max_predictions_per_seq, # is_training=True) # estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) input_ids_list = tf.split(features["input_ids"], n_gpus, axis=0) input_mask_list = tf.split(features["input_mask"], n_gpus, axis=0) segment_ids_list = tf.split(features["segment_ids"], n_gpus, axis=0) masked_lm_positions_list = tf.split(features["masked_lm_positions"], n_gpus, axis=0) masked_lm_ids_list = tf.split(features["masked_lm_ids"], n_gpus, axis=0) masked_lm_weights_list = tf.split(features["masked_lm_weights"], n_gpus, axis=0) next_sentence_labels_list = tf.split(features["next_sentence_labels"], n_gpus, axis=0) # multi-gpu train with tf.device('/cpu:0'): optimizer = optimization_gpu.create_optimizer( None, FLAGS.learning_rate, FLAGS.num_train_steps, FLAGS.num_warmup_steps, False) global_step = tf.train.get_or_create_global_step() # calculate the gradients on each GPU tower_grads = [] models = [] train_perplexity = tf.get_variable( 'train_perplexity', [], initializer=tf.constant_initializer(0.0), trainable=False) for k in range(n_gpus): with tf.device('/gpu:%d' % k): with tf.variable_scope('lm', reuse=k > 0): # calculate the loss for one model replica and get # lstm states input_ids = input_ids_list[k] input_mask = input_mask_list[k] segment_ids = segment_ids_list[k]
masked_lm_positions = masked_lm_positions_list[k] masked_lm_ids = masked_lm_ids_list[k] masked_lm_weights = masked_lm_weights_list[k] next_sentence_labels = next_sentence_labels_list[k] is_training = (mode == tf.estimator.ModeKeys.TRAIN) model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) (masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( bert_config, model.get_sequence_output(), model.get_embedding_table(), masked_lm_positions, masked_lm_ids, masked_lm_weights) (next_sentence_loss, next_sentence_example_loss, next_sentence_log_probs) = get_next_sentence_output( bert_config, model.get_pooled_output(), next_sentence_labels) total_loss = masked_lm_loss + next_sentence_loss loss = total_loss models.append(model) # get gradients grads = optimizer.compute_gradients( loss, aggregation_method=tf.AggregationMethod. EXPERIMENTAL_TREE, ) tower_grads.append(grads) # keep track of loss across all GPUs train_perplexity += loss average_grads = average_gradients(tower_grads, None, None) average_grads, norm_summary_ops = clip_grads( average_grads, 10.0, True, global_step) train_perplexity = tf.exp(train_perplexity / n_gpus) train_op = optimizer.apply_gradients(average_grads, global_step=global_step) init = tf.global_variables_initializer() saver = tf.train.Saver(tf.global_variables(), max_to_keep=2) with tf.Session(config=tf.ConfigProto( allow_soft_placement=True)) as sess: sess.run(init) sess.run(iterator.initializer) sum = 0 count = 0 t0 = time.time() while True: _, train_perplexity_ = sess.run([train_op, train_perplexity]) sum += train_perplexity_ count += 1 if count % 100 == 0: print("------------") print(time.time() - t0, " ms") t0 = time.time() print("loss ", sum / count) sum = 0 if count % 10000 == 0: checkpoint_path = os.path.join(FLAGS.output_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=global_step) if FLAGS.do_eval: tf.logging.info("***** Running evaluation *****") tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False) result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
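The multi-GPU loop above calls average_gradients and clip_grads helpers that are not part of this excerpt (and whose real signatures take extra arguments). As a rough sketch of what averaging tower gradients usually looks like, under the assumption of dense (grad, var) pairs only:

import tensorflow as tf

def average_gradients_sketch(tower_grads):
    """Average (grad, var) pairs computed on each GPU tower (simplified illustration)."""
    averaged = []
    # tower_grads: one list of (grad, var) pairs per tower, all in the same variable order.
    for grads_and_vars in zip(*tower_grads):
        grads = [g for g, _ in grads_and_vars if g is not None]
        mean_grad = tf.reduce_mean(tf.stack(grads, axis=0), axis=0)
        shared_var = grads_and_vars[0][1]  # the variable itself is shared across towers
        averaged.append((mean_grad, shared_var))
    return averaged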
def gec_create_model(bert_config, is_training, input_sequence, input_mask, segment_ids, edit_sequence, use_one_hot_embeddings, mode, copy_weight, use_bert_more, insert_ids, multitoken_insert_ids, subtract_replaced_from_replacement): """Creates a classification model.""" # insert_ids: word ids of unigram inserts (list) # multitoken_insert_ids: word_ids of bigram inserts (list of tuples of length 2) # Defining the space of all possible edits: # unk, sos and eos are dummy edits mapped to 0, 1 and 2 respectively # copy is mapped to 3 # del is mapped to 4 num_appends = len(insert_ids) + len(multitoken_insert_ids) num_replaces = num_appends # appends and replacements come from the same set (inserts and multitoken_inserts) append_begin = 5 # First append edit (mapped to 5) append_end = append_begin + num_appends - 1 #Last append edit rep_begin = append_end + 1 # First replace edit rep_end = rep_begin + num_replaces - 1 #Last replace edit num_suffix_transforms = 58 #num of transformation edits num_labels = 5 + num_appends + num_replaces + num_suffix_transforms # total number of edits print("************ num of labels : {} ***************".format(num_labels)) config = bert_config input_sequence_shape = modeling.get_shape_list(input_sequence,2) batch_size = input_sequence_shape[0] seq_len = input_sequence_shape[1] if not use_bert_more: #default use of bert (without logit factorisation) model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_sequence, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) output_layer = model.get_sequence_output() else: # LOGIT FACTORISATION is On! model = modified_modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_sequence, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) output_layer = model.get_sequence_output() replace_layer = output_layer[:,seq_len:2*seq_len,:] #representation of replacement slots as described in paper append_layer = output_layer[:,2*seq_len:3*seq_len,:] #representation of append slots as described in paper output_layer = output_layer[:,0:seq_len,:] output_layer_shape = modeling.get_shape_list(output_layer,3) hidden_size = output_layer_shape[-1] flattened_output_layer = tf.reshape(output_layer,[-1, hidden_size]) h_edit = flattened_output_layer if use_bert_more: h_word = flattened_output_layer flattened_replace_layer = tf.reshape(replace_layer,[-1, hidden_size]) flattened_append_layer = tf.reshape(append_layer, [-1, hidden_size]) m_replace = flattened_replace_layer m_append = flattened_append_layer with tf.variable_scope("cls/predictions"): with tf.variable_scope("transform"): h_word = tf.layers.dense( h_word, units=bert_config.hidden_size, activation=modeling.get_activation(bert_config.hidden_act), kernel_initializer=modeling.create_initializer( bert_config.initializer_range)) h_word = modeling.layer_norm(h_word) with tf.variable_scope("cls/predictions",reuse=True): with tf.variable_scope("transform",reuse=True): m_replace = tf.layers.dense( m_replace, units=bert_config.hidden_size, activation=modeling.get_activation(bert_config.hidden_act), kernel_initializer=modeling.create_initializer( bert_config.initializer_range)) m_replace = modeling.layer_norm(m_replace) with tf.variable_scope("cls/predictions",reuse=True): with tf.variable_scope("transform",reuse=True): m_append = tf.layers.dense( m_append, units=bert_config.hidden_size, activation=modeling.get_activation(bert_config.hidden_act), 
kernel_initializer=modeling.create_initializer( bert_config.initializer_range)) m_append = modeling.layer_norm(m_append) word_embedded_input = model.word_embedded_input flattened_word_embedded_input = tf.reshape(word_embedded_input, [-1, hidden_size]) labels = edit_sequence edit_weights = tf.get_variable( "edit_weights", [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) if is_training: h_edit = tf.nn.dropout(h_edit, keep_prob=0.9) if use_bert_more: # append/replace weight vector for a given append or replace operation # correspond to word embedding for its token argument # for multitoken append/replace (e.g. has been) # weight vector is sum of word embeddings of token arguments append_weights = edit_word_embedding_lookup(model.embedding_table, insert_ids, use_one_hot_embeddings, config.vocab_size, config.hidden_size) replace_weights = append_weights #tokens in replace and append vocab are same #(i.e. inserts and multitoken_inserts) multitoken_append_weights = wem_utils.edit_embedding_loopkup(model.embedding_table, multitoken_insert_ids, use_one_hot_embeddings, config.vocab_size, config.hidden_size) multitoken_replace_weights = multitoken_append_weights #tokens in replace and append vocab are same #(i.e. inserts and multitoken_inserts) append_weights = tf.concat([append_weights, multitoken_append_weights],0) replace_weights = tf.concat([replace_weights, multitoken_replace_weights],0) with tf.variable_scope("loss"): edit_logits = tf.matmul(h_edit, edit_weights, transpose_b=True) #first term in eq3 in paper logits = edit_logits if use_bert_more: #=============== inplace_word_logits==============# #2nd term in eq3 in paper inplace_logit = tf.reduce_sum(h_word * flattened_word_embedded_input, axis=1, keepdims=True) #copy #inplace_logit = tf.reduce_sum(m_replace * flattened_word_embedded_input, axis=1, keepdims=True) #copy inplace_logit_appends = tf.tile(inplace_logit,[1,num_appends]) inplace_logit_transforms = tf.tile(inplace_logit,[1,num_suffix_transforms]) zero_3_logits = tf.zeros([batch_size*seq_len,3]) #unk sos eos zero_1_logits = tf.zeros([batch_size*seq_len,1]) # del zero_replace_logits = tf.zeros([batch_size*seq_len,num_replaces]) concat_list = [zero_3_logits, inplace_logit, zero_1_logits]\ + [inplace_logit_appends]\ + [zero_replace_logits]\ + [inplace_logit_transforms] inplace_word_logits = tf.concat(concat_list,1) #======additional (insert,replace) logits ====# #3rd term in eqn3 in paper zero_5_logits = tf.zeros([batch_size*seq_len,5]) append_logits = tf.matmul(m_append, append_weights, transpose_b=True) if subtract_replaced_from_replacement: replace_logits = replacement_minus_replaced_logits(m_replace, flattened_word_embedded_input, replace_weights) else: replace_logits = tf.matmul(m_replace, replace_weights, transpose_b=True) suffix_logits = tf.zeros([batch_size*seq_len,num_suffix_transforms]) concat_list = [zero_5_logits, append_logits, replace_logits, suffix_logits] additional_logits = tf.concat(concat_list,1) #====================================================# logits = edit_logits + inplace_word_logits + additional_logits logits_bias = tf.get_variable("output_bias", shape=[num_labels], initializer=tf.zeros_initializer()) logits += logits_bias logits = tf.reshape(logits, [output_layer_shape[0], output_layer_shape[1], num_labels]) log_probs = tf.nn.log_softmax(logits, axis=-1) probs = tf.nn.softmax(logits,axis=-1) one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, 
axis=-1) per_token_loss = per_token_loss * tf.to_float(input_mask) mask = copy_weight*tf.to_float(tf.equal(labels,3)) + tf.to_float(tf.not_equal(labels,3)) masked_per_token_loss = per_token_loss * mask per_example_loss = tf.reduce_sum(masked_per_token_loss, axis=-1) loss = tf.reduce_mean(per_example_loss) return (loss, per_example_loss, logits, probs)
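The mask in the last few lines down-weights the dominant 'copy' edit (label 3) by copy_weight so that copy operations do not swamp the training signal. A tiny NumPy illustration with made-up labels and weight:

import numpy as np

labels = np.array([3, 3, 7, 3, 4])   # hypothetical edit labels; 3 = copy
copy_weight = 0.3                    # hypothetical down-weighting factor
mask = copy_weight * (labels == 3) + 1.0 * (labels != 3)
# copy positions get weight 0.3, every other edit keeps weight 1.0
print(mask)  # [0.3 0.3 1.  0.3 1. ]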
def create_model(bert_config, is_training, input_ids, head_ids, tail_ids, position1_ids, position2_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel(config=bert_config, is_training=is_training, input_ids=input_ids, head_ids=head_ids, tail_ids=tail_ids, position1_ids=position1_ids, position2_ids=position2_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) # [ batch_size, seq_length, hidden_size ] encoder_layer = model.get_all_encoder_layers()[-1] mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1) masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / ( tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10) sentence_embedding = masked_reduce_mean(encoder_layer, input_mask) pos_head_embedding = model.get_head_embedding() pos_tail_embedding = model.get_tail_embedding() neg_head_embedding = tf.random_shuffle(pos_head_embedding) neg_tail_embedding = tf.random_shuffle(pos_tail_embedding) hidden_size = encoder_layer.shape[-1].value output_weights = tf.get_variable( "output_weights", [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable("output_bias", [num_labels], initializer=tf.zeros_initializer()) with tf.variable_scope("loss"): if is_training: # I.e., 0.1 dropout sentence_embedding = tf.nn.dropout(sentence_embedding, keep_prob=0.9) pos = tf.reduce_sum(tf.abs(pos_tail_embedding + pos_head_embedding - sentence_embedding), axis=1, keep_dims=True) neg = tf.reduce_sum(tf.maximum( tf.abs(neg_tail_embedding + pos_head_embedding - sentence_embedding), tf.abs(pos_tail_embedding + neg_head_embedding - sentence_embedding)), axis=1, keep_dims=True) pre_trans_loss = tf.maximum(pos - neg + FLAGS.marign, 0) loss = tf.reduce_mean(pre_trans_loss) logits = tf.matmul(sentence_embedding, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) probabilities = tf.nn.softmax(logits, axis=-1) # log_probs = tf.nn.log_softmax(logits, axis=-1) # # one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) # # per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) # loss = tf.reduce_mean(per_example_loss) # return (loss, per_example_loss, logits, probabilities) return loss, pre_trans_loss, logits, probabilities
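The loss here is a TransE-style margin ranking objective: the head and tail embeddings of the true pair should reconstruct the sentence embedding more closely than a shuffled (negative) pair, by at least the margin given by FLAGS.marign. A minimal NumPy sketch of the same objective with made-up vectors and margin:

import numpy as np

margin = 1.0                                         # hypothetical margin value
s = np.array([0.2, 0.1, 0.4])                        # sentence embedding
h_pos, t_pos = np.array([0.1, 0.0, 0.3]), np.array([0.1, 0.1, 0.1])
h_neg, t_neg = np.array([0.9, 0.5, 0.2]), np.array([0.4, 0.8, 0.6])

pos = np.sum(np.abs(t_pos + h_pos - s))              # distance of the true triple
neg = np.sum(np.maximum(np.abs(t_neg + h_pos - s),   # element-wise max of the two
                        np.abs(t_pos + h_neg - s)))  # corrupted-triple distances
loss = max(pos - neg + margin, 0.0)                  # hinge: true triple must win by the margin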
param['num_class'] = len(index2label) # bert input input_ids = tf.placeholder (shape = [None, param['sentence_len']], dtype = tf.int32, name = 'input_ids') input_mask = tf.placeholder (shape = [None, param['sentence_len']], dtype = tf.int32, name = 'input_mask') segment_ids = tf.placeholder (shape = [None, param['sentence_len']], dtype = tf.int32, name = 'segment_ids') input_labels = tf.placeholder (shape = [None, param['num_class']], dtype = tf.float32, name = 'input_labels') train_flag = tf.placeholder (dtype = tf.bool, name = 'is_training') dropout_keep_prob = tf.placeholder(dtype = tf.float32, name = 'dropout_keep_prob') learning_rate = tf.placeholder(dtype = tf.float32, name = 'learning_rate') bert_config = modeling.BertConfig.from_json_file(param['bert_config_path']) model = modeling.BertModel( config = bert_config, is_training = train_flag, input_ids = input_ids, input_mask = input_mask, token_type_ids = segment_ids, use_one_hot_embeddings = False # set True when using a TPU, otherwise False ) output_layer = model.get_pooled_output() hidden_size = output_layer.shape[-1].value # 768 # your own fully connected layer output_weights = tf.get_variable('output_weights', [hidden_size, param['num_class']], initializer = tf.truncated_normal_initializer(stddev = 0.02)) output_bias = tf.get_variable('output_bias', [param['num_class']], initializer = tf.zeros_initializer()) with tf.variable_scope('loss'): output_layer = tf.nn.dropout(output_layer, keep_prob = dropout_keep_prob) logits = tf.matmul(output_layer, output_weights) logits = tf.nn.bias_add(logits, output_bias)
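The snippet stops at the logits. A plausible continuation (not part of the original excerpt) adds a loss and a training op inside the same 'loss' scope; since input_labels is a float one-hot/multi-hot tensor of shape [None, num_class], a softmax cross-entropy is one reasonable choice:

# Hypothetical continuation: cross-entropy loss plus a simple Adam training op.
probabilities = tf.nn.softmax(logits, axis=-1)
per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=input_labels, logits=logits)
loss = tf.reduce_mean(per_example_loss)
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)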
configsession.gpu_options.allow_growth = True sess = tf.Session(config=configsession) input_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids") input_mask = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_mask") segment_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="segment_ids") with sess.as_default(): model = modeling.BertModel(config=bert_config, is_training=True, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=False) encoder_last_layer = model.get_sequence_output() encoder_last2_layer = model.all_encoder_layers[-2] #saver = tf.train.Saver() sess.run(tf.global_variables_initializer() ) # Note in particular: initialize the variables first and only then load the checkpoint; otherwise BERT's parameters would be re-initialized. This is where it differs from demo1. #saver.restore(sess, pathname) #print(1) token = tokenization.CharTokenizer( vocab_file="chinese_L-12_H-768_A-12/vocab.txt") query = u'美国大选,特朗普到底是咋想的,难道美国人民眼睛有问题吗?' split_tokens = token.tokenize(query) print(split_tokens) word_ids = token.convert_tokens_to_ids(split_tokens)
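To actually run the query through the graph, the token ids still have to be padded and paired with a mask and segment ids before being fed into the placeholders defined above. A hedged sketch (the 128-token padding length is an assumption, and [CLS]/[SEP] handling is omitted, matching how word_ids was built):

import numpy as np

max_len = 128                                        # hypothetical padding length
ids = word_ids[:max_len]
mask = [1] * len(ids) + [0] * (max_len - len(ids))   # 1 for real tokens, 0 for padding
ids = ids + [0] * (max_len - len(ids))
segs = [0] * max_len                                 # single-sentence input

feed = {
    input_ids: np.array([ids], dtype=np.int32),      # batch of one query
    input_mask: np.array([mask], dtype=np.int32),
    segment_ids: np.array([segs], dtype=np.int32),
}
last_layer_out = sess.run(encoder_last_layer, feed_dict=feed)
print(last_layer_out.shape)                          # (1, 128, hidden_size)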
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, seq_length, num_labels, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel(config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) embedding = model.get_sequence_output() embeddings = tf.layers.dropout(embedding, rate=FLAGS.dropout_rate, training=is_training) with tf.variable_scope('Graph', reuse=None, custom_getter=None): # LSTM t = tf.transpose(embeddings, perm=[1, 0, 2]) lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell( 128) # for sequence labeling the number of LSTM units is usually set to max_seq_length lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(128) lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw) output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=seq_length) output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=seq_length) output = tf.concat([output_fw, output_bw], axis=-1) output = tf.transpose(output, perm=[1, 0, 2]) tf.logging.info(output.shape) output = tf.layers.dropout(output, rate=0.5, training=is_training) output = tf.reshape(output, [-1, 128 * 256]) output_weights = tf.get_variable( "output_weights", [num_labels, 128 * 256], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable("output_bias", [num_labels], initializer=tf.zeros_initializer()) logits = tf.matmul(output, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) tf.logging.info("*****shape of label_ids******") tf.logging.info(label_ids.shape) tf.logging.info(logits.shape) correctPred = tf.equal( tf.argmax(logits, 1), tf.argmax(label_ids, 1) ) # tf.argmax: Returns the index with the largest value across axes of a tensor. accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32)) tf.summary.scalar('Accuracy', accuracy) loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=label_ids)) # optimizer = tf.train.AdamOptimizer().minimize(loss) # crf_params = tf.get_variable("crf", [num_labels, num_labels], dtype=tf.float32) # trans = tf.get_variable( # "transitions", # shape=[num_labels, num_labels], # initializer=initializers.xavier_initializer()) # pred_ids, trans = tf.contrib.crf.crf_decode(logits, crf_params, seq_length) # log_likelihood, _ = tf.contrib.crf.crf_log_likelihood( # logits, label_ids, seq_length, crf_params) # loss = tf.reduce_mean(-log_likelihood) # if mode == tf.estimator.ModeKeys.EVAL: # return tf.estimator.EstimatorSpec( # mode, loss=loss, eval_metric_ops=metrics) # elif mode == tf.estimator.ModeKeys.TRAIN: # return loss, logits, trans, pred_ids return loss, tf.argmax(logits, 1)
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] masked_lm_positions = features["masked_lm_positions"] masked_lm_ids = features["masked_lm_ids"] masked_lm_weights = features["masked_lm_weights"] next_sentence_labels = features["next_sentence_labels"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) (masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( bert_config, model.get_sequence_output(), model.get_embedding_table(), masked_lm_positions, masked_lm_ids, masked_lm_weights) (next_sentence_loss, next_sentence_example_loss, next_sentence_log_probs) = get_next_sentence_output( bert_config, model.get_pooled_output(), next_sentence_labels) total_loss = masked_lm_loss + next_sentence_loss tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) # output_spec = None # if mode == tf.estimator.ModeKeys.TRAIN: # train_op = optimization.create_optimizer( # total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) # # output_spec = tf.contrib.tpu.TPUEstimatorSpec( # mode=mode, # loss=total_loss, # train_op=train_op, # scaffold_fn=scaffold_fn) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) output_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights, next_sentence_example_loss, next_sentence_log_probs, next_sentence_labels): """Computes the loss and accuracy of the model.""" masked_lm_log_probs = tf.reshape( masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]]) masked_lm_predictions = tf.argmax(masked_lm_log_probs, axis=-1, output_type=tf.int32) masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) masked_lm_accuracy = tf.metrics.accuracy( labels=masked_lm_ids, predictions=masked_lm_predictions, weights=masked_lm_weights) masked_lm_mean_loss = tf.metrics.mean( values=masked_lm_example_loss, weights=masked_lm_weights) next_sentence_log_probs = tf.reshape( next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) next_sentence_predictions = tf.argmax(next_sentence_log_probs, axis=-1, output_type=tf.int32) next_sentence_labels = 
tf.reshape(next_sentence_labels, [-1]) next_sentence_accuracy = tf.metrics.accuracy( labels=next_sentence_labels, predictions=next_sentence_predictions) next_sentence_mean_loss = tf.metrics.mean( values=next_sentence_example_loss) return { "masked_lm_accuracy": masked_lm_accuracy, "masked_lm_loss": masked_lm_mean_loss, "next_sentence_accuracy": next_sentence_accuracy, "next_sentence_loss": next_sentence_mean_loss, } # eval_metrics = (metric_fn, [ # masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, # masked_lm_weights, next_sentence_example_loss, # next_sentence_log_probs, next_sentence_labels # ]) # output_spec = tf.contrib.tpu.TPUEstimatorSpec( # mode=mode, # loss=total_loss, # eval_metrics=eval_metrics, # scaffold_fn=scaffold_fn) eval_metrics = metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights, next_sentence_example_loss, next_sentence_log_probs, next_sentence_labels) output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, eval_metric_ops=eval_metrics) else: raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) return output_spec
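Because this variant returns tf.estimator.EstimatorSpec rather than the TPU spec, it can be driven by a plain Estimator. A brief usage sketch, assuming the enclosing model_fn_builder returns this model_fn and that train_input_fn / eval_input_fn are built by the usual input_fn_builder (not shown here):

# Hypothetical wiring; names mirror the standard BERT pretraining script.
run_config = tf.estimator.RunConfig(
    model_dir=FLAGS.output_dir,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps)
estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)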
parser.add_argument("--ckpt_dir", default='/home/chandler/mask-bert/model/ew_75/model.ckpt-24643', type=str, help="model ckpt dir. E.g., MNLI/model_0/model.ckpt-24643") parser.add_argument("--output_dir", default='./fig/masked_weight.png', type=str, help="output png file dir") parser.add_argument("--layer_index", default=0, type=int, help="layer to plot") parser.add_argument("--matrix_name", default="wq", type=str, help="name of the matrix to plot. (wq, wk, wv, fc1, fc2, fc3)") parser.add_argument("--fig_type", default="pdf", type=str, help="figure file extension type") args = parser.parse_args() bert_config = modeling.BertConfig.from_json_file('../Model/uncased_L-12_H-768_A-12/bert_config.json') input_ids = tf.placeholder(tf.int32,(8,256)) model = modeling.BertModel( config=bert_config, is_training=False, input_ids=input_ids) saver = tf.train.Saver() with tf.Session() as sess: sess.run(tf.global_variables_initializer()) ckpt_dir = args.ckpt_dir saver.restore(sess, ckpt_dir) print(f'parsing file {ckpt_dir}') results = [] for probe_layer in range(0,12): mask_wq = sess.graph.get_tensor_by_name(f'bert/encoder/layer_{probe_layer}/attention/self/query/mask-o:0') mask_wk = sess.graph.get_tensor_by_name(f'bert/encoder/layer_{probe_layer}/attention/self/key/mask-o:0') mask_wv = sess.graph.get_tensor_by_name(f'bert/encoder/layer_{probe_layer}/attention/self/value/mask-o:0') mask_fc1 = sess.graph.get_tensor_by_name(f'bert/encoder/layer_{probe_layer}/attention/output/dense/mask-o:0')