def __bert_embedding(self, token_ids, token_masks, segment_ids, masks, keep_prob=0.8):
    """Build the BERT encoder and return its sequence output.

    Args:
        token_ids: forwarded to ``BertModel`` as ``input_ids``.
        token_masks: forwarded to ``BertModel`` as ``input_mask``.
        segment_ids: forwarded to ``BertModel`` as ``token_type_ids``.
        masks: not used by this method; kept so existing callers keep working.
        keep_prob: keep probability of the dropout applied to the BERT output
            while training.

    Returns:
        The BERT sequence output. Dropout is applied only when
        ``self.is_training`` is truthy, so inference is deterministic.
    """
    from bert import modeling

    bert_model = modeling.BertModel(
        config=self.bert_config,
        is_training=self.is_training,
        input_ids=token_ids,
        input_mask=token_masks,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=False)
    # (batch_size, bert_max_seq_length, bert_embedding_size)
    bert_embeddings = bert_model.get_sequence_output()

    # Initialize the freshly created variables from the pre-trained checkpoint.
    if self.is_training and self.bert_init_checkpoint:
        tvars = tf.trainable_variables()
        (assignment_map, initialized_variable_names) = \
            modeling.get_assignment_map_from_checkpoint(tvars, self.bert_init_checkpoint)
        tf.train.init_from_checkpoint(self.bert_init_checkpoint, assignment_map)
        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string)

    # Dropout is a training-time regularizer; previously it was also applied
    # at inference, randomly zeroing activations in predictions.
    if not self.is_training:
        return bert_embeddings
    return tf.nn.dropout(bert_embeddings, keep_prob)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Create the model: a BERT encoder followed by the project's CRF layer.

    :param bert_config: BERT configuration
    :param is_training: forwarded to ``BertModel`` and to the ``CRF`` layer
    :param input_ids: index representation of the input data
    :param input_mask: forwarded to ``BertModel`` as ``input_mask``
    :param segment_ids: forwarded to ``BertModel`` as ``token_type_ids``
    :param labels: index representation of the labels
    :param num_labels: number of classes
    :param use_one_hot_embeddings: forwarded to ``BertModel``
    :return: whatever ``CRF.add_crf_layer()`` returns
    """
    # Run the inputs through BertModel to obtain the per-token embeddings.
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
    )
    # Sequence output: [batch_size, seq_length, embedding_size]
    token_embeddings = bert.get_sequence_output()
    padded_length = token_embeddings.shape[1].value

    # A position counts towards the length when its token id is non-zero.
    nonzero_flags = tf.sign(tf.abs(input_ids))
    # Vector of size [batch_size] holding the sequence lengths of this batch.
    sequence_lengths = tf.reduce_sum(nonzero_flags, reduction_indices=1)

    crf_layer = CRF(embedded_chars=token_embeddings,
                    droupout_rate=FLAGS.droupout_rate,
                    initializers=initializers,
                    num_labels=num_labels,
                    seq_length=padded_length,
                    labels=labels,
                    lengths=sequence_lengths,
                    is_training=is_training)
    return crf_layer.add_crf_layer()
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Build a BERT token-classification model with a softmax output layer.

    Args:
        bert_config: configuration passed to ``modeling.BertModel``.
        is_training: when truthy, dropout (keep_prob=0.9) is applied to the
            BERT sequence output.
        input_ids, input_mask, segment_ids: BERT inputs.
        labels: integer label ids, one-hot encoded with depth ``num_labels``.
        num_labels: number of output classes.
        use_one_hot_embeddings: forwarded to ``BertModel``.

    Returns:
        Tuple ``(loss, per_example_loss, logits, predict)`` where ``loss`` is
        the sum of the per-token cross-entropy and ``predict`` is the argmax
        over the class axis.
    """
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    output_layer = model.get_sequence_output()
    hidden_size = output_layer.shape[-1].value

    output_weight = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
        # Flatten tokens so a single matmul projects every position.
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weight, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        # Restore the sequence axis. The class axis must be num_labels (it was
        # hard-coded to 21, which only worked for a 21-label task).
        logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels])

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_sum(per_example_loss)

        probabilities = tf.nn.softmax(logits, axis=-1)
        predict = tf.argmax(probabilities, axis=-1)
        return (loss, per_example_loss, logits, predict)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Build a BERT -> 2x fused BiLSTM -> projection tagger with CRF or softmax loss.

    Args:
        bert_config: configuration passed to ``modeling.BertModel``.
        is_training: enables dropout on the BERT output and LSTM outputs.
        input_ids, input_mask, segment_ids: BERT inputs.
        labels: integer tag ids per token.
        num_labels: number of tags.
        use_one_hot_embeddings: forwarded to ``BertModel``.

    Returns:
        Tuple ``(loss, per_example_loss, logits, trans, pred_ids)``. ``trans``
        is the tag-transition matrix (created even when ``FLAGS.use_crf`` is
        off) and ``pred_ids`` is zeroed wherever ``input_mask`` is 0.
    """
    # False for feature-based use, is_training for fine-tuning.
    is_training_for_bert = is_training
    if FLAGS.use_feature_based:
        is_training_for_bert = False
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training_for_bert,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings
    )

    embedding = model.get_sequence_output()  # (batch_size, seq_length, embedding_size)
    if is_training:
        # dropout embedding
        embedding = tf.layers.dropout(embedding, rate=FLAGS.bert_dropout_rate,
                                      training=is_training)
    embedding_size = embedding.shape[-1].value
    seq_length = embedding.shape[1].value
    # Sequence length = number of non-zero token ids in each row.
    used = tf.sign(tf.abs(input_ids))
    lengths = tf.reduce_sum(used, axis=1)  # (batch_size)
    print('seq_length', seq_length)
    print('lengths', lengths)

    def bi_lstm_fused(inputs, lengths, rnn_size, is_training, dropout_rate=0.5,
                      scope='bi-lstm-fused'):
        """One bidirectional layer built from two fused LSTM cells, then dropout."""
        with tf.variable_scope(scope):
            t = tf.transpose(inputs, perm=[1, 0, 2])  # Need time-major
            lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(rnn_size)
            lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(rnn_size)
            lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
            output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=lengths)
            output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=lengths)
            outputs = tf.concat([output_fw, output_bw], axis=-1)
            outputs = tf.transpose(outputs, perm=[1, 0, 2])  # back to batch-major
            return tf.layers.dropout(outputs, rate=dropout_rate, training=is_training)

    def lstm_layer(inputs, lengths, is_training):
        """Stack two ``bi_lstm_fused`` layers, each in its own variable scope."""
        rnn_output = tf.identity(inputs)
        for i in range(2):
            scope = 'bi-lstm-fused-%s' % i
            rnn_output = bi_lstm_fused(rnn_output,
                                       lengths,
                                       rnn_size=FLAGS.lstm_size,
                                       is_training=is_training,
                                       dropout_rate=FLAGS.bilstm_dropout_rate,
                                       scope=scope)  # (batch_size, seq_length, 2*rnn_size)
        return rnn_output

    def project_layer(inputs, out_dim, seq_length, scope='project'):
        """Apply a shared linear projection to every position of ``inputs``."""
        with tf.variable_scope(scope):
            in_dim = inputs.get_shape().as_list()[-1]
            weight = tf.get_variable('W', shape=[in_dim, out_dim],
                                     dtype=tf.float32,
                                     initializer=initializers.xavier_initializer())
            bias = tf.get_variable('b', shape=[out_dim], dtype=tf.float32,
                                   initializer=tf.zeros_initializer())
            t_output = tf.reshape(inputs, [-1, in_dim])  # (batch_size*seq_length, in_dim)
            output = tf.matmul(t_output, weight) + bias  # (batch_size*seq_length, out_dim)
            output = tf.reshape(output, [-1, seq_length, out_dim])  # (batch_size, seq_length, out_dim)
            return output

    def loss_layer(logits, labels, num_labels, lengths, input_mask):
        """Return ``(loss, per_example_loss, trans)`` for the CRF or softmax objective."""
        trans = tf.get_variable(
            "transitions",
            shape=[num_labels, num_labels],
            initializer=initializers.xavier_initializer())
        if FLAGS.use_crf:
            with tf.variable_scope("crf-loss"):
                log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
                    inputs=logits,
                    tag_indices=labels,
                    transition_params=trans,
                    sequence_lengths=lengths)
                per_example_loss = -log_likelihood
                loss = tf.reduce_mean(per_example_loss)
                return loss, per_example_loss, trans
        else:
            labels_one_hot = tf.one_hot(labels, num_labels)
            # log_softmax avoids the log(0) = -inf that log(softmax(x)) yields
            # when a probability underflows.
            cross_entropy = labels_one_hot * tf.nn.log_softmax(logits)
            cross_entropy = -tf.reduce_sum(cross_entropy, axis=2)
            cross_entropy *= tf.to_float(input_mask)  # ignore masked positions
            cross_entropy = tf.reduce_sum(cross_entropy, axis=1)
            cross_entropy /= tf.cast(lengths, tf.float32)  # mean over real tokens
            per_example_loss = cross_entropy
            loss = tf.reduce_mean(per_example_loss)
            return loss, per_example_loss, trans

    # Alternative heads that were tried (kept for reference):
    #   1) logits = project_layer(embedding, num_labels, seq_length, scope='project')
    #   2) lstm_outputs = lstm_layer(embedding, lengths, is_training)
    #      p1 = project_layer(lstm_outputs, FLAGS.lstm_size, seq_length, scope='project-1')
    #      logits = project_layer(p1, num_labels, seq_length, scope='project-2')
    # 3) the head in use: BiLSTM stack followed by one projection.
    lstm_outputs = lstm_layer(embedding, lengths, is_training)
    logits = project_layer(lstm_outputs, num_labels, seq_length, scope='project')

    loss, per_example_loss, trans = loss_layer(logits, labels, num_labels, lengths,
                                               input_mask)
    if FLAGS.use_crf:
        pred_ids, _ = crf.crf_decode(potentials=logits, transition_params=trans,
                                     sequence_length=lengths)
    else:
        probabilities = tf.nn.softmax(logits, axis=-1)
        pred_ids = tf.argmax(probabilities, axis=-1)

    # masking for confirmation. tf.argmax yields int64, so the mask is cast to
    # the prediction dtype; a bare multiply fails on mismatched integer types.
    pred_ids *= tf.cast(input_mask, pred_ids.dtype)

    print('#' * 20)
    print('shape of output_layer:', embedding.shape)
    print('embedding_size:%d' % embedding_size)
    print('seq_length:%d' % seq_length)
    print('shape of logit', logits.shape)
    print('shape of loss', loss.shape)
    print('shape of per_example_loss', per_example_loss.shape)
    print('num labels:%d' % num_labels)
    print('#' * 20)
    return (loss, per_example_loss, logits, trans, pred_ids)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, multilabel, sent_rels, sentiment,
                 entailment_rels, entailment, corr_rels, correlation):
    """Creates a classification model with label-relation regularizers.

    The loss is the mean sigmoid (``multilabel``) or softmax cross-entropy on
    the pooled BERT output, plus three penalties. Each penalty is a scalar
    strength (``sentiment``, ``entailment``, ``correlation``) times the mean of
    the pairwise squared probability differences, weighted element-wise by the
    matching relation matrix (``sent_rels``, ``entailment_rels``, ``corr_rels``).

    Returns:
        ``(loss, per_example_loss, output_layer, logits, probabilities)``,
        where ``output_layer`` is the pooled output after training dropout.
    """
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids)

    # Classification is done on the entire segment, hence the pooled output.
    # For token-level output, use model.get_sequence_output() instead.
    pooled = bert.get_pooled_output()
    hidden_size = pooled.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            pooled = tf.nn.dropout(pooled, keep_prob=0.9)

        projected = tf.matmul(pooled, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(projected, output_bias)

        # Labels are float for both single- and multi-label classification.
        labels = tf.cast(labels, tf.float32)

        if multilabel:
            probabilities = tf.nn.sigmoid(logits)
            tf.logging.info("num_labels:{};logits:{};labels:{}".format(
                num_labels, logits, labels))
            per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=labels, logits=logits)
        else:
            probabilities = tf.nn.softmax(logits, axis=-1)
            per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=labels, logits=logits)
        loss = tf.reduce_mean(per_example_loss)

        # Regularization based on the label-relation prior.
        # Tile the predictions into a [batch, num_labels, num_labels] grid and
        # subtract the same predictions laid out along the other axis, so
        # entry (i, j) holds the difference between labels i and j. Example
        # for predictions [0.1, 0.2, 0.3] (batch size 1):
        #   [ 0.0, -0.1, -0.2]
        #   [ 0.1,  0.0, -0.1]
        #   [ 0.2,  0.1,  0.0]
        probs_row = tf.expand_dims(probabilities, 1)
        tiled = tf.tile(probs_row, [1, num_labels, 1])
        probs_col = tf.transpose(probs_row, perm=[0, 2, 1])
        dists = tf.square(tf.subtract(tiled, probs_col))  # squared distances
        dists = tf.transpose(dists, perm=[0, 2, 1])

        # (summary tag, scalar strength, relation matrix), in the original order.
        regularizers = (
            ("sentiment_regularization", sentiment, sent_rels),
            ("entailment_regularization", entailment, entailment_rels),
            ("correlation_regularization", correlation, corr_rels),
        )
        for summary_tag, strength, relations in regularizers:
            scale = tf.constant(strength)
            weighted = tf.multiply(dists, tf.constant(relations, dtype=tf.float32))
            penalty = tf.multiply(scale, tf.reduce_mean(weighted))
            tf.summary.scalar(summary_tag, penalty)
            loss += penalty

        tf.summary.scalar("loss", loss)

        return (loss, per_example_loss, pooled, logits, probabilities)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, pos_embedding, dp_embedding, num_labels,
                 use_one_hot_embeddings):
    """Build a BERT + extra-feature tagger trained with a linear-chain CRF.

    The BERT sequence output is concatenated with ``pos_embedding`` and
    ``dp_embedding`` along the last axis, projected to ``num_labels`` scores
    per token, and scored/decoded with ``tf.contrib.crf``.

    Args:
        bert_config: configuration passed to ``modeling.BertModel``.
        is_training: when truthy, dropout (keep_prob=0.9) is applied to the
            concatenated features.
        input_ids, input_mask, segment_ids: BERT inputs.
        labels: target tag ids, [batch_size, max_seq_len].
        pos_embedding, dp_embedding: extra per-token features; they must be
            concatenable with the BERT output on the last axis.
        num_labels: number of tags.
        use_one_hot_embeddings: forwarded to ``BertModel``.

    Returns:
        Tuple ``(loss, logits, predict)``: mean negative CRF log-likelihood,
        the unary scores, and the Viterbi-decoded tag ids.
    """
    # Core of the model construction.
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Final hidden layer of the Transformer:
    # float Tensor of shape [batch_size, seq_length, hidden_size].
    output_layer = model.get_sequence_output()
    output_layer = tf.concat([output_layer, pos_embedding, dp_embedding], -1)

    hidden_size = output_layer.shape[-1].value

    output_weight = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    # NOTE(review): the collapsed source does not show which statements sat
    # inside this scope; the CRF calls are kept inside it here. Confirm the
    # resulting variable names against existing checkpoints.
    with tf.variable_scope("loss"):
        if is_training:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
        # Flatten tokens, project to tag scores, then restore the sequence axis.
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weight, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels])

        # Every sequence is treated as full length. The batch dimension is
        # read from the logits at run time instead of FLAGS.train_batch_size /
        # FLAGS.eval_batch_size, so a short final batch no longer breaks the
        # CRF ops.
        length = tf.fill(tf.shape(logits)[:1], FLAGS.max_seq_length)

        # Log-likelihood of the gold tags and the learned transition matrix.
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            inputs=logits,            # [batch_size, max_seq_len, num_tags]
            tag_indices=labels,       # [batch_size, max_seq_len]
            sequence_lengths=length)  # [batch_size]

        predict, viterbi_score = tf.contrib.crf.crf_decode(
            logits, transition_params, length)

        loss = tf.reduce_mean(-log_likelihood)
        return (loss, logits, predict)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 token_label_ids, predicate_label_id, num_token_labels,
                 num_predicate_labels, use_one_hot_embeddings):
    """Creates a joint sentence-level (predicate) and token-level classifier.

    Both heads share one BERT encoder. The predicate head uses the pooled
    output, the token head uses the sequence output, and the combined loss is
    ``0.5 * predicate_loss + token_label_loss``.

    Returns:
        ``(loss, predicate_loss, predicate_per_example_loss,
        predicate_probabilities, predicate_prediction, token_label_loss,
        token_label_per_example_loss, token_label_logits,
        token_label_predictions)``.
    """
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # ---- Predicate head: pooled first-token state, [batch_size, hidden_size].
    pooled = bert.get_pooled_output()
    pooled_width = pooled.shape[-1].value

    predicate_output_weights = tf.get_variable(
        "predicate_output_weights", [num_predicate_labels, pooled_width],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    predicate_output_bias = tf.get_variable(
        "predicate_output_bias", [num_predicate_labels],
        initializer=tf.zeros_initializer())

    with tf.variable_scope("predicate_loss"):
        if is_training:
            # I.e., 0.1 dropout
            pooled = tf.nn.dropout(pooled, keep_prob=0.9)

        predicate_logits = tf.nn.bias_add(
            tf.matmul(pooled, predicate_output_weights, transpose_b=True),
            predicate_output_bias)
        predicate_probabilities = tf.nn.softmax(predicate_logits, axis=-1)
        predicate_prediction = tf.argmax(
            predicate_probabilities, axis=-1, output_type=tf.int32)
        predicate_targets = tf.one_hot(
            predicate_label_id, depth=num_predicate_labels, dtype=tf.float32)
        # Per-class sigmoid cross-entropy summed over the class axis.
        predicate_xent = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=predicate_logits, labels=predicate_targets)
        predicate_per_example_loss = tf.reduce_sum(predicate_xent, -1)
        predicate_loss = tf.reduce_mean(predicate_per_example_loss)

    # ---- Token head: final encoder states, [batch_size, seq_length, hidden_size].
    sequence_states = bert.get_sequence_output()
    sequence_width = sequence_states.shape[-1].value

    token_label_output_weight = tf.get_variable(
        "token_label_output_weights", [num_token_labels, sequence_width],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    token_label_output_bias = tf.get_variable(
        "token_label_output_bias", [num_token_labels],
        initializer=tf.zeros_initializer())

    with tf.variable_scope("token_label_loss"):
        if is_training:
            sequence_states = tf.nn.dropout(sequence_states, keep_prob=0.9)
        # Flatten tokens for one matmul, then restore the sequence axis.
        flat_states = tf.reshape(sequence_states, [-1, sequence_width])
        flat_logits = tf.matmul(
            flat_states, token_label_output_weight, transpose_b=True)
        flat_logits = tf.nn.bias_add(flat_logits, token_label_output_bias)
        token_label_logits = tf.reshape(
            flat_logits, [-1, FLAGS.max_seq_length, num_token_labels])

        token_log_probs = tf.nn.log_softmax(token_label_logits, axis=-1)
        token_targets = tf.one_hot(
            token_label_ids, depth=num_token_labels, dtype=tf.float32)
        token_label_per_example_loss = -tf.reduce_sum(
            token_targets * token_log_probs, axis=-1)
        token_label_loss = tf.reduce_sum(token_label_per_example_loss)

        token_label_probabilities = tf.nn.softmax(token_label_logits, axis=-1)
        token_label_predictions = tf.argmax(token_label_probabilities, axis=-1)

    loss = 0.5 * predicate_loss + token_label_loss
    return (loss,
            predicate_loss, predicate_per_example_loss,
            predicate_probabilities, predicate_prediction,
            token_label_loss, token_label_per_example_loss,
            token_label_logits, token_label_predictions)
def create_original_varmisuse_model(
    bert_config,
    is_training,
    enable_sequence_masking,
    input_ids,
    input_mask,
    segment_ids,
    candidate_mask,
    target_mask,
    error_location_mask,
    use_one_hot_embeddings,
    multi_head_count = 2,
):
    """Creates a two-headed pointer model.

    Pointer probabilities are an attention vector over program tokens:
        (1) M = tanh(Y * Wy_extend + h_extend * Wh_extend)
        (2) multi_headed_alpha = softmax(M * w_extend)
    with Y the BERT sequence output [batch, seq, hidden], h the pooled output
    [batch, hidden], Wy/Wh [hidden, hidden] and w [hidden, multi_head_count];
    the ``*_extend`` tensors are the same values tiled over the batch (or, for
    h, over the sequence) axis.

    Returns:
        ``(loss, per_example_loss, logits_masked, probabilities)``. Head 0 is
        scored against ``error_location_mask`` and head 1 against
        ``target_mask``.
    """
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    final_sequence = bert.get_sequence_output()
    batch_size, sequence_length, hidden_size = modeling.get_shape_list(
        final_sequence, expected_rank=3)
    cls_output = bert.get_pooled_output()

    wy = tf.get_variable(
        "Wy",
        shape=[hidden_size, hidden_size],
        dtype=tf.float32,
        initializer=contrib.layers.xavier_initializer())
    wh = tf.get_variable(
        "Wh",
        shape=[hidden_size, hidden_size],
        dtype=tf.float32,
        initializer=contrib.layers.xavier_initializer())
    w = tf.get_variable(
        "w",
        shape=[hidden_size, multi_head_count],
        dtype=tf.float32,
        initializer=contrib.layers.xavier_initializer())

    # [batch_size, hidden_size, hidden_size]
    wy_extend = tf.tile(tf.expand_dims(wy, 0), [batch_size, 1, 1])
    wh_extend = tf.tile(tf.expand_dims(wh, 0), [batch_size, 1, 1])
    # [batch_size, sequence_length, hidden_size]
    cls_output_extend = tf.tile(
        tf.expand_dims(cls_output, 1), [1, sequence_length, 1])

    candidate_mask_expanded = tf.expand_dims(candidate_mask, 2)

    # Optionally zero out non-candidate positions before the attention.
    attended_sequence = final_sequence
    if enable_sequence_masking:
        hidden_mask = tf.tile(candidate_mask_expanded, [1, 1, hidden_size])
        attended_sequence = tf.multiply(final_sequence, tf.to_float(hidden_mask))
    m = tf.tanh(
        tf.matmul(attended_sequence, wy_extend) +
        tf.matmul(cls_output_extend, wh_extend))

    # [batch_size, hidden_size, multi_head_count]
    w_extend = tf.tile(tf.expand_dims(w, 0), [batch_size, 1, 1])
    # [batch_size, sequence_length, multi_head_count]
    logits = tf.matmul(m, w_extend)

    # Mask the logits of every head with `candidate_mask`.
    head_mask = tf.tile(candidate_mask_expanded, [1, 1, multi_head_count])
    logits_masked = tf.multiply(logits, tf.to_float(head_mask))
    # Softmax over the sequence axis: one distribution per head.
    probabilities = tf.nn.softmax(logits_masked, axis=1)
    location_probabilities, repair_probabilities = tf.unstack(
        probabilities, axis=2)

    def masked_negative_log_prob(selection_mask, token_probabilities):
        """Sum of -log p over the positions selected by `selection_mask`."""
        clipped = tf.clip_by_value(token_probabilities, 1e-10, 1.0)
        weighted = tf.multiply(tf.to_float(selection_mask), tf.log(clipped))
        return -tf.reduce_sum(weighted, axis=1)

    localization_loss = masked_negative_log_prob(
        error_location_mask, location_probabilities)
    repair_loss = masked_negative_log_prob(target_mask, repair_probabilities)
    per_example_loss = localization_loss + repair_loss
    loss = tf.reduce_mean(per_example_loss)

    return loss, per_example_loss, logits_masked, probabilities
def __init__(self, bert_config, num_labels, seq_length, init_checkpoint):
    """Build a multi-GPU (tower) BERT pair-matching classifier graph.

    The fed batch is split evenly across ``gpu_nums`` devices. On each device
    the pooled BERT outputs of even and odd rows are concatenated pairwise,
    classified with a shared dense layer, and trained with a class-weighted
    cross-entropy. Gradients are clipped per tower, averaged with
    ``self.average_gradients`` and applied once.

    Reads the module-level names ``gpu_nums``, ``init_lr``,
    ``num_train_steps`` and ``num_warmup_steps``.

    Args:
        bert_config: configuration passed to ``modeling.BertModel``.
        num_labels: number of output classes.
        seq_length: width of the id/mask/segment placeholders.
        init_checkpoint: optional checkpoint used to initialize variables.
    """
    self.bert_config = bert_config
    self.num_labels = num_labels
    self.seq_length = seq_length
    self.tower_grads = []
    self.losses = []

    self.input_ids = tf.placeholder(tf.int32, [None, self.seq_length], name='input_ids')
    self.input_mask = tf.placeholder(tf.int32, [None, self.seq_length], name='input_mask')
    self.segment_ids = tf.placeholder(tf.int32, [None, self.seq_length], name='segment_ids')
    self.labels = tf.placeholder(tf.int32, [None], name='labels')
    self.batch_size = tf.placeholder(tf.int32, shape=[], name='batch_size')
    self.is_training = tf.placeholder(tf.bool, shape=[], name='is_training')
    print(self.batch_size)
    # Number of rows handled by each GPU.
    self.gpu_step = self.batch_size // gpu_nums

    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(
        learning_rate,
        global_step,
        num_train_steps,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False)

    # Linear warm-up: below num_warmup_steps the rate is
    # init_lr * global_step / num_warmup_steps.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = ((1.0 - is_warmup) * learning_rate +
                         is_warmup * warmup_learning_rate)

    optimizer = optimization.AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    with tf.variable_scope(tf.get_variable_scope()) as outer_scope:
        pred = []
        label = []
        for d in range(gpu_nums):
            with tf.device("/gpu:%s" % d), tf.name_scope("%s_%s" % ("tower", d)):
                # Row range of the fed batch owned by this tower.
                start = d * self.gpu_step
                end = (d + 1) * self.gpu_step
                self.model = modeling.BertModel(
                    config=self.bert_config,
                    is_training=self.is_training,
                    input_ids=self.input_ids[start:end],
                    input_mask=self.input_mask[start:end],
                    token_type_ids=self.segment_ids[start:end])
                print("GPU:", d)

                tvars = tf.trainable_variables()
                initialized_variable_names = {}
                if init_checkpoint:
                    (assignment_map, initialized_variable_names
                     ) = modeling.get_assignment_map_from_checkpoint(
                         tvars, init_checkpoint)
                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

                logging.info("**** Trainable Variables ****")
                for var in tvars:
                    init_string = ""
                    if var.name in initialized_variable_names:
                        init_string = ", *INIT_FROM_CKPT*"
                    logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                                 init_string)

                output_layer = self.model.get_pooled_output()
                logging.info(output_layer)

                # self.is_training is a tf.bool placeholder, so the previous
                # `self.is_training == True` could only be decided when the
                # graph was built, never from the value fed at run time.
                # tf.layers.dropout takes a tensor `training` switch.
                output_layer = tf.layers.dropout(
                    output_layer, rate=0.1, training=self.is_training)

                # Even rows and odd rows form the two sides of each pair.
                match_1 = tf.strided_slice(output_layer, [0], [self.gpu_step], [2])
                match_2 = tf.strided_slice(output_layer, [1], [self.gpu_step], [2])
                match = tf.concat([match_1, match_2], 1)

                self.logits = tf.layers.dense(match, self.num_labels, name='fc',
                                              reuse=tf.AUTO_REUSE)
                # Predicted labels.
                self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1, name="pred")
                logging.info(self.y_pred_cls)

                # Gold labels: one per pair, taken from the even rows.
                self.r_labels = tf.strided_slice(
                    self.labels[start:end], [0], [self.gpu_step], [2])
                logging.info(self.r_labels)

                one_hot_labels = tf.one_hot(self.r_labels, depth=self.num_labels,
                                            dtype=tf.float32)
                log_probs = tf.nn.log_softmax(self.logits, axis=-1)
                # Class-weighted cross-entropy over the first five classes,
                # with hand-picked per-class weights.
                per_example_loss = - (30*one_hot_labels[:,0] * log_probs[:,0]) \
                                   - (9*one_hot_labels[:,1] * log_probs[:,1]) \
                                   - (2*one_hot_labels[:,2] * log_probs[:,2]) \
                                   - (2*one_hot_labels[:,3] * log_probs[:,3]) \
                                   - (9*one_hot_labels[:,4] * log_probs[:,4]) \
                                   + 1e-10

                self.loss = tf.reduce_mean(per_example_loss)

                tvars = tf.trainable_variables()
                grads = tf.gradients(self.loss, tvars)
                (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

                self.tower_grads.append(list(zip(grads, tvars)))
                self.losses.append(self.loss)
                label.append(self.r_labels)
                pred.append(self.y_pred_cls)
            # Later towers reuse the variables created by the first one.
            outer_scope.reuse_variables()

    with tf.name_scope("apply_gradients"), tf.device("/cpu:0"):
        gradients = self.average_gradients(self.tower_grads)
        train_op = optimizer.apply_gradients(gradients, global_step=global_step)
        # The global step is advanced explicitly alongside the update op.
        new_global_step = global_step + 1
        self.train_op = tf.group(train_op, [global_step.assign(new_global_step)])

    self.losses = tf.reduce_mean(self.losses)
    self.pred = tf.concat(pred, 0)
    self.label = tf.concat(label, 0)
    logging.info(self.pred)
    logging.info(self.label)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Only the masked-LM objective is trained and evaluated; the next-sentence
    head is disabled, so the total loss equals the masked-LM loss. Raises
    ``ValueError`` for any mode other than TRAIN or EVAL.
    """
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    masked_lm_positions = features["masked_lm_positions"]
    masked_lm_ids = features["masked_lm_ids"]
    masked_lm_weights = features["masked_lm_weights"]
    # Still read from the features, although the NSP head is disabled.
    next_sentence_labels = features["next_sentence_labels"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
        scope="bert")

    (masked_lm_loss, masked_lm_example_loss,
     masked_lm_log_probs) = get_masked_lm_output(
         bert_config, model.get_sequence_output(), model.get_embedding_table(),
         masked_lm_positions, masked_lm_ids, masked_lm_weights)

    # Next-sentence prediction is disabled: no get_next_sentence_output call,
    # and the total loss is the masked-LM loss alone.
    total_loss = masked_lm_loss

    tvars = tf.trainable_variables()

    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        if use_tpu:
            # On TPU the checkpoint restore is deferred to the scaffold.
            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(
            total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn)

    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                      masked_lm_weights):
            """Computes the loss and accuracy of the model."""
            vocab_axis = masked_lm_log_probs.shape[-1]
            masked_lm_log_probs = tf.reshape(masked_lm_log_probs, [-1, vocab_axis])
            masked_lm_predictions = tf.argmax(
                masked_lm_log_probs, axis=-1, output_type=tf.int32)
            masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
            masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
            masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
            masked_lm_accuracy = tf.metrics.accuracy(
                labels=masked_lm_ids,
                predictions=masked_lm_predictions,
                weights=masked_lm_weights)
            masked_lm_mean_loss = tf.metrics.mean(
                values=masked_lm_example_loss, weights=masked_lm_weights)

            return {
                "masked_lm_accuracy": masked_lm_accuracy,
                "masked_lm_loss": masked_lm_mean_loss
            }

        metric_inputs = [
            masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
            masked_lm_weights
        ]
        eval_metrics = (metric_fn, metric_inputs)
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=eval_metrics,
            scaffold_fn=scaffold_fn)

    raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 start_labels, end_labels, num_labels, use_one_hot_embeddings):
    """Build a BERT + dilated-convolution tagger with start and end heads.

    The BERT sequence output is passed through six 1-D convolutions with
    dilation rates 1, 2, 5, 1, 2, 5, then an average pooling, a ReLU
    convolution with 128 filters and two dense heads producing per-token
    start and end logits.

    Returns:
        A tuple ``(loss, start_logits, end_logits)``; ``loss`` is the equally
        weighted sum of the start and end cross-entropies, each averaged over
        the positions where ``input_mask`` is 1.
    """

    def _penalize_padding(hidden):
        # Subtract 1e10 from every feature at positions where input_mask is 0.
        padding = 1.0 - tf.cast(tf.expand_dims(input_mask, 2), tf.float32)
        return hidden - padding * 1e10

    def _masked_mean(token_loss):
        # Average the per-token loss over positions where input_mask is 1.
        return tf.reduce_sum(
            token_loss * tf.to_float(input_mask)) / tf.reduce_sum(
                tf.to_float(input_mask))

    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # BERT output embedding: [batch_size, max_seq_len, embedding_size]
    hidden = _penalize_padding(bert.get_sequence_output())
    dim = hidden.get_shape().as_list()[-1]

    # Stack of dilated convolutions (rates 1-2-5, applied twice); the padding
    # penalty is re-applied after every layer.
    for rate in (1, 2, 5, 1, 2, 5):
        hidden = tf.layers.conv1d(hidden,
                                  filters=dim,
                                  kernel_size=3,
                                  padding="same",
                                  dilation_rate=rate)
        hidden = _penalize_padding(hidden)

    pooled = tf.layers.average_pooling1d(hidden,
                                         pool_size=dim,
                                         padding="same",
                                         strides=1)

    # 1-D convolution with ReLU: [batch_size, max_seq_length, 128]
    features = tf.layers.conv1d(pooled,
                                filters=128,
                                kernel_size=3,
                                activation=tf.nn.relu,
                                padding="same")

    # Logits for both heads: [batch_size, max_seq_length, num_labels]
    start_logits = tf.layers.dense(features, units=num_labels)
    end_logits = tf.layers.dense(features, units=num_labels)

    start_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=start_labels, logits=start_logits)
    end_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=end_labels, logits=end_logits)

    start_loss = _masked_mean(start_token_loss)
    end_loss = _masked_mean(end_token_loss)
    loss = 0.5 * start_loss + 0.5 * end_loss
    return loss, start_logits, end_logits
def optimize_graph(logger=None,
                   verbose=False,
                   pooling_strategy=PoolingStrategy.REDUCE_MEAN,
                   max_seq_len=40):
    """Freeze a BERT sentence-encoder graph and write it to a file.

    Builds a BERT graph from the config and checkpoint named in the
    module-level ``args``, pools the selected encoder layers according to
    ``pooling_strategy``, converts variables to constants, runs
    ``optimize_for_inference`` and serializes the result.

    Args:
        logger: logger to report progress to; one is created when falsy.
        verbose: forwarded to ``set_logger`` and ``import_tf``.
        pooling_strategy: a ``PoolingStrategy`` member selecting how token
            vectors are reduced.
        max_seq_len: second dimension of the three input placeholders.

    Returns:
        The path of the written graph file, or ``None`` when any exception
        was raised (the exception is logged, not propagated).
    """
    logger = logger or set_logger(colored('BERT_VEC', 'yellow'), verbose)
    try:
        # The GPU is not needed for optimizing the graph.
        tf = import_tf(device_id=0, verbose=verbose)
        from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference

        # allow_soft_placement lets TensorFlow pick the device automatically.
        config = tf.ConfigProto(allow_soft_placement=True)

        config_path = args.config_name
        init_checkpoint = args.ckpt_name
        logger.info('model config: %s' % config_path)

        # Load the BERT configuration file.
        with tf.gfile.GFile(config_path, 'r') as config_file:
            bert_config = modeling.BertConfig.from_dict(json.load(config_file))

        logger.info('build graph...')
        # input placeholders, not sure if they are friendly to XLA
        input_ids = tf.placeholder(tf.int32, (None, max_seq_len), 'input_ids')
        input_mask = tf.placeholder(tf.int32, (None, max_seq_len),
                                    'input_mask')
        input_type_ids = tf.placeholder(tf.int32, (None, max_seq_len),
                                        'input_type_ids')

        # Use the XLA JIT scope when requested, otherwise a no-op context.
        if args.xla:
            jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
        else:
            jit_scope = contextlib.suppress

        def minus_mask(x, m):
            # Push masked-out positions to a very negative value.
            return x - tf.expand_dims(1.0 - m, axis=-1) * 1e30

        def mul_mask(x, m):
            # Zero out masked-out positions.
            return x * tf.expand_dims(m, axis=-1)

        def masked_reduce_max(x, m):
            return tf.reduce_max(minus_mask(x, m), axis=1)

        def masked_reduce_mean(x, m):
            return tf.reduce_sum(mul_mask(x, m), axis=1) / (
                tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)

        with jit_scope():
            input_tensors = [input_ids, input_mask, input_type_ids]

            model = modeling.BertModel(config=bert_config,
                                       is_training=False,
                                       input_ids=input_ids,
                                       input_mask=input_mask,
                                       token_type_ids=input_type_ids,
                                       use_one_hot_embeddings=False)

            # Map every trainable variable onto the checkpoint.
            tvars = tf.trainable_variables()
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

            with tf.variable_scope("pooling"):
                layer_indexes = args.layer_indexes
                if len(layer_indexes) == 1:
                    # A single layer was requested: use it directly.
                    encoder_layer = model.all_encoder_layers[layer_indexes[0]]
                else:
                    # Several layers: concatenate them on the feature axis.
                    encoder_layer = tf.concat(
                        [model.all_encoder_layers[idx] for idx in layer_indexes],
                        -1)

                input_mask = tf.cast(input_mask, tf.float32)

                # Reduce the token vectors to the sentence representation.
                if pooling_strategy == PoolingStrategy.REDUCE_MEAN:
                    pooled = masked_reduce_mean(encoder_layer, input_mask)
                elif pooling_strategy == PoolingStrategy.REDUCE_MAX:
                    pooled = masked_reduce_max(encoder_layer, input_mask)
                elif pooling_strategy == PoolingStrategy.REDUCE_MEAN_MAX:
                    mean_part = masked_reduce_mean(encoder_layer, input_mask)
                    max_part = masked_reduce_max(encoder_layer, input_mask)
                    pooled = tf.concat([mean_part, max_part], axis=1)
                elif pooling_strategy in (PoolingStrategy.FIRST_TOKEN,
                                          PoolingStrategy.CLS_TOKEN):
                    pooled = tf.squeeze(encoder_layer[:, 0:1, :], axis=1)
                elif pooling_strategy in (PoolingStrategy.LAST_TOKEN,
                                          PoolingStrategy.SEP_TOKEN):
                    # Gather the vector at the last unmasked position.
                    seq_len = tf.cast(tf.reduce_sum(input_mask, axis=1),
                                      tf.int32)
                    rng = tf.range(0, tf.shape(seq_len)[0])
                    indexes = tf.stack([rng, seq_len - 1], 1)
                    pooled = tf.gather_nd(encoder_layer, indexes)
                elif pooling_strategy == PoolingStrategy.NONE:
                    pooled = mul_mask(encoder_layer, input_mask)
                else:
                    raise NotImplementedError()

            pooled = tf.identity(pooled, 'final_encodes')
            output_tensors = [pooled]
            tmp_g = tf.get_default_graph().as_graph_def()

        # Node names are the tensor names without the trailing ":0".
        input_names = [tensor.name[:-2] for tensor in input_tensors]
        output_names = [tensor.name[:-2] for tensor in output_tensors]

        # Freeze and optimize the graph.
        with tf.Session(config=config) as sess:
            logger.info('load parameters from checkpoint...')
            sess.run(tf.global_variables_initializer())
            logger.info('freeze...')
            tmp_g = tf.graph_util.convert_variables_to_constants(
                sess, tmp_g, output_names)
            dtypes = [tensor.dtype for tensor in input_tensors]
            logger.info('optimize...')
            tmp_g = optimize_for_inference(
                tmp_g, input_names, output_names,
                [dtype.as_datatype_enum for dtype in dtypes], False)

        tmp_file = "./tmp_graph11"
        logger.info('write graph to a tmp file: %s' % tmp_file)
        with tf.gfile.GFile(tmp_file, 'wb') as graph_file:
            graph_file.write(tmp_g.SerializeToString())
        return tmp_file
    except Exception as e:
        logger.error('fail to optimize the graph!')
        logger.error(e)
def create_classification_model(bert_config, is_training, input_ids,
                                input_mask, segment_ids, labels, num_labels):
    """Build a softmax classifier on top of BERT's pooled output.

    :param bert_config: configuration passed to ``modeling.BertModel``
    :param is_training: when truthy, dropout (keep_prob=0.9) is applied to
        the pooled output before the projection
    :param input_ids: passed to ``modeling.BertModel`` as ``input_ids``
    :param input_mask: passed to ``modeling.BertModel`` as ``input_mask``
    :param segment_ids: passed to ``modeling.BertModel`` as ``token_type_ids``
    :param labels: class ids, or ``None`` to skip the loss computation
    :param num_labels: number of classes
    :return: ``(loss, per_example_loss, logits, probabilities)``; ``loss``
        and ``per_example_loss`` are ``None`` when ``labels`` is ``None``
    """
    import tensorflow as tf
    from bert_base.bert import modeling

    # Encode the inputs with BERT.
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
    )

    output_layer = model.get_pooled_output()
    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        if labels is not None:
            one_hot_labels = tf.one_hot(labels,
                                        depth=num_labels,
                                        dtype=tf.float32)
            # Negative log-likelihood of the gold class for each example.
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            loss = tf.reduce_mean(per_example_loss)
        else:
            loss, per_example_loss = None, None

    return (loss, per_example_loss, logits, probabilities)
def _create_model(self, mode, input_ids, input_mask, segment_ids, labels,
                  slot_labels, labels_mask, drop_keep_prob, entity_type_ids,
                  sequence_lengths):
    """Creates a LaserTagger model.

    Builds BERT, adds a weighted entity-type embedding to the sequence
    output, runs a bidirectional LSTM over it and attaches two projections:
    an intent head on the concatenated final cell states and a slot head on
    the layer-normalized per-token LSTM outputs.

    Returns:
        A 10-tuple ``(loss, per_example_slot_loss, pred_intent, pred_slot,
        batch_size, entity_type_emb, impact_weight_matrix,
        entity_type_ids_matrix, final_layer, slot_logits)``. ``loss`` and
        ``per_example_slot_loss`` are ``None`` in PREDICT mode.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = modeling.BertModel(
        config=self._config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=self._use_one_hot_embeddings)

    final_layer = model.get_sequence_output()
    # final_hidden = model.get_pooled_output()

    if is_training:
        # I.e., 0.1 dropout
        # final_hidden = tf.nn.dropout(final_hidden, keep_prob=drop_keep_prob)
        final_layer = tf.nn.dropout(final_layer, keep_prob=drop_keep_prob)

    # Combine entity information with the token representations.
    batch_size, seq_length = modeling.get_shape_list(input_ids)

    self.entity_type_embedding = tf.get_variable(
        name="entity_type_embedding",
        shape=(self.entity_type_num, self._config.hidden_size),
        dtype=tf.float32,
        trainable=True,
        initializer=tf.random_uniform_initializer(
            -self._config.initializer_range * 100,
            self._config.initializer_range * 100,
            seed=20))

    with tf.init_scope():
        # Every entity type starts with the same weight, 1 / entity_type_num.
        impact_weight_init = tf.constant(1.0 / self.entity_type_num,
                                         dtype=tf.float32,
                                         shape=(1, self.entity_type_num))
    # Learnable impact weight of each entity type.
    self.impact_weight = tf.Variable(impact_weight_init,
                                     dtype=tf.float32,
                                     name="impact_weight")
    # Repeat the weight row once per token in the flattened batch.
    impact_weight_matrix = tf.tile(self.impact_weight,
                                   multiples=[batch_size * seq_length, 1])

    # NOTE(review): the reshape implies entity_type_ids holds one
    # entity_type_num-wide indicator vector per token - confirm with caller.
    entity_type_ids_matrix1 = tf.cast(tf.reshape(
        entity_type_ids, [batch_size * seq_length, self.entity_type_num]),
                                      dtype=tf.float32)
    entity_type_ids_matrix = tf.multiply(entity_type_ids_matrix1,
                                         impact_weight_matrix)
    entity_type_emb = tf.matmul(entity_type_ids_matrix,
                                self.entity_type_embedding)
    final_layer = final_layer + tf.reshape(entity_type_emb, [
        batch_size, seq_length, self._config.hidden_size
    ])
    # TODO: 0.7071067811865476 is sqrt(2) / 2
    # final_layer = tf.concat([final_layer, tf.reshape(entity_type_emb, [batch_size, seq_length,self._config.hidden_size])], axis=-1)

    if is_training:
        # Second dropout, applied after the entity embedding has been added.
        final_layer = tf.nn.dropout(final_layer, keep_prob=drop_keep_prob)

    (output_fw_seq,
     output_bw_seq), ((c_fw, h_fw),
                      (c_bw, h_bw)) = tf.nn.bidirectional_dynamic_rnn(
                          cell_fw=LSTMCell(self.lstm_hidden_size),
                          cell_bw=LSTMCell(self.lstm_hidden_size),
                          inputs=final_layer,
                          sequence_length=sequence_lengths,
                          dtype=tf.float32)
    # Per-token features for the slot head: forward and backward outputs.
    layer_matrix = tf.concat([output_fw_seq, output_bw_seq], axis=-1)
    # Sentence-level features for the intent head: final cell states.
    final_hidden = tf.concat([c_fw, c_bw], axis=-1)

    layer_matrix = tf.contrib.layers.layer_norm(inputs=layer_matrix,
                                                begin_norm_axis=-1,
                                                begin_params_axis=-1)

    intent_logits = tf.layers.dense(
        final_hidden,
        self._num_tags,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
        name="output_projection")
    slot_logits = tf.layers.dense(
        layer_matrix,
        self.num_slot_tags,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
        name="slot_projection")

    with tf.variable_scope("loss"):
        loss = None
        per_example_intent_loss = None
        per_example_slot_loss = None
        if mode != tf.estimator.ModeKeys.PREDICT:
            per_example_intent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=intent_logits)
            slot_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=slot_labels, logits=slot_logits)
            # NOTE(review): slot_loss is summed over every position and only
            # the divisor uses labels_mask, so positions with mask 0 still
            # contribute to the numerator - confirm this is intended.
            per_example_slot_loss = tf.truediv(
                tf.reduce_sum(slot_loss, axis=1),
                tf.cast(tf.reduce_sum(labels_mask, axis=1), tf.float32))
            # A CRF-based slot loss (tensorflow.contrib.crf.crf_log_likelihood)
            # was tried here and is currently disabled.
            # Weighted combination of the intent and slot losses.
            loss = tf.reduce_mean(self.intent_ratio * per_example_intent_loss +
                                  self.slot_ratio * per_example_slot_loss)
        pred_intent = tf.cast(tf.argmax(intent_logits, axis=-1), tf.int32)
        pred_slot = tf.cast(tf.argmax(slot_logits, axis=-1), tf.int32)
        return (loss, per_example_slot_loss, pred_intent, pred_slot,
                batch_size, entity_type_emb, impact_weight_matrix,
                entity_type_ids_matrix, final_layer, slot_logits)
def get_mention_proposal_and_loss(self, instance, is_training, use_tpu=False):
    """Forward function for training the mention proposal module.

    Args:
        instance: a tuple of train/dev/test data, unpacked as
            (flat_input_ids, flat_doc_overlap_input_mask, flat_sentence_map,
            text_len, speaker_ids, gold_starts, gold_ends, cluster_ids).
            cluster_ids and speaker_ids are not used by this model.
        is_training: whether the graph is built for training; forwarded to
            BERT and used to select the dropout value.
        use_tpu: stored on ``self.use_tpu`` and forwarded to
            ``boolean_mask_1d``.

    Returns:
        A tuple ``(total_loss, start_sequence_probabilities,
        end_sequence_probabilities, span_sequence_probabilities)`` where
        ``total_loss`` is the start/end/span losses weighted by
        ``config.loss_start_ratio``, ``config.loss_end_ratio`` and
        ``config.loss_span_ratio``.
    """
    self.use_tpu = use_tpu
    self.dropout = self.get_dropout(self.config.dropout_rate, is_training)

    (flat_input_ids, flat_doc_overlap_input_mask, flat_sentence_map, text_len,
     speaker_ids, gold_starts, gold_ends, cluster_ids) = instance
    # flat_input_ids: (num_window, window_size)
    # flat_doc_overlap_input_mask: (num_window, window_size)
    # flat_sentence_map: (num_window, window_size)
    # text_len: dynamic length, padded to a fixed length
    # gold_starts: (max_num_mention), mention start index in the original
    #   (NON-OVERLAP) document, padded with -1 up to max_num_mention.
    # gold_ends: (max_num_mention), mention end index in the original
    #   (NON-OVERLAP) document, padded with -1 up to max_num_mention.

    # Clamp negative (padding) entries to zero.
    flat_input_ids = tf.math.maximum(
        flat_input_ids,
        tf.zeros_like(flat_input_ids, tf.int32))  # (num_window * window_size)

    # Binarize the overlap mask: entries >= 0 become 1, negatives become 0.
    flat_doc_overlap_input_mask = tf.where(
        tf.math.greater_equal(flat_doc_overlap_input_mask, 0),
        x=tf.ones_like(flat_doc_overlap_input_mask, tf.int32),
        y=tf.zeros_like(flat_doc_overlap_input_mask,
                        tf.int32))  # (num_window * window_size)
    flat_sentence_map = tf.math.maximum(
        flat_sentence_map,
        tf.zeros_like(flat_sentence_map, tf.int32))  # (num_window * window_size)

    # Keep only the real (non-negative) gold mention indices.
    gold_start_end_mask = tf.cast(
        tf.math.greater_equal(gold_starts,
                              tf.zeros_like(gold_starts, tf.int32)),
        tf.bool)  # (max_num_mention)
    gold_start_index_labels = self.boolean_mask_1d(
        gold_starts,
        gold_start_end_mask,
        name_scope="gold_starts",
        use_tpu=self.use_tpu)  # (num_of_mention)
    gold_end_index_labels = self.boolean_mask_1d(
        gold_ends,
        gold_start_end_mask,
        name_scope="gold_ends",
        use_tpu=self.use_tpu)  # (num_of_mention)

    text_len = tf.math.maximum(text_len, tf.zeros_like(
        text_len, tf.int32))  # (num_of_non_empty_window)
    num_subtoken_in_doc = tf.math.reduce_sum(
        text_len)  # the value should be num_subtoken_in_doc

    input_ids = tf.reshape(
        flat_input_ids,
        [-1, self.config.window_size])  # (num_window, window_size)
    # BERT attends to every position of every window.
    input_mask = tf.ones_like(input_ids, tf.int32)  # (num_window, window_size)

    model = modeling.BertModel(config=self.bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               use_one_hot_embeddings=False,
                               scope='bert')

    doc_overlap_window_embs = model.get_sequence_output(
    )  # (num_window, window_size, hidden_size)
    doc_overlap_input_mask = tf.reshape(
        flat_doc_overlap_input_mask,
        [self.config.num_window, self.config.window_size
         ])  # (num_window, window_size)

    doc_flat_embs = self.transform_overlap_windows_to_original_doc(
        doc_overlap_window_embs, doc_overlap_input_mask)
    doc_flat_embs = tf.reshape(doc_flat_embs, [-1, self.config.hidden_size
                                               ])  # (num_subtoken_in_doc, hidden_size)

    # Pair every start sub-token with every end sub-token.
    expand_start_embs = tf.tile(
        tf.expand_dims(doc_flat_embs, 1), [1, num_subtoken_in_doc, 1
                                           ])  # (num_subtoken_in_doc, num_subtoken_in_doc, hidden_size)
    expand_end_embs = tf.tile(
        tf.expand_dims(doc_flat_embs, 0), [num_subtoken_in_doc, 1, 1
                                           ])  # (num_subtoken_in_doc, num_subtoken_in_doc, hidden_size)
    expand_mention_span_embs = tf.concat(
        [expand_start_embs, expand_end_embs], axis=-1
    )  # (num_subtoken_in_doc, num_subtoken_in_doc, 2*hidden_size)
    expand_mention_span_embs = tf.reshape(
        expand_mention_span_embs, [-1, self.config.hidden_size * 2])
    span_sequence_logits = self.ffnn(
        expand_mention_span_embs,
        self.config.hidden_size * 2,
        1,
        dropout=self.dropout,
        name_scope="mention_span"
    )  # (num_subtoken_in_doc * num_subtoken_in_doc)

    if self.config.start_end_share:
        start_end_sequence_logits = self.ffnn(
            doc_flat_embs,
            self.config.hidden_size,
            2,
            dropout=self.dropout,
            name_scope="mention_start_end")  # (num_subtoken_in_doc, 2)
        # tf.split requires the number of splits; the shared head has two
        # output columns, one for start and one for end.
        start_sequence_logits, end_sequence_logits = tf.split(
            start_end_sequence_logits, 2, axis=1)
        # start_sequence_logits -> (num_subtoken_in_doc, 1)
        # end_sequence_logits -> (num_subtoken_in_doc, 1)
    else:
        start_sequence_logits = self.ffnn(
            doc_flat_embs,
            self.config.hidden_size,
            1,
            dropout=self.dropout,
            name_scope="mention_start")  # (num_subtoken_in_doc)
        end_sequence_logits = self.ffnn(
            doc_flat_embs,
            self.config.hidden_size,
            1,
            dropout=self.dropout,
            name_scope="mention_end")  # (num_subtoken_in_doc)

    gold_start_sequence_labels = self.scatter_gold_index_to_label_sequence(
        gold_start_index_labels, num_subtoken_in_doc)  # (num_subtoken_in_doc)
    gold_end_sequence_labels = self.scatter_gold_index_to_label_sequence(
        gold_end_index_labels, num_subtoken_in_doc)  # (num_subtoken_in_doc)

    start_loss, start_sequence_probabilities = self.compute_score_and_loss(
        start_sequence_logits, gold_start_sequence_labels)
    end_loss, end_sequence_probabilities = self.compute_score_and_loss(
        end_sequence_logits, gold_end_sequence_labels)
    # *_loss -> a scalar
    # *_sequence_probabilities -> (num_subtoken_in_doc)

    gold_span_sequence_labels = self.scatter_span_sequence_labels(
        gold_start_index_labels, gold_end_index_labels,
        num_subtoken_in_doc)  # (num_subtoken_in_doc * num_subtoken_in_doc)
    span_loss, span_sequence_probabilities = self.compute_score_and_loss(
        span_sequence_logits, gold_span_sequence_labels)
    # span_loss -> a scalar
    # span_sequence_probabilities -> (num_subtoken_in_doc * num_subtoken_in_doc)

    total_loss = (self.config.loss_start_ratio * start_loss +
                  self.config.loss_end_ratio * end_loss +
                  self.config.loss_span_ratio * span_loss)
    return (total_loss, start_sequence_probabilities,
            end_sequence_probabilities, span_sequence_probabilities)
input_mask = tf.placeholder(shape=[batch_size, max_seq_length], dtype=tf.int32, name="input_mask") segment_ids = tf.placeholder(shape=[batch_size, max_seq_length], dtype=tf.int32, name="segment_ids") ### input_labels = tf.placeholder(shape=batch_size, dtype=tf.int32, name="input_ids") # 创建bert模型 model = modeling.BertModel( config=bert_config, is_training=True, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings= False # 这里如果使用TPU 设置为True,速度会快些。使用CPU 或GPU 设置为False ,速度会快些。 ) output_layer = model.get_sequence_output( ) # 这个获取每个token的output 输入数据[batch_size, seq_length, embedding_size] 如果做seq2seq 或者ner 用这个 output_layer = model.get_pooled_output() # 这个获取句子的output hidden_size = output_layer.shape[-1].value #获取输出的维度 # 后面增加一个全连接 with tf.variable_scope('Last_Full'): logits = tf.layers.dense(output_layer, 2) loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=input_labels,
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 span_encoding, max_answer_length, use_one_hot_embeddings):
    """Build the BERT span-prediction and answer-type heads.

    ``span_encoding`` selects the span head: ``"independent"`` projects each
    token to separate start and end logits, ``"concat-mlp"`` scores spans
    jointly and returns a dummy zero tensor for the end logits. Any other
    value raises ``ValueError``.

    Returns:
        A tuple ``(start_logits, end_logits, answer_type_logits)``.
    """
    bert = modeling.BertModel(config=bert_config,
                              is_training=is_training,
                              input_ids=input_ids,
                              input_mask=input_mask,
                              token_type_ids=segment_ids,
                              use_one_hot_embeddings=use_one_hot_embeddings)

    # Token-level representations used for the start and end predictions.
    final_hidden = bert.get_sequence_output()
    shape = modeling.get_shape_list(final_hidden, expected_rank=3)
    batch_size, seq_length, hidden_size = shape[0], shape[1], shape[2]

    if span_encoding == "independent":
        span_weights = tf.get_variable(
            "cls/coqa/output_weights", [2, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        span_bias = tf.get_variable("cls/coqa/output_bias", [2],
                                    initializer=tf.zeros_initializer())

        # Flatten tokens, project to two scores, restore the batch layout.
        flat_hidden = tf.reshape(final_hidden,
                                 [batch_size * seq_length, hidden_size])
        flat_logits = tf.nn.bias_add(
            tf.matmul(flat_hidden, span_weights, transpose_b=True), span_bias)
        span_logits = tf.reshape(flat_logits, [batch_size, seq_length, 2])
        start_logits, end_logits = tf.unstack(span_logits, axis=2)
    elif span_encoding == "concat-mlp":
        with tf.variable_scope("coqa"):
            if is_training:
                # The batch size can be variable during inference.
                final_hidden.shape.assert_is_compatible_with(
                    (batch_size, seq_length, hidden_size))
            joint_logits = compute_joint_mlp_logits(final_hidden,
                                                    max_answer_length)
            start_logits = mask_joint_logits(input_mask, joint_logits)
            end_logits = tf.zeros([batch_size], dtype=tf.float32)  # dummy
    else:
        raise ValueError("Unknown span_encoding: %s" % span_encoding)

    # Answer-type prediction from the pooled sentence representation.
    # TODO(epitler): Try variants here.
    pooled = bert.get_pooled_output()
    pooled_size = pooled.shape[-1].value

    num_answer_types = 5  # YES, NO, UNKNOWN, EXTRACTIVE, ABSTRACTIVE
    type_weights = tf.get_variable(
        "answer_type_output_weights", [num_answer_types, pooled_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    type_bias = tf.get_variable("answer_type_output_bias",
                                [num_answer_types],
                                initializer=tf.zeros_initializer())

    answer_type_logits = tf.nn.bias_add(
        tf.matmul(pooled, type_weights, transpose_b=True), type_bias)

    return (start_logits, end_logits, answer_type_logits)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, event_type_mask, trigger_mask, role_mask, num_labels,
                 use_one_hot_embeddings):
    """Build a BERT sequence tagger conditioned on event-type and role spans.

    The per-token BERT output is concatenated with the mean-pooled
    representation of the tokens selected by ``event_type_mask`` and by
    ``role_mask`` (each tiled to ``FLAGS.max_seq_length``). Depending on
    ``FLAGS.add_crf`` the tags are decoded with a CRF or with a per-token
    softmax. ``trigger_mask`` is accepted but currently unused.

    Returns:
        A tuple ``(loss, pred_ids)``.

    Raises:
        NotImplementedError: if ``FLAGS.add_crf`` is false and
            ``FLAGS.add_lstm`` is true; that variant has no implementation.
    """
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings
    )

    # batch * seq_len * hidden_size
    sent_features = model.get_sequence_output()

    # Event type features: mean pooling over the tokens in event_type_mask.
    event_type_mask = tf.cast(event_type_mask, tf.float32)
    event_type_len = tf.reduce_sum(event_type_mask, axis=-1, keep_dims=True)
    event_type_features = tf.einsum("blh,bl->bh", sent_features, event_type_mask) / event_type_len
    event_type_features = tf.tile(event_type_features[:, None], [1, FLAGS.max_seq_length, 1])

    # Role features: mean pooling over the tokens in role_mask.
    role_mask = tf.cast(role_mask, tf.float32)
    role_len = tf.reduce_sum(role_mask, axis=-1, keep_dims=True)
    role_features = tf.einsum("blh,bl->bh", sent_features, role_mask) / role_len
    role_features = tf.tile(role_features[:, None], [1, FLAGS.max_seq_length, 1])

    final_input = tf.concat([sent_features, event_type_features, role_features], axis=-1)

    if FLAGS.add_crf:
        logits = tf.layers.dense(final_input, num_labels,
                                 kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
                                 name="dense_layer")
        trans = tf.get_variable(
            "transitions",
            [num_labels, num_labels],
            initializer=initializers.xavier_initializer()
        )
        sequence_lengths = tf.cast(tf.reduce_sum(input_mask, axis=-1), tf.int32)
        log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
            inputs=logits,
            tag_indices=labels,
            transition_params=trans,
            sequence_lengths=sequence_lengths
        )
        loss = tf.reduce_mean(-log_likelihood)
        pred_ids, _ = tf.contrib.crf.crf_decode(potentials=logits,
                                                transition_params=trans,
                                                sequence_length=sequence_lengths)
        return loss, pred_ids
    elif FLAGS.add_lstm:
        # Previously this branch fell through and returned None, which made
        # callers fail later while unpacking (loss, pred_ids).
        raise NotImplementedError("FLAGS.add_lstm is not implemented")
    else:
        input_mask = tf.cast(input_mask, dtype=tf.float32)
        # dense layer
        mlp = tf.layers.dense(final_input, 768//2,
                              kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
                              name='mlp', activation=tf.nn.tanh)
        # shape of logits, batch * seq_len * num_labels
        logits = tf.layers.dense(mlp, num_labels,
                                 kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
                                 name="dense_layer")
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        # tf.losses.softmax_cross_entropy returns an already-reduced scalar,
        # so the mask has to be supplied as `weights`; multiplying the scalar
        # by the mask afterwards would leave padding tokens in the average.
        # With the default reduction the result is the weighted sum divided
        # by the number of non-zero weights, i.e. the mean over real tokens.
        loss = tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                               logits=logits,
                                               weights=input_mask)
        probabilities = tf.math.softmax(logits, axis=-1)
        pred_ids = tf.math.argmax(probabilities, axis=-1)
        return loss, pred_ids
def get_predictions_and_loss(self, input_ids, input_mask, text_len, speaker_ids, genre, is_training, gold_starts, gold_ends, cluster_ids, sentence_map):
    """Build the coreference graph and its loss.

    Encodes the segments with BERT, enumerates candidate spans of up to
    ``self.max_span_width`` words that stay inside one sentence, keeps the
    top-scoring spans, scores antecedents for each kept span (optionally
    refining span embeddings for ``config["coref_depth"]`` rounds) and
    computes the loss via ``self.softmax_loss``.

    Returns:
        A two-element tuple: the list ``[candidate_starts, candidate_ends,
        candidate_mention_scores, top_span_starts, top_span_ends,
        top_antecedents, top_antecedent_scores]`` and the scalar loss.
    """
    model = modeling.BertModel(
        config=self.bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        use_one_hot_embeddings=False,
        scope='bert')
    all_encoder_layers = model.get_all_encoder_layers()
    mention_doc = model.get_sequence_output()  # [batch_size, seq_length, hidden_size]

    self.dropout = self.get_dropout(self.config["dropout_rate"], is_training)

    num_sentences = tf.shape(mention_doc)[0]
    max_sentence_length = tf.shape(mention_doc)[1]
    mention_doc = self.flatten_emb_by_sentence(mention_doc, input_mask)  # [num_words, hidden_size]
    num_words = util.shape(mention_doc, 0)
    antecedent_doc = mention_doc

    flattened_sentence_indices = sentence_map
    # Enumerate every (start, start + width) pair for width < max_span_width.
    candidate_starts = tf.tile(tf.expand_dims(tf.range(num_words), 1), [1, self.max_span_width])  # [num_words, max_span_width]
    candidate_ends = candidate_starts + tf.expand_dims(tf.range(self.max_span_width), 0)  # [num_words, max_span_width]
    candidate_start_sentence_indices = tf.gather(flattened_sentence_indices, candidate_starts)  # [num_words, max_span_width]
    # Ends are clamped to the last word only for the lookup; spans running
    # past the document are removed by the mask below.
    candidate_end_sentence_indices = tf.gather(flattened_sentence_indices, tf.minimum(candidate_ends, num_words - 1))  # [num_words, max_span_width]
    # Keep spans that end inside the document and do not cross a sentence.
    candidate_mask = tf.logical_and(candidate_ends < num_words, tf.equal(candidate_start_sentence_indices, candidate_end_sentence_indices))  # [num_words, max_span_width]
    flattened_candidate_mask = tf.reshape(candidate_mask, [-1])  # [num_words * max_span_width]
    candidate_starts = tf.boolean_mask(tf.reshape(candidate_starts, [-1]), flattened_candidate_mask)  # [num_candidates]
    candidate_ends = tf.boolean_mask(tf.reshape(candidate_ends, [-1]), flattened_candidate_mask)  # [num_candidates]
    candidate_sentence_indices = tf.boolean_mask(tf.reshape(candidate_start_sentence_indices, [-1]), flattened_candidate_mask)  # [num_candidates]

    candidate_cluster_ids = self.get_candidate_labels(candidate_starts, candidate_ends, gold_starts, gold_ends, cluster_ids)  # [num_candidates]

    candidate_span_emb = self.get_span_emb(mention_doc, mention_doc, candidate_starts, candidate_ends)  # [num_candidates, emb]
    candidate_mention_scores = self.get_mention_scores(candidate_span_emb, candidate_starts, candidate_ends)
    candidate_mention_scores = tf.squeeze(candidate_mention_scores, 1)  # [k]

    # beam size: a fraction of the document length, capped at 3900
    k = tf.minimum(3900, tf.to_int32(tf.floor(tf.to_float(num_words) * self.config["top_span_ratio"])))
    c = tf.minimum(self.config["max_top_antecedents"], k)
    # pull from beam
    top_span_indices = coref_ops.extract_spans(tf.expand_dims(candidate_mention_scores, 0),
                                               tf.expand_dims(candidate_starts, 0),
                                               tf.expand_dims(candidate_ends, 0),
                                               tf.expand_dims(k, 0),
                                               num_words,
                                               True)  # [1, k]
    top_span_indices.set_shape([1, None])
    top_span_indices = tf.squeeze(top_span_indices, 0)  # [k]

    top_span_starts = tf.gather(candidate_starts, top_span_indices)  # [k]
    top_span_ends = tf.gather(candidate_ends, top_span_indices)  # [k]
    top_span_emb = tf.gather(candidate_span_emb, top_span_indices)  # [k, emb]
    top_span_cluster_ids = tf.gather(candidate_cluster_ids, top_span_indices)  # [k]
    top_span_mention_scores = tf.gather(candidate_mention_scores, top_span_indices)  # [k]
    genre_emb = tf.gather(tf.get_variable("genre_embeddings", [len(self.genres), self.config["feature_size"]],
                                          initializer=tf.truncated_normal_initializer(stddev=0.02)),
                          genre)  # [emb]
    if self.config['use_metadata']:
        speaker_ids = self.flatten_emb_by_sentence(speaker_ids, input_mask)
        top_span_speaker_ids = tf.gather(speaker_ids, top_span_starts)  # [k]
    else:
        top_span_speaker_ids = None

    # Score column for the "no antecedent" option.
    dummy_scores = tf.zeros([k, 1])  # [k, 1]
    top_antecedents, top_antecedents_mask, top_fast_antecedent_scores, top_antecedent_offsets = self.coarse_to_fine_pruning(top_span_emb, top_span_mention_scores, c)
    # Segment index of every word, used for the segment-distance feature.
    num_segs, seg_len = util.shape(input_ids, 0), util.shape(input_ids, 1)
    word_segments = tf.tile(tf.expand_dims(tf.range(0, num_segs), 1), [1, seg_len])
    flat_word_segments = tf.boolean_mask(tf.reshape(word_segments, [-1]), tf.reshape(input_mask, [-1]))
    mention_segments = tf.expand_dims(tf.gather(flat_word_segments, top_span_starts), 1)  # [k, 1]
    antecedent_segments = tf.gather(flat_word_segments, tf.gather(top_span_starts, top_antecedents))  # [k, c]
    segment_distance = tf.clip_by_value(mention_segments - antecedent_segments, 0, self.config['max_training_sentences'] - 1) if self.config['use_segment_distance'] else None  # [k, c]
    if self.config['fine_grained']:
        for i in range(self.config["coref_depth"]):
            # Variables are created on the first round and reused afterwards.
            with tf.variable_scope("coref_layer", reuse=(i > 0)):
                top_antecedent_emb = tf.gather(top_span_emb, top_antecedents)  # [k, c, emb]
                top_antecedent_scores = top_fast_antecedent_scores + self.get_slow_antecedent_scores(top_span_emb, top_antecedents, top_antecedent_emb, top_antecedent_offsets, top_span_speaker_ids, genre_emb, segment_distance)  # [k, c]
                top_antecedent_weights = tf.nn.softmax(tf.concat([dummy_scores, top_antecedent_scores], 1))  # [k, c + 1]
                top_antecedent_emb = tf.concat([tf.expand_dims(top_span_emb, 1), top_antecedent_emb], 1)  # [k, c + 1, emb]
                attended_span_emb = tf.reduce_sum(tf.expand_dims(top_antecedent_weights, 2) * top_antecedent_emb, 1)  # [k, emb]
                with tf.variable_scope("f"):
                    # Gate interpolating between the attended and the
                    # current span embedding.
                    f = tf.sigmoid(util.projection(tf.concat([top_span_emb, attended_span_emb], 1), util.shape(top_span_emb, -1)))  # [k, emb]
                    top_span_emb = f * attended_span_emb + (1 - f) * top_span_emb  # [k, emb]
    else:
        top_antecedent_scores = top_fast_antecedent_scores

    top_antecedent_scores = tf.concat([dummy_scores, top_antecedent_scores], 1)  # [k, c + 1]

    top_antecedent_cluster_ids = tf.gather(top_span_cluster_ids, top_antecedents)  # [k, c]
    # log(1) = 0 leaves valid entries unchanged; log(0) = -inf turns masked
    # entries into a value that should not match any cluster id below.
    top_antecedent_cluster_ids += tf.to_int32(tf.log(tf.to_float(top_antecedents_mask)))  # [k, c]
    same_cluster_indicator = tf.equal(top_antecedent_cluster_ids, tf.expand_dims(top_span_cluster_ids, 1))  # [k, c]
    non_dummy_indicator = tf.expand_dims(top_span_cluster_ids > 0, 1)  # [k, 1]
    pairwise_labels = tf.logical_and(same_cluster_indicator, non_dummy_indicator)  # [k, c]
    # The dummy label is gold when the span has no gold antecedent.
    dummy_labels = tf.logical_not(tf.reduce_any(pairwise_labels, 1, keepdims=True))  # [k, 1]
    top_antecedent_labels = tf.concat([dummy_labels, pairwise_labels], 1)  # [k, c + 1]
    loss = self.softmax_loss(top_antecedent_scores, top_antecedent_labels)  # [k]
    loss = tf.reduce_sum(loss)  # []

    return [candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores], loss
def create_model(is_training=True, ):
    """Build a placeholder-fed BERT sentence classifier graph.

    A `modeling.BertModel` is created from the module-level `BERT_CONFIG`;
    its pooled output goes through dropout and one linear layer, and loss,
    optimizer, accuracy and summary ops are added on top.

    :param is_training: forwarded unchanged to `modeling.BertModel`.
    :return: tuple `(inputParams, outputParams, summaryParams)` of dicts
        holding the input placeholders, the loss / prediction / accuracy /
        train ops, and the merged summary op.
    """
    # region model hyper-parameters
    max_seq_len = 256
    num_classes = 2
    learning_rate = 0.01

    # NOTE(review): the nesting of the `with` scopes below was reconstructed;
    # confirm op names such as `y_pred` against whatever fetches them by name.

    # BERT inputs.
    input_ids = tf.placeholder(shape=[None, max_seq_len],
                               dtype=tf.int32,
                               name="input_ids")
    input_mask = tf.placeholder(shape=[None, max_seq_len],
                                dtype=tf.int32,
                                name="input_mask")
    segment_ids = tf.placeholder(shape=[None, max_seq_len],
                                 dtype=tf.int32,
                                 name="segment_ids")
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    # Fed at run time and handed to the optimizer below.
    num_train_steps = tf.placeholder(tf.float32, name='num_train_steps')
    input_labels = tf.placeholder(shape=[None, ],
                                  dtype=tf.int32,
                                  name="input_labels")

    # BERT encoder.
    model = modeling.BertModel(
        config=BERT_CONFIG,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        # Original author's note: True is faster on TPU, False is faster on
        # CPU or GPU.
        use_one_hot_embeddings=False)

    # Sentence-level output; use model.get_sequence_output() instead for
    # per-token tasks such as seq2seq or NER.
    output_layer = model.get_pooled_output()
    hidden_size = output_layer.shape[-1].value  # width of the pooled output

    # Fully connected classification layer.
    with tf.variable_scope("fc1"):
        output_layer = tf.nn.dropout(output_layer, keep_prob=keep_prob)
        fc1_weights = tf.get_variable(shape=[num_classes, hidden_size],
                                      dtype=tf.float32,
                                      initializer=tf.initializers.he_normal(),
                                      name="fc1")
        bias1 = tf.Variable(tf.zeros(shape=[num_classes, ]), name='bias1')
        logits = tf.matmul(output_layer, fc1_weights,
                           transpose_b=True) + bias1

    # Predicted class.
    y_pred_cls = tf.argmax(tf.nn.softmax(logits), 1, name='y_pred')

    with tf.variable_scope("optimize"):
        # One-hot depth follows num_classes so the two cannot drift apart.
        one_hot_labels = tf.one_hot(input_labels,
                                    depth=num_classes,
                                    dtype=tf.float32)
        # Cross-entropy loss.
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=one_hot_labels)
        loss = tf.reduce_mean(cross_entropy)
        # Optimizer.
        train_op = optimization.create_optimizer(
            loss,
            init_lr=learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=None,
            use_tpu=False)

    with tf.variable_scope("accuracy"):
        correct_pred = tf.equal(input_labels,
                                tf.cast(y_pred_cls, dtype=tf.int32))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, dtype=tf.float32),
                                  name="accuracy")

    with tf.name_scope("summary"):
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("accuracy", accuracy)
        merged_summary = tf.summary.merge_all()

    inputParams = {
        'input_ids': input_ids,
        'input_mask': input_mask,
        'segment_ids': segment_ids,
        'input_labels': input_labels,
        'keep_prob': keep_prob,
        'num_train_steps': num_train_steps
    }
    outputParams = {
        'loss': loss,
        'y_pred_cls': y_pred_cls,
        'accuracy': accuracy,
        'train_op': train_op,
    }
    summaryParams = {'merged_summary': merged_summary}
    return inputParams, outputParams, summaryParams
def create_model(bert_config, input_ids, input_masks, segment_ids,
                 token_label_ids, sent_label_ids, token_label_list,
                 sent_label_list, mode, use_tpu):
    """Creates a NLU model with a token-level and a sentence-level head.

    Both heads are a dense layer (plus dropout in TRAIN mode) on top of a
    shared `modeling.BertModel`; masked positions have their logits replaced
    by `MIN_FLOAT`.

    Args:
        bert_config: configuration passed to `modeling.BertModel`.
        input_ids, input_masks, segment_ids: BERT inputs.
        token_label_ids: token labels, or None to skip the token loss.
        sent_label_ids: sentence labels, or None to skip the sentence loss.
        token_label_list: its length is the number of token classes.
        sent_label_list: its length is the number of sentence classes.
        mode: a `tf.estimator.ModeKeys` value.
        use_tpu: forwarded as `use_one_hot_embeddings`.

    Returns:
        `(loss, token_predict_ids, sent_predict_ids)`. Outside TRAIN/EVAL the
        loss is the constant 0.0.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_masks,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_tpu)

    # If you want to use sentence-level output, use model.get_pooled_output()
    # If you want to use token-level output, use model.get_sequence_output()
    with tf.variable_scope("token", reuse=tf.AUTO_REUSE):
        token_result = model.get_sequence_output()
        token_result_mask = tf.cast(tf.expand_dims(input_masks, axis=-1),
                                    dtype=tf.float32)
        token_kernel_initializer = tf.glorot_uniform_initializer(
            seed=np.random.randint(10000), dtype=tf.float32)
        token_bias_initializer = tf.zeros_initializer
        token_dense_layer = tf.keras.layers.Dense(
            units=len(token_label_list),
            activation=None,
            use_bias=True,
            kernel_initializer=token_kernel_initializer,
            bias_initializer=token_bias_initializer,
            kernel_regularizer=None,
            bias_regularizer=None,
            trainable=True)
        token_dropout_layer = tf.keras.layers.Dropout(
            rate=0.1, seed=np.random.randint(10000))

        token_result = token_dense_layer(token_result)
        if mode == tf.estimator.ModeKeys.TRAIN:
            token_result = token_dropout_layer(token_result)

        # Keep logits where the mask is 1, overwrite them with MIN_FLOAT
        # where it is 0.
        masked_token_predict = token_result * token_result_mask + MIN_FLOAT * (
            1 - token_result_mask)
        token_predict_ids = tf.cast(tf.argmax(tf.nn.softmax(
            masked_token_predict, axis=-1), axis=-1), dtype=tf.int32)

    with tf.variable_scope("sent", reuse=tf.AUTO_REUSE):
        sent_result = model.get_pooled_output()
        # 1.0 for examples with at least one unmasked position, else 0.0.
        sent_result_mask = tf.cast(tf.reduce_max(input_masks,
                                                 axis=-1,
                                                 keepdims=True),
                                   dtype=tf.float32)
        sent_kernel_initializer = tf.glorot_uniform_initializer(
            seed=np.random.randint(10000), dtype=tf.float32)
        sent_bias_initializer = tf.zeros_initializer
        sent_dense_layer = tf.keras.layers.Dense(
            units=len(sent_label_list),
            activation=None,
            use_bias=True,
            kernel_initializer=sent_kernel_initializer,
            bias_initializer=sent_bias_initializer,
            kernel_regularizer=None,
            bias_regularizer=None,
            trainable=True)
        sent_dropout_layer = tf.keras.layers.Dropout(
            rate=0.1, seed=np.random.randint(10000))

        sent_result = sent_dense_layer(sent_result)
        if mode == tf.estimator.ModeKeys.TRAIN:
            sent_result = sent_dropout_layer(sent_result)

        masked_sent_predict = sent_result * sent_result_mask + MIN_FLOAT * (
            1 - sent_result_mask)
        sent_predict_ids = tf.cast(tf.argmax(tf.nn.softmax(
            masked_sent_predict, axis=-1), axis=-1), dtype=tf.int32)

    loss = tf.constant(0.0, dtype=tf.float32)
    if mode not in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL]:
        return loss, token_predict_ids, sent_predict_ids

    if token_label_ids is not None:
        with tf.variable_scope("token_loss", reuse=tf.AUTO_REUSE):
            token_label = tf.cast(token_label_ids, dtype=tf.float32)
            token_label_mask = tf.cast(input_masks, dtype=tf.float32)
            # Labels at masked positions are forced to 0.
            masked_token_label = tf.cast(token_label * token_label_mask,
                                         dtype=tf.int32)
            token_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=masked_token_label, logits=masked_token_predict)
            # Summed token loss divided by the number of non-empty examples.
            token_loss = tf.reduce_sum(
                token_cross_entropy * token_label_mask) / tf.reduce_sum(
                    tf.reduce_max(token_label_mask, axis=-1))
            loss = loss + token_loss

    if sent_label_ids is not None:
        with tf.variable_scope("sent_loss", reuse=tf.AUTO_REUSE):
            sent_label = tf.cast(sent_label_ids, dtype=tf.float32)
            sent_label_mask = tf.cast(tf.reduce_max(input_masks, axis=-1),
                                      dtype=tf.float32)
            masked_sent_label = tf.cast(sent_label * sent_label_mask,
                                        dtype=tf.int32)
            sent_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=masked_sent_label, logits=masked_sent_predict)
            # sent_label_mask already has one entry per example, so the
            # number of non-empty examples is simply its sum.  The previous
            # reduce_max(axis=-1) collapsed that axis to a single 0/1 value,
            # turning this into a batch sum rather than the per-example mean
            # used for the token loss.
            sent_loss = tf.reduce_sum(
                sent_cross_entropy * sent_label_mask) / tf.reduce_sum(
                    sent_label_mask)
            loss = loss + sent_loss

    return loss, token_predict_ids, sent_predict_ids
def create_graph(graph_file,
                 bert_config_file,
                 init_checkpoint,
                 max_seq_len,
                 select_layers,
                 output_dir='../bert/tmp'):
    """Freeze a BERT encoder into a constant graph and write it to disk.

    Args:
        graph_file: destination path of the serialized graph definition.
        bert_config_file: JSON file read by `modeling.BertConfig`.
        init_checkpoint: checkpoint the trainable variables are loaded from.
        max_seq_len: second dimension of the three int32 input placeholders.
        select_layers: indices into `model.all_encoder_layers`; several
            layers are concatenated along the last axis.
        output_dir: directory created before anything else is done.

    Returns:
        `graph_file`, unchanged.
    """
    tf.gfile.MakeDirs(output_dir)
    bert_config = modeling.BertConfig.from_json_file(bert_config_file)

    placeholder_shape = (None, max_seq_len)
    input_ids = tf.placeholder(tf.int32, placeholder_shape, 'input_ids')
    input_mask = tf.placeholder(tf.int32, placeholder_shape, 'input_mask')
    input_type_ids = tf.placeholder(tf.int32, placeholder_shape,
                                    'input_type_ids')
    feeds = [input_ids, input_mask, input_type_ids]

    model = modeling.BertModel(config=bert_config,
                               is_training=False,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=input_type_ids,
                               use_one_hot_embeddings=False)

    # Map checkpoint tensors onto the freshly created variables.
    trainable = tf.trainable_variables()
    assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
        trainable, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    if len(select_layers) == 1:
        encoder_layer = model.all_encoder_layers[select_layers[0]]
    else:
        encoder_layer = tf.concat(
            [model.all_encoder_layers[idx] for idx in select_layers], -1)
    fetches = [tf.identity(encoder_layer, 'final_encodes')]

    # Tensor names carry a two-character output suffix; node names do not.
    feed_names = [tensor.name[:-2] for tensor in feeds]
    fetch_names = [tensor.name[:-2] for tensor in fetches]
    feed_dtype_enums = [tensor.dtype.as_datatype_enum for tensor in feeds]

    graph_def = tf.get_default_graph().as_graph_def()
    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        graph_def = tf.graph_util.convert_variables_to_constants(
            sess, graph_def, fetch_names)
        graph_def = optimize_for_inference(graph_def, feed_names, fetch_names,
                                           feed_dtype_enums, False)

    with tf.gfile.GFile(graph_file, 'wb') as out:
        out.write(graph_def.SerializeToString())
    return graph_file
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Builds a BERT encoder, adds the head from
    `bilin_model_builder.create_model` and, when `add_masking` is set, a
    masked-LM head whose loss is added to the first loss.  Returns a
    `TPUEstimatorSpec` matching `mode`.

    `bert_config`, `use_one_hot_embeddings`, `num_choices`, `add_masking`,
    `init_checkpoint`, `use_tpu`, `learning_rate`, `num_train_steps` and
    `num_warmup_steps` are not parameters; they are read from the enclosing
    scope.
    """
    tf.logging.info("*** Features ***")
    for key in sorted(features):
        tf.logging.info(" name = %s, shape = %s", key, features[key].shape)

    flat_shape = [-1, FLAGS.max_seq_length]
    input_ids = tf.reshape(features["input_ids"], flat_shape)
    input_mask = tf.reshape(features["input_mask"], flat_shape)
    segment_ids = tf.reshape(features["segment_ids"], flat_shape)
    label_types = features["label_types"]
    label_ids = features["label_ids"]

    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # One-hot label_types over 2 * k_size slots, summed over axis 1; used as
    # the metric weights in EVAL mode.
    is_real_example = tf.reduce_sum(
        tf.one_hot(label_types, FLAGS.k_size * 2), axis=1)

    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    cpc_loss, _, logits, probabilities = bilin_model_builder.create_model(
        model, label_ids, label_types, num_choices, k_size=FLAGS.k_size)

    if not add_masking:
        total_loss = cpc_loss
        masked_lm_loss = tf.constant([0])
    else:
        mask_rate = FLAGS.mask_rate  # search alternatives?
        max_predictions_per_seq = int(
            math.ceil(FLAGS.max_seq_length * mask_rate))
        prediction_shape = [-1, max_predictions_per_seq]
        masked_lm_positions = tf.reshape(features["mask_indices"],
                                         prediction_shape)
        masked_lm_ids = tf.reshape(features["target_token_ids"],
                                   prediction_shape)
        masked_lm_weights = tf.reshape(features["target_token_weights"],
                                       prediction_shape)
        masked_lm_loss, _, _ = bilin_model_builder.get_masked_lm_output(
            bert_config, model.get_sequence_output(),
            model.get_embedding_table(), masked_lm_positions, masked_lm_ids,
            masked_lm_weights)
        total_loss = cpc_loss + masked_lm_loss

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(
             tvars, init_checkpoint)
        if not use_tpu:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        else:
            # On TPU the restore is deferred into the scaffold function.
            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        restored = var.name in initialized_variable_names
        init_string = ", *INIT_FROM_CKPT*" if restored else ""
        tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                 num_train_steps,
                                                 num_warmup_steps, use_tpu)
        return contrib_tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn)

    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(cpc_loss, mlm_loss, label_ids, logits, is_real_example):
            """Collect metrics for function."""
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            metric_dict = {
                "eval_accuracy": tf.metrics.accuracy(
                    labels=label_ids,
                    predictions=predictions,
                    weights=is_real_example),
                "eval_cpc_loss": tf.metrics.mean(values=cpc_loss),
                "eval_mlm_loss": tf.metrics.mean(values=mlm_loss),
            }
            # One additional accuracy per column of the label matrix.
            for column in range(FLAGS.k_size * 2):
                metric_dict["acc" + str(column)] = tf.metrics.accuracy(
                    labels=label_ids[:, column],
                    predictions=predictions[:, column],
                    weights=is_real_example[:, column])
            return metric_dict

        metric_inputs = [
            cpc_loss, masked_lm_loss, label_ids, logits, is_real_example
        ]
        return contrib_tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=(metric_fn, metric_inputs),
            scaffold_fn=scaffold_fn)

    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={"probabilities": probabilities},
        scaffold_fn=scaffold_fn)
def create_model(bert_config, is_training, input_ids, input_mask, P_mask,
                 A_mask, B_mask, segment_ids, labels, num_labels,
                 use_one_hot_embeddings):
    """Creates a classification model.

    Three vectors P, A and B are obtained by weighting BERT's sequence output
    with `P_mask`, `A_mask` and `B_mask` and summing over axis 1.  The
    element-wise products P*A and P*B share one weight row, P*P - A*B uses
    another, and the three resulting scores form the logits.

    Returns:
        `(loss, per_example_loss, logits, probabilities)`.
    """
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)
    all_out = model.get_sequence_output()
    hidden_size = all_out.shape[-1].value

    def masked_sum(mask):
        """Weight `all_out` by `mask` and sum the result over axis 1."""
        # Original author's note: tf.boolean_mask(all_out, mask) works on GPU
        # but is not implemented for TPU because it yields dynamic tensor
        # sizes, hence this broadcast / transpose / multiply workaround.
        dims = tf.shape(all_out)
        tiled = tf.broadcast_to(tf.cast(mask, tf.float32),
                                shape=(dims[2], dims[0], dims[1]))
        aligned = tf.transpose(tiled, perm=[1, 2, 0])
        return tf.reduce_sum(tf.multiply(all_out, aligned), axis=1)

    P = masked_sum(P_mask)
    A = masked_sum(A_mask)
    B = masked_sum(B_mask)

    PA = tf.multiply(P, A)
    PB = tf.multiply(P, B)
    N = tf.subtract(tf.multiply(P, P), tf.multiply(A, B))

    weight_init = tf.truncated_normal_initializer
    AB_weights = tf.get_variable("AB_weights", [1, hidden_size],
                                 initializer=weight_init(stddev=0.02))
    N_weights = tf.get_variable("N_weights", [1, hidden_size],
                                initializer=weight_init(stddev=0.02))

    if is_training:
        # Original author's note: unsure whether dropout on weights, rather
        # than data, is accepted practice, but it seemed to work.
        AB_weights = tf.nn.dropout(AB_weights, keep_prob=0.9)
        N_weights = tf.nn.dropout(N_weights, keep_prob=0.9)

    A_out = tf.matmul(PA, AB_weights, transpose_b=True)
    B_out = tf.matmul(PB, AB_weights, transpose_b=True)
    N_out = tf.matmul(N, N_weights, transpose_b=True)

    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        logits = tf.nn.bias_add(tf.concat([A_out, B_out, N_out], axis=1),
                                output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels,
                                    depth=num_labels,
                                    dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)
def __init__(
        self,
        bert_config,
        char_config,
        is_training,
        # is_evaluation,
        input_token_ids,
        input_char_ids,
        labels,
        num_labels,
        use_char_representation=True,
        input_mask=None,
        segment_ids=None,
        use_one_hot_embeddings=False,  # True when accelerating on TPU
        scope=None):
    """Build BERT (optionally concatenated with a char CNN) topped by a CRF.

    :param bert_config: configuration passed to `modeling.BertModel`.
    :param char_config: dict read for 'char_embed_dim', 'filters',
        'alphabet_size', 'activations', 'n_highway', 'projection_dim' and
        'char_dropout_rate'; only used when `use_char_representation`.
    :param is_training: True in the estimator's train mode.
    :param input_token_ids: token ids fed to BERT.
    :param input_char_ids: character ids fed to `CharRepresentation`.
    :param labels: gold labels, passed to the CRF.
    :param num_labels: number of labels (used for the CRF transition matrix).
    :param use_char_representation: concatenate the char representation onto
        the BERT output when True.
    :param input_mask: BERT input mask; also summed along the last axis to
        obtain the sequence lengths given to the CRF.  May be None.
    :param segment_ids: BERT segment ids; per the original author they carry
        no information for this single-sentence NER task.
    :param use_one_hot_embeddings: whether to use the TPU code path.
    :param scope: unused here.
    :raises ValueError: if token and char representations disagree on the
        size of axis 1.
    """
    self.bert_model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_token_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)
    self.token_output = self.bert_model.get_sequence_output()

    if use_char_representation:
        # NOTE(review): 1.0 outside training only disables dropout if
        # CharRepresentation interprets this value as a keep probability --
        # confirm against that class.
        char_dropout_rate = char_config[
            'char_dropout_rate'] if is_training else 1.0
        self.charcnn_model = CharRepresentation(
            char_input=input_char_ids,
            alphabet_size=char_config['alphabet_size'],
            filters=char_config['filters'],
            projection_dim=char_config['projection_dim'],
            char_embed_dim=char_config['char_embed_dim'],
            activations=char_config['activations'],
            n_highway=char_config['n_highway'],
            dropout_rate=char_dropout_rate)
        self.char_output = self.charcnn_model.get_highway_output()

        token_shape = modeling.get_shape_list(self.token_output,
                                              expected_rank=3)
        char_shape = modeling.get_shape_list(self.char_output,
                                             expected_rank=3)
        if token_shape[1] != char_shape[1]:
            raise ValueError(
                "The time steps of token representation (%d) is not the same as char representation (%d) "
                % (token_shape[1], char_shape[1]))
        self.final_output = tf.concat(
            [self.token_output, self.char_output], axis=-1)
    else:
        tf.logging.info(
            "****************BERT representation only***************")
        self.final_output = self.token_output

    # `input_mask` defaults to None, which tf.reduce_sum cannot consume.
    # Fall back to an all-ones mask, i.e. every position counts towards the
    # length (assumed to match how BertModel treats a missing mask -- TODO
    # confirm against the bundled modeling.py).
    if input_mask is None:
        length_mask = tf.ones_like(input_token_ids, dtype=tf.int32)
    else:
        length_mask = input_mask
    sequence_lengths = tf.reduce_sum(length_mask, axis=-1)

    self.crf = CRF(
        input=self.final_output,
        labels=labels,
        num_labels=num_labels,
        lengths=sequence_lengths,
        is_training=is_training,
        # is_evaluation=is_evaluation  # evaluate mode still needs the loss
    )
# -*- coding: utf-8 -*- """ @Time : 2021/6/1 14:43 @Author : huangkai21 @file : bert_web.py """ import tensorflow as tf from bert import modeling import os import collections import six from gevent import monkey monkey.patch_all() from flask import Flask, request from gevent import pywsgi import numpy as np import json flags = tf.flags FLAGS = flags.FLAGS bert_path = r'E:\code\chinese_L-12_H-768_A-12/' flags.DEFINE_string( "bert_config_file", os.path.join(bert_path, 'bert_config.json'), "The config json file corresponding to the pre-trained BERT model.") flags.DEFINE_string("bert_vocab_file", os.path.join(bert_path, 'vocab.txt'), "The config vocab file") flags.DEFINE_string( "init_checkpoint", os.path.join(bert_path, 'bert_model.ckpt'), "Initial checkpoint (usually from a pre-trained BERT model).") app = Flask(__name__)
def cnn(self):
    """Build a multi-width CNN classifier on BERT's token-level output.

    The sequence output of `modeling.BertModel` is convolved once per entry
    of `self.config.filter_sizes`, max-pooled over axis 1, concatenated and
    passed through a dense layer, dropout and ReLU before the logits layer.
    Sets `self.logits`, `self.prob`, `self.y_pred_cls`, `self.loss`,
    `self.optim` and `self.acc`.
    """
    cfg = self.config

    with tf.name_scope('bert'):
        bert_model = modeling.BertModel(
            config=self.bert_config,
            is_training=cfg.is_training,
            input_ids=self.input_ids,
            input_mask=self.input_mask,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=cfg.use_one_hot_embeddings)
        embedding_inputs = bert_model.get_sequence_output()

    # One convolution + max-pool per kernel width; results are concatenated.
    with tf.name_scope('conv'):
        pooled_outputs = []
        for filter_size in cfg.filter_sizes:
            scope_name = "conv-maxpool-%s" % filter_size
            with tf.compat.v1.variable_scope(scope_name, reuse=False):
                conv = tf.layers.conv1d(embedding_inputs,
                                        cfg.num_filters,
                                        filter_size,
                                        name='conv1d')
                pooled_outputs.append(
                    tf.reduce_max(conv, reduction_indices=[1], name='gmp'))

        num_filters_total = cfg.num_filters * len(cfg.filter_sizes)
        h_pool = tf.concat(pooled_outputs, 1)
        outputs = tf.reshape(h_pool, [-1, num_filters_total])

    # Fully connected layer followed by dropout and ReLU.
    with tf.name_scope('fc'):
        hidden = tf.layers.dense(outputs, cfg.hidden_dim, name='fc1')
        hidden = tf.nn.dropout(hidden, self.keep_prob)
        hidden = tf.nn.relu(hidden)

    # Logits, class probabilities and predicted class.
    with tf.name_scope('logits'):
        self.logits = tf.layers.dense(hidden, cfg.num_labels, name='logits')
        self.prob = tf.nn.softmax(self.logits)
        self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)

    # Cross-entropy against the one-hot encoded labels.
    with tf.name_scope('loss'):
        log_probs = tf.nn.log_softmax(self.logits, axis=-1)
        one_hot_labels = tf.one_hot(self.labels,
                                    depth=cfg.num_labels,
                                    dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                          axis=-1)
        self.loss = tf.reduce_mean(per_example_loss)

    # Adam with gradients clipped by global norm.
    with tf.name_scope('optimizer'):
        optimizer = tf.compat.v1.train.AdamOptimizer(cfg.lr)
        grads_and_vars = optimizer.compute_gradients(self.loss)
        gradients, variables = zip(*grads_and_vars)
        clipped, _ = tf.clip_by_global_norm(gradients, cfg.clip)
        self.optim = optimizer.apply_gradients(
            zip(clipped, variables), global_step=self.global_step)

    with tf.name_scope('accuracy'):
        correct_pred = tf.equal(self.labels, self.y_pred_cls)
        self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
def model_fn(features, labels, mode, params):
    """Estimator `model_fn` for a two-tower sentence-pair classifier.

    Both inputs are encoded by the same BERT weights (the second tower reuses
    the 'bert' variable scope).  The two pooled outputs and their absolute
    difference are concatenated and fed to a 2-way dense layer.

    Args:
        features: dict with 'X', 'mask', 'X_b', 'mask_b' and 'label'; the
            first column of 'label' is used as the target.
        labels: unused.
        mode: a `tf.estimator.ModeKeys` value; only TRAIN and EVAL are
            supported.
        params: unused.

    Returns:
        A `tf.estimator.EstimatorSpec` for TRAIN or EVAL.

    Raises:
        ValueError: for any other mode (previously this surfaced as an
            unbound-local error on the return statement).
    """
    if mode not in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        raise ValueError('model_fn only supports TRAIN and EVAL, got %r' %
                         (mode,))

    bert_config = modeling.BertConfig.from_json_file(
        'bert-base-2020-03-19/bert_config.json')

    # Build BERT in training mode only while training, so evaluation is not
    # run with training-time behaviour (e.g. dropout, per the BERT reference
    # implementation) switched on.
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    X = features['X']
    input_masks = features['mask']
    X_b = features['X_b']
    input_masks_b = features['mask_b']
    Y = features['label'][:, 0]

    with tf.compat.v1.variable_scope('bert', reuse=False):
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=X,
            input_mask=input_masks,
            use_one_hot_embeddings=False,
        )
        summary = model.get_pooled_output()

    # Second tower shares the variables created above.
    with tf.compat.v1.variable_scope('bert', reuse=True):
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=X_b,
            input_mask=input_masks_b,
            use_one_hot_embeddings=False,
        )
        summary_b = model.get_pooled_output()

    vectors_concat = tf.concat(
        [summary, summary_b, tf.abs(summary - summary_b)], axis=1)
    logits = tf.layers.dense(vectors_concat, 2)

    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=Y))
    # Named copies of the loss and the accuracy update op.
    tf.identity(loss, 'train_loss')
    accuracy = tf.metrics.accuracy(labels=Y,
                                   predictions=tf.argmax(logits, axis=1))
    tf.identity(accuracy[1], name='train_accuracy')

    tvars = tf.trainable_variables()
    init_checkpoint = 'bert-base-2020-03-19/model.ckpt-2000002'
    assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(
        tvars, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(loss, learning_rate,
                                                 num_train_steps,
                                                 num_warmup_steps, False)
        estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                    loss=loss,
                                                    train_op=train_op)
    else:
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL,
            loss=loss,
            eval_metric_ops={'accuracy': accuracy},
        )

    return estimator_spec
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Reads `num_choices + 1` numbered copies of the BERT inputs from
    `features`, stacks them into one batch, encodes them with BERT and hands
    the encoder to `model_builder` (bilinear variant when
    `FLAGS.bilin_preproc` is set).  Returns a `TPUEstimatorSpec` for `mode`.

    `bert_config`, `use_one_hot_embeddings`, `init_checkpoint`, `use_tpu`,
    `learning_rate`, `num_train_steps` and `num_warmup_steps` are read from
    the enclosing scope.
    """
    tf.logging.info("*** Features ***")
    for key in sorted(features):
        tf.logging.info(" name = %s, shape = %s", key, features[key].shape)

    num_choices = 2
    read_size = num_choices + 1

    def numbered(prefix):
        """Return features[prefix + '0'] .. features[prefix + str(read_size - 1)]."""
        return [features[prefix + str(idx)] for idx in range(0, read_size)]

    id_parts = numbered("input_ids")
    mask_parts = numbered("input_mask")
    segment_parts = numbered("segment_ids")

    # Column 4 of the label matrix is the target.
    label_ids = features["labels"][:, 4]

    seq_length = id_parts[0].shape[-1]
    flat_shape = [-1, seq_length]
    input_ids = tf.reshape(tf.stack(id_parts, axis=1), flat_shape)
    input_mask = tf.reshape(tf.stack(mask_parts, axis=1), flat_shape)
    segment_ids = tf.reshape(tf.stack(segment_parts, axis=1), flat_shape)

    is_training = mode == tf_estimator.ModeKeys.TRAIN

    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    if FLAGS.bilin_preproc:
        head_outputs = model_builder.create_model_bilin(
            model, label_ids, num_choices)
    else:
        head_outputs = model_builder.create_model(model, label_ids,
                                                  num_choices)
    total_loss, per_example_loss, logits, probabilities = head_outputs

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(
             tvars, init_checkpoint)
        if not use_tpu:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        else:
            # On TPU the restore is deferred into the scaffold function.
            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        restored = var.name in initialized_variable_names
        init_string = ", *INIT_FROM_CKPT*" if restored else ""
        tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

    if mode == tf_estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                 num_train_steps,
                                                 num_warmup_steps, use_tpu)
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=total_loss,
                                            train_op=train_op,
                                            scaffold_fn=scaffold_fn)

    if mode == tf_estimator.ModeKeys.EVAL:

        def metric_fn(per_example_loss, label_ids, logits):
            """Return accuracy and mean loss metrics."""
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            return {
                "eval_accuracy": tf.metrics.accuracy(
                    labels=label_ids, predictions=predictions),
                "eval_loss": tf.metrics.mean(values=per_example_loss),
            }

        return contrib_tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=(metric_fn, [per_example_loss, label_ids, logits]),
            scaffold_fn=scaffold_fn)

    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={"probabilities": probabilities},
        scaffold_fn=scaffold_fn)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 token_label_ids, predicate_matrix_ids, num_token_labels,
                 num_predicate_labels, use_one_hot_embeddings):
    """Creates a classification model.

    Two heads share BERT's sequence output: a head-selection head scored by
    `getHeadSelectionScores` and trained with sigmoid cross-entropy, and a
    per-token softmax classifier.  The returned loss is the sum of both.

    Returns:
        `(loss, predicate_head_select_loss, predicate_head_probabilities,
        predicate_head_predictions, token_label_loss,
        token_label_per_example_loss, token_label_logits,
        token_label_predictions)`.
    """
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    # Token-level encoder output (model.get_pooled_output() would give the
    # sentence-level vector instead).
    encoded = model.get_sequence_output()
    if is_training:
        encoded = tf.nn.dropout(encoded, keep_prob=0.9)

    with tf.variable_scope("predicate_head_select_loss"):
        seq_len = encoded.shape[-2].value
        predicate_score_matrix = getHeadSelectionScores(
            encode_input=encoded,
            hidden_size_n1=100,
            label_number=num_predicate_labels)
        predicate_head_probabilities = tf.nn.sigmoid(predicate_score_matrix)
        # Threshold the probabilities by rounding to 0/1.
        predicate_head_predictions = tf.cast(
            tf.round(predicate_head_probabilities), tf.int32)

        gold_matrix = tf.reshape(predicate_matrix_ids,
                                 [-1, seq_len, seq_len])
        gold_one_hot = tf.one_hot(gold_matrix,
                                  depth=num_predicate_labels,
                                  dtype=tf.float32)
        predicate_xent = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=predicate_score_matrix, labels=gold_one_hot)

        # Largest count of non-zero token labels in the batch; the loss is
        # cropped to that many positions along both sequence axes.
        non_zero = tf.math.logical_not(tf.math.equal(token_label_ids, 0))
        per_row_length = tf.cast(
            tf.reduce_sum(tf.cast(non_zero, tf.float32), axis=1), tf.int32)
        mask_max_length = tf.reduce_max(per_row_length)

        predicate_xent = predicate_xent[:, :mask_max_length,
                                        :mask_max_length, :]
        predicate_head_select_loss = tf.reduce_sum(predicate_xent)

    with tf.variable_scope("token_label_loss"):
        hidden_size = encoded.shape[-1].value
        token_label_output_weight = tf.get_variable(
            "token_label_output_weights", [num_token_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        token_label_output_bias = tf.get_variable(
            "token_label_output_bias", [num_token_labels],
            initializer=tf.zeros_initializer())

        # Flatten to 2-D for the matmul, then restore the sequence axis.
        flat_encoded = tf.reshape(encoded, [-1, hidden_size])
        flat_logits = tf.nn.bias_add(
            tf.matmul(flat_encoded, token_label_output_weight,
                      transpose_b=True), token_label_output_bias)
        token_label_logits = tf.reshape(
            flat_logits, [-1, FLAGS.max_seq_length, num_token_labels])

        token_label_log_probs = tf.nn.log_softmax(token_label_logits, axis=-1)
        token_label_one_hot_labels = tf.one_hot(token_label_ids,
                                                depth=num_token_labels,
                                                dtype=tf.float32)
        token_label_per_example_loss = -tf.reduce_sum(
            token_label_one_hot_labels * token_label_log_probs, axis=-1)
        token_label_loss = tf.reduce_sum(token_label_per_example_loss)

        token_label_probabilities = tf.nn.softmax(token_label_logits, axis=-1)
        token_label_predictions = tf.argmax(token_label_probabilities,
                                            axis=-1)

    loss = predicate_head_select_loss + token_label_loss
    return (loss, predicate_head_select_loss, predicate_head_probabilities,
            predicate_head_predictions, token_label_loss,
            token_label_per_example_loss, token_label_logits,
            token_label_predictions)