def cqa_model(final_hidden): final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( "cls/cqa/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( "cls/cqa/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) return (start_logits, end_logits)
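# Illustrative NumPy sketch (added here, not part of the original code) of how the
# [2, hidden_size] projection in cqa_model above becomes separate start/end logits.
# batch_size, seq_length and hidden_size are toy values chosen only for the example.
import numpy as np

batch_size, seq_length, hidden_size = 2, 5, 4
final_hidden = np.random.randn(batch_size, seq_length, hidden_size)
output_weights = np.random.randn(2, hidden_size)
output_bias = np.zeros(2)

final_hidden_matrix = final_hidden.reshape(batch_size * seq_length, hidden_size)
logits = final_hidden_matrix @ output_weights.T + output_bias           # [B*L, 2]
logits = logits.reshape(batch_size, seq_length, 2).transpose(2, 0, 1)   # [2, B, L]
start_logits, end_logits = logits[0], logits[1]                          # each [B, L]
assert start_logits.shape == (batch_size, seq_length)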
def gather_indexes(sequence_tensor, positions): """Gathers the vectors at the specific positions over a minibatch.""" sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) batch_size = sequence_shape[0] seq_length = sequence_shape[1] width = sequence_shape[2] flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) flat_positions = tf.reshape(positions + flat_offsets, [-1]) flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width]) output_tensor = tf.gather(flat_sequence_tensor, flat_positions) return output_tensor
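# A minimal NumPy sketch (added for illustration, not from the original code) of the
# flat-offset gather used by gather_indexes above, with toy shapes
# (batch_size=2, seq_length=4, width=3) and made-up positions.
import numpy as np

batch_size, seq_length, width = 2, 4, 3
sequence_tensor = np.arange(batch_size * seq_length * width, dtype=np.float32).reshape(
    batch_size, seq_length, width)
positions = np.array([[1, 3], [0, 2]])   # positions to gather per example

# Offset each example's positions by its start index in the flattened tensor.
flat_offsets = (np.arange(batch_size) * seq_length).reshape(-1, 1)
flat_positions = (positions + flat_offsets).reshape(-1)
flat_sequence_tensor = sequence_tensor.reshape(batch_size * seq_length, width)
output = flat_sequence_tensor[flat_positions]   # [batch_size * num_positions, width]

# Sanity check against direct indexing.
assert np.array_equal(output[0], sequence_tensor[0, 1])
assert np.array_equal(output[3], sequence_tensor[1, 2])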
def gather_indexes(sequence_tensor, positions): """Gathers the vectors at the specific positions over a minibatch.""" sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank = 3) batch_size = sequence_shape[0] seq_length = sequence_shape[1] width = sequence_shape[2] flat_offsets = tf.reshape( tf.range(0, batch_size, dtype = tf.int32) * seq_length, [-1, 1] ) flat_positions = tf.reshape(positions + flat_offsets, [-1]) flat_sequence_tensor = tf.reshape( sequence_tensor, [batch_size * seq_length, width] ) output_tensor = tf.gather(flat_sequence_tensor, flat_positions) return output_tensor
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings, scope): """Creates a classification model.""" with tf.variable_scope('bert', reuse=tf.AUTO_REUSE) as real_scope: model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, scope=scope) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) return final_hidden[:, 0, :]
def gather_indexes(sequence_tensor, positions): """Gathers the vectors at the specific positions over a minibatch.""" # sequence_tensor, [B, seq_len, hidden_dim], output of BERT's last layer # positions, [B, masked_token_num] sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) batch_size = sequence_shape[0] seq_length = sequence_shape[1] width = sequence_shape[2] flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) flat_positions = tf.reshape(positions + flat_offsets, [-1]) # offsets so the flattened tensor below can be looked up with a single gather flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width]) output_tensor = tf.gather(flat_sequence_tensor, flat_positions) # [B*mask_token_num, hidden_dim] return output_tensor
def get_lm_output(config, input_tensor, output_weights, label_ids, label_mask): """Get loss and log probs for the LM.""" input_shape = modeling.get_shape_list(input_tensor, expected_rank=3) input_tensor = tf.reshape( input_tensor, [input_shape[0] * input_shape[1], input_shape[2]]) with tf.variable_scope("cls/predictions"): # We apply one more non-linear transformation before the output layer. # This matrix is not used after pre-training. with tf.variable_scope("transform"): input_tensor = tf.layers.dense( input_tensor, units=config.hidden_size, activation=modeling.get_activation(config.hidden_act), kernel_initializer=modeling.create_initializer( config.initializer_range)) input_tensor = modeling.layer_norm(input_tensor) # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. output_bias = tf.get_variable("output_bias", shape=[config.vocab_size], initializer=tf.zeros_initializer()) logits = tf.matmul(input_tensor, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) log_probs = tf.nn.log_softmax(logits, axis=-1) label_ids = tf.reshape(label_ids, [-1]) one_hot_labels = tf.one_hot(label_ids, depth=config.vocab_size, dtype=tf.float32) # The `positions` tensor might be zero-padded (if the sequence is too # short to have the maximum number of predictions). The `label_weights` # tensor has a value of 1.0 for every real prediction and 0.0 for the # padding predictions. per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) label_mask = tf.reshape(label_mask, [input_shape[0] * input_shape[1]]) loss_mask = tf.dtypes.cast(label_mask, tf.float32) per_example_loss = tf.math.multiply(per_example_loss, loss_mask) loss = tf.reduce_mean(per_example_loss) return (loss, per_example_loss, log_probs)
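# A hedged NumPy sketch (not from the original code) of the masked cross-entropy used in
# get_lm_output above: log-softmax over the vocabulary, one-hot labels, and a 0/1 mask that
# zeroes out padded positions. vocab_size and the toy logits below are assumptions.
import numpy as np

vocab_size = 6
logits = np.random.randn(4, vocab_size)                  # 4 flattened token positions
label_ids = np.array([2, 0, 5, 1])
label_mask = np.array([1, 1, 0, 0], dtype=np.float32)    # last two positions are padding

log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
one_hot = np.eye(vocab_size)[label_ids]
per_example_loss = -(log_probs * one_hot).sum(axis=-1) * label_mask
loss = per_example_loss.mean()   # mean over all positions, matching the reduce_mean above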
def create_model_start(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights1 = tf.get_variable( "cls/squad/output_weights1", [384, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias1 = tf.get_variable( "cls/squad/output_bias1", [384], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) keep_prob = 1.0 if is_training: keep_prob = 0.9 else: keep_prob = 1.0 logits = tf.matmul(final_hidden_matrix, output_weights1, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias1) logits = tf.nn.relu(logits) logits = tf.nn.dropout(logits, keep_prob) logits = tf.reshape(logits, [batch_size, seq_length, 384]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) s = tf.reduce_sum(unstacked_logits[0:384], 0) return s
def metric_fn(per_example_loss, label_ids, logits, is_real_example): # batch_size * sequence_len shape_list = modeling.get_shape_list(label_ids, expected_rank=2) label_ids = tf.reshape(label_ids, [-1]) is_real_example = tf.tile(is_real_example[:, tf.newaxis], [1, shape_list[1]]) is_real_example = tf.reshape(is_real_example, [-1]) per_example_loss = tf.reshape(per_example_loss, [-1]) logits = tf.reshape(logits, [-1]) predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) accuracy = tf.metrics.accuracy( labels=label_ids, predictions=predictions, weights=is_real_example) loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) return { "eval_accuracy": accuracy, "eval_loss": loss, }
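# A small NumPy sketch (illustrative only) of the weighted token accuracy computed in
# metric_fn above: padding tokens get weight 0 via is_real_example, so they do not affect
# the metric. All values below are toy examples.
import numpy as np

label_ids = np.array([1, 0, 2, 2])
predictions = np.array([1, 0, 1, 2])
weights = np.array([1.0, 1.0, 1.0, 0.0])   # last token is padding

accuracy = (weights * (label_ids == predictions)).sum() / weights.sum()   # 2/3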
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) #out1 model. #pdb.set_trace() final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] #out2 output_weights output_weights = tf.get_variable( "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) #out3 output_bias output_bias = tf.get_variable( "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) #out4 logits * 2 logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) #out5 start_logits end_logits (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) return (start_logits, end_logits)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) # In the demo, we are doing a simple classification task on the entire # segment. # # If you want to use the token-level output, use model.get_sequence_output() # instead. output_layer = model.get_sequence_output() # sequence output, the layer right before the pooler final_hidden_shape = modeling.get_shape_list(output_layer, expected_rank=3) hidden_size = final_hidden_shape[-1] output_weight = tf.get_variable( "output_weights", [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02) ) output_bias = tf.get_variable( "output_bias", [num_labels], initializer=tf.zeros_initializer() ) with tf.variable_scope("loss"): if is_training: output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) output_layer_matrix = tf.reshape(output_layer, [-1, hidden_size]) # [batch_size*seq_length, hidden_size] logits = tf.matmul(output_layer_matrix, output_weight, transpose_b=True) # [batch_size*seq_length, num_labels] logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels]) # final output: [batch_size, seq_length, num_labels] # mask = tf.cast(input_mask,tf.float32) # loss = tf.contrib.seq2seq.sequence_loss(logits,labels,mask) # return (loss, logits, predict) ########################################################################## log_probs = tf.nn.log_softmax(logits, axis=-1) # log-softmax normalization of the outputs one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) # per-example loss loss = tf.reduce_mean(per_example_loss) # overall (mean) loss predict = tf.argmax(log_probs, axis=-1) # predictions return (loss, per_example_loss, logits, predict)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" #初始化bert模型参数 model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, #输入格式转换后的队列 “” input_mask=input_mask, #屏蔽 token_type_ids=segment_ids, #输入的句子分段ID use_one_hot_embeddings=use_one_hot_embeddings) #从bert模型读取序列输出 final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] #批次数量大小 seq_length = final_hidden_shape[1] #序列长度 hidden_size = final_hidden_shape[2] #隐藏单元大小 #初始化权重矩阵 output_weights = tf.get_variable( "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) #初始化偏置项 output_bias = tf.get_variable("cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) #回归训练,求解y=ax+b ,这种模型结构,a=output_weights,b=output_bias,y=logits logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) #矩阵相乘 logits = tf.nn.bias_add(logits, output_bias) #加上偏置项 logits = tf.reshape( logits, [batch_size, seq_length, 2]) #把输出转换成[每批数量,序列长度,2] 这种格式[2维数组数,行数,列数] logits = tf.transpose(logits, [2, 0, 1]) #对上一步的结果转置,[2,0,1]代表[列数,2维数组数,行数] unstacked_logits = tf.unstack(logits, axis=0) #对矩阵在行上拆分 (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) return (start_logits, end_logits) #返回起始位置,结束位置
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) #self.sequence_output = self.all_encoder_layers[-1] #取最后一层(batch_size,seq_length,hidden_size) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) ###如果这里只想做一个填空形式,例如只判断某个词是否为开始位置 ###可以加个全连接层[hidden_size,1],相乘后得到[batch_size, seq_length, 1],sequeeze去掉维度为1的那个维度; ###最终得到[batch_size, seq_length]就可以用多分类判断seq_length的每个位置是否为答案的位置 return (start_logits, end_logits)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings): """Creates a seq labelling model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) with tf.variable_scope("finetune/seq"): #get sequnece output final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] final_hidden = tf.reshape(final_hidden, [batch_size*seq_length, hidden_size]) output_weights = tf.get_variable( "output_weights", [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( "output_bias", [num_labels], initializer=tf.zeros_initializer()) logits = tf.matmul(final_hidden, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits_out = tf.reshape(logits, [batch_size, seq_length, num_labels]) probabilities = tf.nn.softmax(logits_out) log_probs = tf.nn.log_softmax(logits) labels = tf.reshape(labels, [-1]) label_weights = tf.cast(tf.reshape(input_mask, [-1]), dtype=tf.float32) one_hot_labels = tf.one_hot( labels, depth=num_labels, dtype=tf.float32) per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) * label_weights numerator = tf.reduce_sum(per_example_loss) denominator = tf.reduce_sum(label_weights) + 1e-5 loss = numerator/denominator per_example_loss = tf.reshape(per_example_loss, [batch_size, seq_length]) return (loss, per_example_loss, logits_out, probabilities)
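# A minimal NumPy sketch (illustrative only) of the masked-mean reduction used above:
# per-token losses are weighted by the input mask and normalized by the number of real
# tokens, plus a small epsilon to avoid division by zero. Toy values only.
import numpy as np

per_token_loss = np.array([0.5, 1.0, 2.0, 4.0])
label_weights = np.array([1.0, 1.0, 1.0, 0.0])   # last token is padding

numerator = (per_token_loss * label_weights).sum()
denominator = label_weights.sum() + 1e-5
loss = numerator / denominator                   # ~1.1667, padding excluded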
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config,#たぶんこの設定にしたがってbertを呼び出すということ is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) final_hidden = model.get_sequence_output()#Bertの最終層 final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( #/はスコープの区切りを表す。だからcls/squad/output_weightsはclsスコープのsquadスコープのoutput_weightsという変数を表す #変数がないときは定義し、ある時はそれを呼び出す "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( #/はスコープの区切りを表す。だからcls/squad/output_weightsはclsスコープのsquadスコープのoutput_weightsという変数を表す #変数がないときは定義し、ある時はそれを呼び出す "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) return (start_logits, end_logits)
def get_new_input_ids(input_ids, masked_lm_probs,masked_lm_positions,masked_lm_weights): #Reshaping the masked_lm_probs sequence_shape = modeling.get_shape_list(masked_lm_positions, expected_rank=2) #for getting the shape info batch_size = sequence_shape[0] seq_length = sequence_shape[1] # Gathering the first masks only. shape = (batch_size,vocab_size) mask_lab_pred = tf.gather(masked_lm_probs, tf.range(0,batch_size*seq_length,delta=seq_length),axis=0) #Getting the indexes for the predictions. shape = (batch_size,1) #mask_lab_pred = tf.random.multinomial(mask_lab_pred,1,output_dtype=tf.int32) mask_lab_pred = tf.reshape(tf.cast(tf.argmax(mask_lab_pred, axis=-1),dtype=tf.int32),(batch_size,1)) # Gathering positions of the first mask. shape=(batch_size,) mask_positions = tf.gather(masked_lm_positions, tf.constant(0,dtype=tf.int32),axis=-1) #Assigning the first masks in input_ids #--First converting mask_positions to one hot, shape=(batch_size,seq_length) mask_positions = tf.one_hot(mask_positions,depth=seq_length,axis=1,dtype=tf.int32) #--Removing those positions there is no mask left mask_positions = tf.cast(tf.reduce_max(masked_lm_positions,axis=-1,keepdims=True)>0 ,dtype=tf.int32)*mask_positions #--Next multiplying mask_positions_one_hot to mask_lab_pred and forming new_ids input_ids = mask_positions*mask_lab_pred+(1-mask_positions)*input_ids #Setting the first mask lm_weights to be zero masked_lm_weights = tf.slice(masked_lm_weights,(0,1),(-1,-1)) masked_lm_weights = tf.concat([masked_lm_weights,tf.zeros(shape=(batch_size,1), dtype=tf.float32)],axis=-1) #Setting the masked_lm_positions first masks to be zero masked_lm_positions = tf.slice(masked_lm_positions,(0,1),(-1,-1)) masked_lm_positions = tf.concat([masked_lm_positions,tf.zeros(shape=(batch_size,1), dtype=tf.int32)],axis=-1) return input_ids,masked_lm_positions,masked_lm_weights
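# A hedged NumPy sketch (not from the original code) of the one-hot trick above for writing
# a predicted token id into the first [MASK] position of each example. Shapes, ids and the
# choice of 103 as the mask id are illustrative assumptions.
import numpy as np

batch_size, seq_length = 2, 6
input_ids = np.array([[101, 103, 7, 8, 102, 0],
                      [101, 5, 103, 9, 102, 0]])      # 103 plays the role of [MASK]
first_mask_position = np.array([1, 2])                 # first masked index per example
predicted_id = np.array([[42], [77]])                  # model's prediction for that slot

one_hot = np.eye(seq_length, dtype=np.int64)[first_mask_position]   # [batch, seq]
new_input_ids = one_hot * predicted_id + (1 - one_hot) * input_ids
# row 0: position 1 becomes 42; row 1: position 2 becomes 77; everything else is unchanged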
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel(config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) # In the demo, we are doing a simple classification task on the entire # segment. # # If you want to use the token-level output, use model.get_sequence_output() # instead. final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( "output_weights", [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable("output_bias", [num_labels], initializer=tf.zeros_initializer()) with tf.variable_scope("loss"): final_hidden_matrix = tf.reshape( final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, num_labels]) #logits = tf.transpose(logits, [2,0,1]) one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=-1) per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) loss = tf.reduce_mean(per_example_loss) return (loss, per_example_loss, logits)
def gather_indexes(sequence_tensor, positions): """Gathers the vectors at the specific positions over a minibatch.""" sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) #list batch_size = sequence_shape[ 0] #'tensorflow.python.framework.ops.Tensor', Tensor("strided_slice_1:0", shape=(), dtype=int32) seq_length = sequence_shape[1] #256 width = sequence_shape[2] #64 flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) #(?, 1) flat_positions = tf.reshape(positions + flat_offsets, [-1]) #(?,) flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width]) output_tensor = tf.gather( flat_sequence_tensor, flat_positions ) #Gather slices from params axis axis according to indices. ##!for seq in tf.range(0, batch_size, dtype=tf.int32): ## output_tensor = tf.gather(sequence_tensor[seq], positions) return output_tensor
def gather_indexes( sequence_tensor, positions ): ### sequence_tensor is the Transformer output, [batch_size, length, hidden_size] ### positions holds the indices of the masked tokens in each example, [batch, max_mask_length]; max_mask_length defaults to 20, e.g. per example: [7,10,15,20,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] """Gathers the vectors at the specific positions over a minibatch.""" sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) batch_size = sequence_shape[0] seq_length = sequence_shape[1] width = sequence_shape[2] flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) ### [batch_size, 1] flat_positions = tf.reshape(positions + flat_offsets, [-1]) # collect all masked positions and flatten to 1-D flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width]) output_tensor = tf.gather(flat_sequence_tensor, flat_positions) ### gather the Transformer outputs at the masked positions return output_tensor
def get_matrix_mask_indices(matrix, num_rows=None): if num_rows is None: num_rows = modeling.get_shape_list(matrix)[0] indices = tf.where(matrix) num_indices = tf.shape(indices)[0] elem_per_row = tf.bincount(tf.cast(indices[:, 0], tf.int32), minlength=num_rows) max_elem_per_row = tf.reduce_max(elem_per_row) row_start = tf.concat([[0], tf.cumsum(elem_per_row[:-1])], axis=0) r = tf.range(max_elem_per_row) idx = tf.expand_dims(row_start, 1) + r idx = tf.minimum(idx, num_indices - 1) result = tf.gather(indices[:, 1], idx) # replace invalid elements with -1 result = tf.where( tf.expand_dims(elem_per_row, 1) > r, result, -tf.ones_like(result)) max_index_per_row = tf.reduce_max(result, axis=1, keepdims=True) max_index_per_row = tf.tile(max_index_per_row, [1, max_elem_per_row]) result = tf.where(result >= 0, result, max_index_per_row) return result
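# An illustrative NumPy sketch (not part of the original code) of what
# get_matrix_mask_indices computes: for every row of a 0/1 matrix, the column indices of its
# nonzero entries, right-padded to the longest row. The function above additionally replaces
# the -1 padding with each row's largest valid index; the toy matrix below is made up.
import numpy as np

matrix = np.array([[0, 1, 1, 0],
                   [1, 0, 0, 0],
                   [0, 0, 0, 0]])

rows = [np.flatnonzero(row) for row in matrix]
max_len = max(len(r) for r in rows)
padded = np.full((matrix.shape[0], max_len), -1, dtype=np.int64)
for i, r in enumerate(rows):
    padded[i, :len(r)] = r
# padded == [[1, 2], [0, -1], [-1, -1]]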
def gather_indexes(sequence_tensor, positions): """Gathers the vectors at the specific positions over a minibatch.""" # This function takes the positions of the masked tokens and, from the input tensor of shape batch_size * # seq_length * embedding_size, extracts the vectors at those masked positions. Suppose batch_size = 8 # and seq_length = 128, and 20 of the 128 positions are masked; the function then collects # 8 * 20 = 160 masked tokens, each represented by a 768-dim vector, so the final output is 160 * 768. sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) batch_size = sequence_shape[0] seq_length = sequence_shape[1] width = sequence_shape[2] # Because sequence_tensor is flattened below (batch_size * sequence_length), each row of # positions needs an offset of seq_length times its example index. flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) flat_positions = tf.reshape(positions + flat_offsets, [-1]) flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width]) output_tensor = tf.gather(flat_sequence_tensor, flat_positions) return output_tensor
def knn(labels, embeddings, k, embed_normed=True): # make sure embedding should be l2-normalized if not embed_normed: embeddings = tf.nn.l2_normalize(embeddings, axis=1) embed_shape = modeling.get_shape_list(embeddings) batch_size = embed_shape[0] sim_mat = tf.matmul(embeddings, embeddings, transpose_b=True) sim_mat = sim_mat - tf.eye(batch_size) * 2.0 _, top_k_idx = tf.nn.top_k(sim_mat, k) top_k_labels = tf.squeeze(tf.gather(labels, top_k_idx)) def knn_vote(v): nearest_k_y, idx, votes = tf.unique_with_counts(v) majority_idx = tf.argmax(votes) predict_res = tf.gather(nearest_k_y, majority_idx) return predict_res majority = tf.map_fn(knn_vote, top_k_labels) return majority
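# A hedged NumPy sketch (not from the original code) of the batch kNN vote above: cosine
# similarities on L2-normalized embeddings, the diagonal pushed below -1 so an example never
# votes for itself, then a majority vote over the k nearest labels. Toy data only.
import numpy as np
from collections import Counter

labels = np.array([0, 0, 1, 1])
embeddings = np.random.randn(4, 8)
embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

k = 2
sim = embeddings @ embeddings.T - np.eye(len(labels)) * 2.0   # mask the diagonal
top_k_idx = np.argsort(-sim, axis=1)[:, :k]
top_k_labels = labels[top_k_idx]                              # [batch, k]
majority = np.array([Counter(row).most_common(1)[0][0] for row in top_k_labels])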
def replace_elements_by_indices(old, new, indices): old_shape = modeling.get_shape_list(old) print(old_shape) batch_size = old_shape[0] seq_length = old_shape[1] flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) print(flat_offsets) flat_positions = tf.reshape(indices + flat_offsets, [-1]) print(flat_positions) zeros = tf.zeros(tf.shape(input=flat_positions)[0], dtype=tf.int32) print(zeros) flat_old = tf.reshape(old, [-1]) print(flat_old) masked_lm_mask = tf.compat.v1.sparse_to_dense(flat_positions, tf.shape(input=flat_old), zeros, default_value=1, validate_indices=True, name="masked_lm_mask") print(masked_lm_mask) flat_old_temp = tf.multiply(flat_old, masked_lm_mask) print(flat_old_temp) new_temp = tf.compat.v1.sparse_to_dense(flat_positions, tf.shape(input=flat_old), new, default_value=0, validate_indices=True, name=None) print(new_temp) updated_old = tf.reshape(flat_old_temp + new_temp, old_shape) print(updated_old) return updated_old
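# A minimal NumPy sketch (illustrative only) of the sparse_to_dense replacement above: zero
# out the chosen flat positions in the old id matrix, build a dense tensor holding the new
# values at those same positions, and add the two. Shapes and ids below are made up.
import numpy as np

old = np.array([[11, 12, 13],
                [21, 22, 23]])
indices = np.array([[1], [2]])          # one position to replace per row
new = np.array([99, 88])                # replacement values, in flattened order

batch_size, seq_length = old.shape
flat_positions = (indices + (np.arange(batch_size) * seq_length).reshape(-1, 1)).reshape(-1)

keep_mask = np.ones(old.size, dtype=old.dtype)
keep_mask[flat_positions] = 0           # 0 where we overwrite, 1 elsewhere
new_dense = np.zeros(old.size, dtype=old.dtype)
new_dense[flat_positions] = new

updated = (old.reshape(-1) * keep_mask + new_dense).reshape(old.shape)
# updated == [[11, 99, 13], [21, 22, 88]]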
def create_model(bert_config, is_training, input_ids, input_len, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings): """Creates a sequence model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_size = num_labels with tf.variable_scope("bert_finetuning"): output_weights = tf.get_variable( "token_output_weights", [output_size, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( "token_output_bias", [output_size], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, output_size]) one_hot_labels = tf.one_hot(labels, depth=num_labels, axis=-1, dtype=tf.float32) entropy_loss = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels, logits=logits, dim=-1, name="loss") per_example_loss = tf.reduce_sum(tf.slice(entropy_loss,begin=[0,1],size=[-1,input_len[0]]), axis=-1) loss = tf.reduce_mean(per_example_loss) probs = tf.nn.softmax(logits, axis=-1) return (loss, per_example_loss, probs, logits)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) return (start_logits, end_logits)
def get_shuffle_loss(model_config, seq_output, label_ids, label_weights): sequence_shape = modeling.get_shape_list(seq_output, expected_rank=[3]) seq_length = sequence_shape[1] width = sequence_shape[2] seq_output = tf.reshape(seq_output, [-1, width]) with tf.variable_scope("cls/shuffle"): with tf.variable_scope("transform"): seq_output = tf.layers.dense( seq_output, units=seq_length, activation=modeling.get_activation(model_config.hidden_act), kernel_initializer=modeling.create_initializer( model_config.initializer_range)) seq_output = modeling.layer_norm(seq_output) output_bias = tf.get_variable("output_bias", shape=[seq_length], initializer=tf.zeros_initializer()) logits = tf.nn.bias_add(seq_output, output_bias) log_probs = tf.nn.log_softmax(logits, axis=-1) label_ids = tf.reshape(label_ids, [-1]) label_weights = tf.reshape(tf.cast(label_weights, tf.float32), [-1]) one_hot_labels = tf.one_hot(label_ids, depth=seq_length, dtype=tf.float32) per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) numerator = tf.reduce_sum(label_weights * per_example_loss) denominator = tf.reduce_sum(label_weights) + 1e-5 loss = numerator / denominator return loss, per_example_loss, log_probs
def create_model(self, bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float32) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable("cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape( final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) return logits
def get_masked_lm_output(self, bert_config, input_tensor, output_weights, positions, label_ids, label_weights, trainable): """Get loss and log probs for the masked LM.""" input_tensor = self.gather_indexes(input_tensor, positions) if self.is_negsample: logits_2D = input_tensor label_flat = tf.reshape( label_ids, [-1, 1]) # 1 is the number of positive example num_sampled = int( 0.2 * self.model_para['item_size']) # sample 20% as negatives loss = tf.nn.sampled_softmax_loss(self.softmax_w, self.softmax_b, label_flat, logits_2D, num_sampled, self.model_para['item_size']) else: sequence_shape = modeling.get_shape_list(positions) batch_size = sequence_shape[0] seq_length = sequence_shape[1] residual_channels = input_tensor.get_shape().as_list()[-1] input_tensor = tf.reshape(input_tensor, [-1, seq_length, residual_channels]) logits = ops.conv1d(tf.nn.relu(input_tensor), self.model_para['item_size'], name='logits') logits_2D = tf.reshape(logits, [-1, self.model_para['item_size']]) label_flat = tf.reshape(label_ids, [-1]) loss = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=label_flat, logits=logits_2D) loss = tf.reduce_mean(loss) #not sure the impact, 0.001 is empirical value # regularization = 0.001 * tf.reduce_mean([tf.nn.l2_loss(v) for v in tf.trainable_variables()]) # loss=loss+regularization return loss
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) unique_ids = features["unique_ids"] input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) (start_logits, end_logits) = create_model( bert_config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): # cross-entropy loss # convert the position index into a one-hot label one_hot_positions = tf.one_hot( positions, depth=seq_length, dtype=tf.float32) # take the natural log of the softmax output, i.e. the per-position log-probabilities of the logits log_probs = tf.nn.log_softmax(logits, axis=-1) # sum (one-hot label times per-position log-probability), then take the mean over the batch loss = -tf.reduce_mean( tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) # the cross-entropy: the accumulated product of the true labels and the log of the predicted probabilities return loss start_positions = features["start_positions"] end_positions = features["end_positions"] # loss for the answer start position start_loss = compute_loss(start_logits, start_positions) # loss for the answer end position end_loss = compute_loss(end_logits, end_positions) # total loss total_loss = (start_loss + end_loss) / 2.0 # build the optimizer / training op train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.PREDICT: predictions = { "unique_ids": unique_ids, "start_logits": start_logits, "end_logits": end_logits, } output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: raise ValueError( "Only TRAIN and PREDICT modes are supported: %s" % (mode)) return output_spec
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) unique_ids = features["unique_ids"] input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] is_training = (mode == tfes.estimator.ModeKeys.TRAIN) (start_logits, end_logits) = create_model( bert_config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) # tf.logging.info("**** Trainable Variables ****") # for var in tvars: # init_string = "" # if var.name in initialized_variable_names: # init_string = ", *INIT_FROM_CKPT*" # tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, # init_string) output_spec = None if mode == tfes.estimator.ModeKeys.TRAIN: seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): one_hot_positions = tf.one_hot(positions, depth=seq_length, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=-1) loss = -tf.reduce_mean( tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss start_positions = features["start_positions"] end_positions = features["end_positions"] start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2.0 train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) output_spec = tpu.TPUEstimatorSpec(mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tfes.estimator.ModeKeys.PREDICT: # outer = tf.matmul(tf.expand_dims(tf.nn.softmax(start_logits), axis=2), # tf.expand_dims(tf.nn.softmax(end_logits), axis=1)) # outer = tf.matrix_band_part(outer, -1, 15) # keep the upper-triangular band of 15 diagonals, i.e. the answer span is limited to at most 15+1 tokens # yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) # index of the maximum along axis 1 # yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) predictions = { "unique_ids": unique_ids, "start_logits": start_logits, "end_logits": end_logits, # "yp1": yp1, # "yp2": yp2, } output_spec = tpu.TPUEstimatorSpec(mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: raise ValueError("Only TRAIN and PREDICT modes are supported: %s" % (mode)) return output_spec
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for Estimator.""" if FLAGS.verbose_logging: tf.compat.v1.logging.info("*** Features ***") for name in sorted(features.keys()): tf.compat.v1.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) unique_ids = features["unique_ids"] input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) if not is_training and FLAGS.use_trt: trt_graph = get_frozen_tftrt_model(bert_config, input_ids.shape, use_one_hot_embeddings, init_checkpoint) (start_logits, end_logits) = tf.import_graph_def(trt_graph, input_map={'input_ids':input_ids, 'input_mask':input_mask, 'segment_ids':segment_ids}, return_elements=['unstack:0', 'unstack:1'], name='') predictions = { "unique_ids": unique_ids, "start_logits": start_logits, "end_logits": end_logits, } output_spec = tf.estimator.EstimatorSpec( mode=mode, predictions=predictions) return output_spec (start_logits, end_logits) = create_model( bert_config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} if init_checkpoint and (hvd is None or hvd.rank() == 0): (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) tf.train.init_from_checkpoint(init_checkpoint, assignment_map) if FLAGS.verbose_logging: tf.compat.v1.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.compat.v1.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): one_hot_positions = tf.one_hot( positions, depth=seq_length, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=-1) loss = -tf.reduce_mean( tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss start_positions = features["start_positions"] end_positions = features["end_positions"] start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2.0 train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, amp, FLAGS.num_accumulation_steps) output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, train_op=train_op) elif mode == tf.estimator.ModeKeys.PREDICT: dummy_op = tf.no_op() # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite if amp: loss_scaler = tf.train.experimental.FixedLossScale(1) dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite( optimization.LAMBOptimizer(learning_rate=0.0), loss_scaler) predictions = { "unique_ids": unique_ids, "start_logits": start_logits, "end_logits": end_logits, } output_spec = tf.estimator.EstimatorSpec( mode=mode, predictions=predictions) else: raise ValueError( "Only TRAIN and PREDICT modes are supported: %s" % (mode)) return output_spec
def create_model(bert_config, is_training, slot_list, features, num_class_labels, use_one_hot_embeddings): """Creates a classification model.""" input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] model = modeling.BertModel(config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) # In the demo, we are doing a simple classification task on the entire # segment. # # If you want to use the token-level output, use model.get_sequence_output() # instead. class_output_layer = model.get_pooled_output() token_output_layer = model.get_sequence_output() token_output_shape = modeling.get_shape_list(token_output_layer, expected_rank=3) batch_size = token_output_shape[0] seq_length = token_output_shape[1] hidden_size = token_output_shape[2] # Define prediction variables class_proj_layer_dim = [hidden_size] for idx in range(FLAGS.num_class_hidden_layer): class_proj_layer_dim.append(64) class_proj_layer_dim.append(num_class_labels) token_proj_layer_dim = [hidden_size] for idx in range(FLAGS.num_token_hidden_layer): token_proj_layer_dim.append(64) token_proj_layer_dim.append(2) if is_training: # I.e., 0.1 dropout class_output_layer = tf.nn.dropout(class_output_layer, keep_prob=(1 - FLAGS.dropout_rate)) token_output_layer = tf.nn.dropout(token_output_layer, keep_prob=(1 - FLAGS.dropout_rate)) total_loss = 0 per_slot_per_example_loss = {} per_slot_class_logits = {} per_slot_start_logits = {} per_slot_end_logits = {} for slot in slot_list: start_pos = features["start_pos_%s" % slot] end_pos = features["end_pos_%s" % slot] class_label_id = features["class_label_id_%s" % slot] slot_scope_name = "slot_%s" % slot if slot == 'price range': slot_scope_name = "slot_price" with tf.variable_scope(slot_scope_name): class_list_output_weights = [] class_list_output_bias = [] for l_idx in range(len(class_proj_layer_dim) - 1): dim_in = class_proj_layer_dim[l_idx] dim_out = class_proj_layer_dim[l_idx + 1] class_list_output_weights.append( tf.get_variable( "class/output_weights_%d" % l_idx, [dim_in, dim_out], initializer=tf.truncated_normal_initializer( stddev=0.02))) class_list_output_bias.append( tf.get_variable("class/output_bias_%d" % l_idx, [dim_out], initializer=tf.zeros_initializer())) token_list_output_weights = [] token_list_output_bias = [] for l_idx in range(len(token_proj_layer_dim) - 1): dim_in = token_proj_layer_dim[l_idx] dim_out = token_proj_layer_dim[l_idx + 1] token_list_output_weights.append( tf.get_variable( "token/output_weights_%d" % l_idx, [dim_in, dim_out], initializer=tf.truncated_normal_initializer( stddev=0.02))) token_list_output_bias.append( tf.get_variable("token/output_bias_%d" % l_idx, [dim_out], initializer=tf.zeros_initializer())) with tf.variable_scope("loss"): class_logits = util.fully_connect_layers( class_output_layer, class_list_output_weights, class_list_output_bias) one_hot_class_labels = tf.one_hot(class_label_id, depth=num_class_labels, dtype=tf.float32) class_loss = tf.losses.softmax_cross_entropy( one_hot_class_labels, class_logits, reduction=tf.losses.Reduction.NONE) token_is_pointable = tf.cast(tf.equal(class_label_id, 2), dtype=tf.float32) token_output_layer = tf.reshape( token_output_layer, [batch_size * seq_length, hidden_size]) token_logits = util.fully_connect_layers( token_output_layer, token_list_output_weights, token_list_output_bias) token_logits = tf.reshape(token_logits, [batch_size, seq_length, 2]) 
token_logits = tf.transpose(token_logits, [2, 0, 1]) unstacked_token_logits = tf.unstack(token_logits, axis=0) (start_logits, end_logits) = (unstacked_token_logits[0], unstacked_token_logits[1]) def compute_loss(logits, positions): one_hot_positions = tf.one_hot(positions, depth=seq_length, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=1) loss = -tf.reduce_sum(one_hot_positions * log_probs, axis=1) return loss token_loss = ( compute_loss(start_logits, start_pos) + compute_loss(end_logits, end_pos)) / 2.0 # per example if not FLAGS.location_loss_for_nonpointable: token_loss *= token_is_pointable per_example_loss = FLAGS.class_loss_ratio * class_loss + ( 1 - FLAGS.class_loss_ratio) * token_loss total_loss += tf.reduce_sum(per_example_loss) per_slot_per_example_loss[slot] = per_example_loss per_slot_class_logits[slot] = class_logits per_slot_start_logits[slot] = start_logits per_slot_end_logits[slot] = end_logits return (total_loss, per_slot_per_example_loss, per_slot_class_logits, per_slot_start_logits, per_slot_end_logits)
use_one_hot_embeddings=False ) (start_logits, end_logits) = cqa_model(bert_representation) tvars = tf.trainable_variables() initialized_variable_names = {} if FLAGS.init_checkpoint: (assignment_map, initialized_variable_names) = modeling.get_assigment_map_from_checkpoint(tvars, FLAGS.init_checkpoint) tf.train.init_from_checkpoint(FLAGS.init_checkpoint, assignment_map) # compute loss seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): one_hot_positions = tf.one_hot( positions, depth=seq_length, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=-1) loss = -tf.reduce_mean(tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss # get the max prob for the predicted start/end position start_probs = tf.nn.softmax(start_logits, axis=-1) start_prob = tf.reduce_max(start_probs, axis=-1) end_probs = tf.nn.softmax(end_logits, axis=-1) end_prob = tf.reduce_max(end_probs, axis=-1) start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions)
def gec_create_model(bert_config, is_training, input_sequence, input_mask, segment_ids, edit_sequence, use_one_hot_embeddings, mode, copy_weight, use_bert_more, insert_ids, multitoken_insert_ids, subtract_replaced_from_replacement): """Creates a classification model.""" # insert_ids: word ids of unigram inserts (list) # multitoken_insert_ids: word_ids of bigram inserts (list of tuples of length 2) # Defining the space of all possible edits: # unk, sos and eos are dummy edits mapped to 0, 1 and 2 respectively # copy is mapped to 3 # del is mapped to 4 num_appends = len(insert_ids) + len(multitoken_insert_ids) num_replaces = num_appends # appends and replacements come from the same set (inserts and multitoken_inserts) append_begin = 5 # First append edit (mapped to 5) append_end = append_begin + num_appends - 1 #Last append edit rep_begin = append_end + 1 # First replace edit rep_end = rep_begin + num_replaces - 1 #Last replace edit num_suffix_transforms = 58 #num of transformation edits num_labels = 5 + num_appends + num_replaces + num_suffix_transforms # total number of edits print("************ num of labels : {} ***************".format(num_labels)) config = bert_config input_sequence_shape = modeling.get_shape_list(input_sequence,2) batch_size = input_sequence_shape[0] seq_len = input_sequence_shape[1] if not use_bert_more: #default use of bert (without logit factorisation) model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_sequence, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) output_layer = model.get_sequence_output() else: # LOGIT FACTORISATION is On! model = modified_modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_sequence, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) output_layer = model.get_sequence_output() replace_layer = output_layer[:,seq_len:2*seq_len,:] #representation of replacement slots as described in paper append_layer = output_layer[:,2*seq_len:3*seq_len,:] #representation of append slots as described in paper output_layer = output_layer[:,0:seq_len,:] output_layer_shape = modeling.get_shape_list(output_layer,3) hidden_size = output_layer_shape[-1] flattened_output_layer = tf.reshape(output_layer,[-1, hidden_size]) h_edit = flattened_output_layer if use_bert_more: h_word = flattened_output_layer flattened_replace_layer = tf.reshape(replace_layer,[-1, hidden_size]) flattened_append_layer = tf.reshape(append_layer, [-1, hidden_size]) m_replace = flattened_replace_layer m_append = flattened_append_layer with tf.variable_scope("cls/predictions"): with tf.variable_scope("transform"): h_word = tf.layers.dense( h_word, units=bert_config.hidden_size, activation=modeling.get_activation(bert_config.hidden_act), kernel_initializer=modeling.create_initializer( bert_config.initializer_range)) h_word = modeling.layer_norm(h_word) with tf.variable_scope("cls/predictions",reuse=True): with tf.variable_scope("transform",reuse=True): m_replace = tf.layers.dense( m_replace, units=bert_config.hidden_size, activation=modeling.get_activation(bert_config.hidden_act), kernel_initializer=modeling.create_initializer( bert_config.initializer_range)) m_replace = modeling.layer_norm(m_replace) with tf.variable_scope("cls/predictions",reuse=True): with tf.variable_scope("transform",reuse=True): m_append = tf.layers.dense( m_append, units=bert_config.hidden_size, activation=modeling.get_activation(bert_config.hidden_act), 
kernel_initializer=modeling.create_initializer( bert_config.initializer_range)) m_append = modeling.layer_norm(m_append) word_embedded_input = model.word_embedded_input flattened_word_embedded_input = tf.reshape(word_embedded_input, [-1, hidden_size]) labels = edit_sequence edit_weights = tf.get_variable( "edit_weights", [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) if is_training: h_edit = tf.nn.dropout(h_edit, keep_prob=0.9) if use_bert_more: # append/replace weight vector for a given append or replace operation # correspond to word embedding for its token argument # for multitoken append/replace (e.g. has been) # weight vector is sum of word embeddings of token arguments append_weights = edit_word_embedding_lookup(model.embedding_table, insert_ids, use_one_hot_embeddings, config.vocab_size, config.hidden_size) replace_weights = append_weights #tokens in replace and append vocab are same #(i.e. inserts and multitoken_inserts) multitoken_append_weights = wem_utils.edit_embedding_loopkup(model.embedding_table, multitoken_insert_ids, use_one_hot_embeddings, config.vocab_size, config.hidden_size) multitoken_replace_weights = multitoken_append_weights #tokens in replace and append vocab are same #(i.e. inserts and multitoken_inserts) append_weights = tf.concat([append_weights, multitoken_append_weights],0) replace_weights = tf.concat([replace_weights, multitoken_replace_weights],0) with tf.variable_scope("loss"): edit_logits = tf.matmul(h_edit, edit_weights, transpose_b=True) #first term in eq3 in paper logits = edit_logits if use_bert_more: #=============== inplace_word_logits==============# #2nd term in eq3 in paper inplace_logit = tf.reduce_sum(h_word * flattened_word_embedded_input, axis=1, keepdims=True) #copy #inplace_logit = tf.reduce_sum(m_replace * flattened_word_embedded_input, axis=1, keepdims=True) #copy inplace_logit_appends = tf.tile(inplace_logit,[1,num_appends]) inplace_logit_transforms = tf.tile(inplace_logit,[1,num_suffix_transforms]) zero_3_logits = tf.zeros([batch_size*seq_len,3]) #unk sos eos zero_1_logits = tf.zeros([batch_size*seq_len,1]) # del zero_replace_logits = tf.zeros([batch_size*seq_len,num_replaces]) concat_list = [zero_3_logits, inplace_logit, zero_1_logits]\ + [inplace_logit_appends]\ + [zero_replace_logits]\ + [inplace_logit_transforms] inplace_word_logits = tf.concat(concat_list,1) #======additional (insert,replace) logits ====# #3rd term in eqn3 in paper zero_5_logits = tf.zeros([batch_size*seq_len,5]) append_logits = tf.matmul(m_append, append_weights, transpose_b=True) if subtract_replaced_from_replacement: replace_logits = replacement_minus_replaced_logits(m_replace, flattened_word_embedded_input, replace_weights) else: replace_logits = tf.matmul(m_replace, replace_weights, transpose_b=True) suffix_logits = tf.zeros([batch_size*seq_len,num_suffix_transforms]) concat_list = [zero_5_logits, append_logits, replace_logits, suffix_logits] additional_logits = tf.concat(concat_list,1) #====================================================# logits = edit_logits + inplace_word_logits + additional_logits logits_bias = tf.get_variable("output_bias", shape=[num_labels], initializer=tf.zeros_initializer()) logits += logits_bias logits = tf.reshape(logits, [output_layer_shape[0], output_layer_shape[1], num_labels]) log_probs = tf.nn.log_softmax(logits, axis=-1) probs = tf.nn.softmax(logits,axis=-1) one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, 
axis=-1) per_token_loss = per_token_loss * tf.to_float(input_mask) mask = copy_weight*tf.to_float(tf.equal(labels,3)) + tf.to_float(tf.not_equal(labels,3)) masked_per_token_loss = per_token_loss * mask per_example_loss = tf.reduce_sum(masked_per_token_loss, axis=-1) loss = tf.reduce_mean(per_example_loss) return (loss, per_example_loss, logits, probs)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config,#たぶんこの設定にしたがってbertを呼び出すということ is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) final_hidden = model.get_sequence_output()#Bertの最終層 final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) #ここをTransformerにする """ output_weights = tf.get_variable( #/はスコープの区切りを表す。だからcls/squad/output_weightsはclsスコープのsquadスコープのoutput_weightsという変数を表す #変数がないときは定義し、ある時はそれを呼び出す "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( #/はスコープの区切りを表す。だからcls/squad/output_weightsはclsスコープのsquadスコープのoutput_weightsという変数を表す #変数がないときは定義し、ある時はそれを呼び出す "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) return (start_logits, end_logits) """ #Transformer層 #bertの中のtransformerよりずっとスペック低くしている transformer_outputs = modeling.transformer_model(input_tensor=final_hidden_matrix, attention_mask=None, hidden_size=5, num_hidden_layers=2, num_attention_heads=2, intermediate_size=20, intermediate_act_fn=modeling.gelu, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, initializer_range=0.02, do_return_all_layers=False)#現状Falseのみ #線型層 output_weights = tf.get_variable( #/はスコープの区切りを表す。だからcls/squad/output_weightsはclsスコープのsquadスコープのoutput_weightsという変数を表す #変数がないときは定義し、ある時はそれを呼び出す "cls/squad/output_weights", [30000, 5], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( #/はスコープの区切りを表す。だからcls/squad/output_weightsはclsスコープのsquadスコープのoutput_weightsという変数を表す #変数がないときは定義し、ある時はそれを呼び出す "cls/squad/output_bias", [30000], initializer=tf.zeros_initializer()) logits = tf.matmul(transformer_outputs, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) #max ids = tf.reduce_max(logits,axis=0) #Transformerのテンソルとidを出力。損失を測るのに両方使うため return (ids,transformer_outputs)
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) unique_ids = features["unique_ids"] input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) (start_logits, end_logits) = create_model( bert_config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): one_hot_positions = tf.one_hot( positions, depth=seq_length, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=-1) loss = -tf.reduce_mean( tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss start_positions = features["start_positions"] end_positions = features["end_positions"] start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2.0 train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.PREDICT: predictions = { "unique_ids": unique_ids, "start_logits": start_logits, "end_logits": end_logits, } output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: raise ValueError( "Only TRAIN and PREDICT modes are supported: %s" % (mode)) return output_spec
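# A small NumPy sketch (added for illustration, not from the original code) of the
# compute_loss helper above: cross-entropy between the softmax over sequence positions and
# the one-hot true start (or end) position. The logits and positions below are toy values.
import numpy as np

seq_length = 4
logits = np.array([[2.0, 0.5, 0.1, -1.0],
                   [0.0, 0.0, 3.0, 0.0]])    # [batch, seq_length]
positions = np.array([0, 2])                  # true start (or end) indices

log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
one_hot_positions = np.eye(seq_length)[positions]
loss = -(one_hot_positions * log_probs).sum(axis=-1).mean()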