def _AddMaskedLanguageModelLoss(
    input_blob,
    output_weights_blob,
    positions_blob,
    label_id_blob,
    label_weight_blob,
    seq_length,
    hidden_size,
    vocab_size,
    max_predictions_per_seq,
    hidden_act,
    initializer_range,
):
    """Build the masked-LM prediction head plus its weighted cross-entropy loss.

    Args:
        input_blob: Encoder output; gathered at `positions_blob` to keep only
            the masked positions (via `_GatherIndexes`).
        output_weights_blob: Projection matrix multiplied transposed against the
            transformed hidden states — presumably the shared word-embedding
            table, as in standard BERT (TODO confirm against caller).
        positions_blob: Indices of the masked tokens within each sequence.
        label_id_blob: Target token ids; flattened to 1-D before the softmax loss.
        label_weight_blob: Per-prediction weights; assumed shape
            [batch, max_predictions_per_seq] since it is multiplied against the
            reshaped per-example loss below — TODO confirm.
        seq_length, hidden_size, vocab_size, max_predictions_per_seq: Model dims.
        hidden_act: Either a callable applied after the dense layer, or an
            activation enum/name fused into `_FullyConnected`.
        initializer_range: Stddev-style range for weight initialization.

    Returns:
        (loss, pre_example_loss, logit_blob): the normalized per-example loss,
        the raw per-prediction cross-entropy reshaped to
        [-1, max_predictions_per_seq], and the vocabulary logits.
    """
    with flow.scope.namespace("other"):
        # Mean (over the batch) of the summed label weights per example:
        # sum_b(sum_p w[b,p]) / batch_size. `ones` is built from the weights
        # (w*0 + 1) so that reducing it yields the dynamic batch size.
        sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1])
        ones = sum_label_weight_blob * 0.0 + 1.0
        sum_label_weight_blob = flow.math.reduce_sum(sum_label_weight_blob)
        batch_size = flow.math.reduce_sum(ones)
        sum_label_weight_blob = sum_label_weight_blob / batch_size
    with flow.scope.namespace("cls-predictions"):
        # Keep only the hidden states at the masked positions.
        input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)
        with flow.scope.namespace("transform"):
            # When hidden_act is a Python callable it is applied manually
            # after the dense layer, so the layer itself gets kNone.
            if callable(hidden_act):
                act_fn = op_conf_util.kNone
            else:
                act_fn = hidden_act
            input_blob = bert_util._FullyConnected(
                input_blob,
                input_size=hidden_size,
                units=hidden_size,
                activation=act_fn,
                weight_initializer=bert_util.CreateInitializer(initializer_range),
                name="dense",
            )
            if callable(hidden_act):
                input_blob = hidden_act(input_blob)
            input_blob = bert_util._LayerNorm(input_blob, hidden_size)
        # NOTE(review): reference TF BERT initializes output_bias to zeros;
        # 1.0 here looks unusual — confirm it is intentional.
        output_bias = flow.get_variable(
            name="output_bias",
            shape=[vocab_size],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(1.0),
        )
        # Project onto the vocabulary with the (transposed) shared weights.
        logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias)
        label_id_blob = flow.reshape(label_id_blob, [-1])
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_id_blob
        )
        pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
        # Zero out padded prediction slots via the label weights.
        numerator = pre_example_loss * label_weight_blob
        with flow.scope.namespace("loss"):
            numerator = flow.math.reduce_sum(numerator, axis=[-1])
            # Epsilon guards against division by zero when no position
            # carries weight.
            denominator = sum_label_weight_blob + 1e-5
            loss = numerator / denominator
        return loss, pre_example_loss, logit_blob
def PooledOutput(sequence_output, hidden_size, initializer_range):
    """Standard BERT pooler: transform the first ([CLS]) token of each sequence.

    Slices token 0 out of `sequence_output`, flattens it to
    [batch, hidden_size], and runs it through a dense layer followed by tanh.
    """
    with flow.scope.namespace("bert-pooler"):
        # Grab the first token of every sequence, then drop the length axis.
        cls_token = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
        cls_token = flow.reshape(cls_token, [-1, hidden_size])
        dense_out = bert_util._FullyConnected(
            cls_token,
            input_size=hidden_size,
            units=hidden_size,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name="dense",
        )
        return flow.math.tanh(dense_out)
def SQuAD(
    input_ids_blob,
    input_mask_blob,
    token_type_ids_blob,
    vocab_size,
    seq_length=512,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=16,
    initializer_range=0.02,
):
    """Build the BERT encoder plus the SQuAD span-prediction head.

    Runs the full BERT backbone, then projects every token's final hidden
    state onto two scores and splits them into per-token start/end logits.

    Returns:
        (start_logits, end_logits), each of shape [batch, seq_length, 1].
    """
    backbone = bert_util.BertBackbone(
        input_ids_blob=input_ids_blob,
        input_mask_blob=input_mask_blob,
        token_type_ids_blob=token_type_ids_blob,
        vocab_size=vocab_size,
        seq_length=seq_length,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        hidden_act=hidden_act,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        max_position_embeddings=max_position_embeddings,
        type_vocab_size=type_vocab_size,
        initializer_range=initializer_range,
    )
    with flow.scope.namespace("cls-squad"):
        # Flatten [batch, seq, hidden] -> [batch*seq, hidden] for the dense layer.
        token_states = flow.reshape(backbone.sequence_output(), [-1, hidden_size])
        span_scores = bert_util._FullyConnected(
            token_states,
            hidden_size,
            units=2,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name="output",
        )
        span_scores = flow.reshape(span_scores, [-1, seq_length, 2])
        # Channel 0 holds start-position scores, channel 1 end-position scores.
        start_logits = flow.slice(span_scores, [None, None, 0], [None, None, 1])
        end_logits = flow.slice(span_scores, [None, None, 1], [None, None, 1])
    return start_logits, end_logits
def _AddMaskedLanguageModel(
    input_blob,
    output_weights_blob,
    positions_blob,
    seq_length,
    hidden_size,
    vocab_size,
    hidden_act,
    initializer_range,
):
    """Build the masked-LM prediction head and return vocabulary logits.

    Gathers the encoder outputs at the masked positions, applies the
    pretraining-only transform (dense + activation + layer norm), and
    projects the result onto the vocabulary with `output_weights_blob`
    (transposed) plus a learned bias.
    """
    with flow.scope.namespace("cls-predictions"):
        # Keep only the hidden states of the masked tokens.
        masked_blob = _GatherIndexes(
            input_blob, positions_blob, seq_length, hidden_size
        )
        # Extra non-linear transform; only used during pretraining.
        with flow.scope.namespace("transform"):
            # A callable activation is applied manually after the dense
            # layer, so the layer itself gets kNone in that case.
            fused_act = op_conf_util.kNone if callable(hidden_act) else hidden_act
            masked_blob = bert_util._FullyConnected(
                masked_blob,
                input_size=hidden_size,
                units=hidden_size,
                activation=fused_act,
                weight_initializer=bert_util.CreateInitializer(initializer_range),
                name="dense",
            )
            if callable(hidden_act):
                masked_blob = hidden_act(masked_blob)
            masked_blob = bert_util._LayerNorm(masked_blob, hidden_size)
        # `output_weights_blob` is shared with the word embeddings; only the
        # bias is a new variable here.
        output_bias = flow.get_variable(
            name="output_bias",
            shape=[vocab_size],
            dtype=masked_blob.dtype,
            initializer=flow.constant_initializer(1.0),
        )
        logit_blob = flow.nn.bias_add(
            flow.matmul(masked_blob, output_weights_blob, transpose_b=True),
            output_bias,
        )
        return logit_blob