def one_column_cached_transformer(self, decoder_input, cached_layers):
    """Run a single incremental decoding step through the cached transformer.

    The positional embedding of the current step is added to
    ``decoder_input``; the sum goes through layer norm and dropout and is then
    handed, together with ``cached_layers``, to
    ``modeling.cached_transformer_model``.

    NOTE(review): the source this was taken from had its indentation
    flattened; the extent of the two ``variable_scope`` blocks below is a
    reconstruction -- confirm against the original file.

    Args:
        decoder_input: input of the current step; the inline shape notes mark
            it as [batch, 1, hid_size].
        cached_layers: indexable collection of cached tensors. Axis 1 of the
            first entry must be statically known; it is used as the current
            position.

    Returns:
        The last element of what ``modeling.cached_transformer_model``
        returns ([batch, 1, hid_size] according to the shape notes).
    """
    hp = self.hparams
    # Static length of everything cached so far == index of the current step.
    step = cached_layers[0].shape.as_list()[1]

    with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
        # One row of the positional table: [1, hid_size].
        position_row, _ = modeling.embedding_lookup(
            input_ids=tf.constant([step]),  # [1]
            vocab_size=hp.max_premise,  # >= premise_len
            embedding_size=hp.hidden_size,
            initializer_range=hp.initializer_range,
            word_embedding_name='positional_embedding',
        )
        position_row = tf.reshape(position_row, [1, 1, hp.hidden_size])
        # [batch, 1, hid_size] + [1, 1, hid_size] -> [batch, 1, hid_size]
        positioned_input = decoder_input + position_row
        step_input = modeling.layer_norm_and_dropout(positioned_input,
                                                     hp.dropout_prob)

    with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE):
        # The new position may attend to everything computed so far and to
        # itself, hence an all-ones mask of shape [batch, 1, step + 1].
        n_batch = tf.shape(step_input)[0]
        full_mask = tf.ones([n_batch, 1, step + 1])
        activation_fn = modeling.get_activation(hp.hidden_act)
        transformer_kwargs = dict(
            input_vector=step_input,
            cached_layers=cached_layers,
            attention_mask=full_mask,
            hidden_size=hp.hidden_size,
            num_hidden_layers=hp.num_decode_layers,
            num_attention_heads=hp.num_attention_heads,
            intermediate_size=hp.intermediate_size,
            intermediate_act_fn=activation_fn,
            hidden_dropout_prob=hp.dropout_prob,
            attention_probs_dropout_prob=hp.dropout_prob,
            initializer_range=hp.initializer_range,
            do_return_all_layers=True,
            attention_top_k=hp.attention_top_k,
            densify_attention_mask=hp.densify_attention_mask,
        )
        layer_outputs = modeling.cached_transformer_model(**transformer_kwargs)
        # Output of the top layer: [batch, 1, hid_size].
        top_layer = layer_outputs[-1]
    return top_layer
def bert_crf(bert_config, is_training, input_ids, segment_ids, input_mask,
             label_ids, sequence_length, num_labels, use_one_hot_embeddings):
    """Combine bert + linear projection + crf_layer.

    The encoder output is layer-normalised (with dropout 0.5 when training),
    flattened to ``[-1, hidden_size]``, projected to ``num_labels`` by
    ``linear_layer`` and passed to ``crf_layer``.

    :param bert_config: configuration forwarded to ``bert``
    :param is_training: train state; selects dropout + layer norm vs layer
        norm only
    :type is_training: bool
    :param input_ids: input token ids; axis 0 is used as the batch size
    :param segment_ids: segment ids forwarded to ``bert``
    :param input_mask: input mask forwarded to ``bert``
    :param label_ids: label ids forwarded to ``crf_layer``
    :param sequence_length: per-example lengths forwarded to ``crf_layer``
    :param num_labels: number of labels (output width of the linear layer)
    :type num_labels: int
    :param use_one_hot_embeddings: forwarded to ``bert``
    :return: whatever ``crf_layer`` returns
    """
    batch_size = tf.shape(input_ids)[0]
    bert_out = bert(bert_config, is_training, input_ids, input_mask,
                    segment_ids, use_one_hot_embeddings)

    # Take the feature width from the encoder output instead of assuming the
    # BERT-base value, so other model sizes work too. 768 is kept as the
    # fallback for the case where the static shape is unknown, which matches
    # the previous hard-coded behaviour.
    static_shape = bert_out.shape
    hidden_size = (static_shape.as_list()[-1]
                   if static_shape.ndims is not None else None)
    if hidden_size is None:
        hidden_size = 768

    if is_training:
        bert_out = layer_norm_and_dropout(bert_out, 0.5)
    else:
        bert_out = layer_norm(bert_out)

    # Flatten batch and time so the linear layer sees one row per token.
    bert_out = tf.reshape(bert_out, [-1, hidden_size])
    linear_out = linear_layer(bert_out, hidden_size, num_labels, "linear")
    # NOTE(review): max_seq_length is not a parameter of this function; it is
    # presumably a module-level name -- confirm it is defined before use.
    crf_out = crf_layer(linear_out, label_ids, batch_size, sequence_length,
                        num_labels, max_seq_length, "crf")
    return crf_out
def bert_blstm_crf(bert_config, is_training, input_ids, segment_ids,
                   input_mask, label_ids, sequence_length, num_labels,
                   use_one_hot_embeddings):
    """Combine bert + blstm + crf_layer.

    :param bert_config: bert_config from model config file
    :type bert_config: dict
    :param is_training: train state
    :type is_training: bool
    :param input_ids: input text ids for each char
    :type input_ids: list
    :param segment_ids: 0 for first sentence and 1 for second sentence,
        for this task, all is 0, length is max_seq_length
    :type segment_ids: list
    :param input_mask: mask for sentence to suit bert model,
        for this task, all is 1, length is max_seq_length
    :type input_mask: list
    :param label_ids: BIO labels ids
    :type label_ids: list
    :param sequence_length: sequence length for each input sentence before
        padding
    :type sequence_length: list, [length_sentence1, 2,..]
    :param num_labels: number of BIO labels
    :type num_labels: int
    :param use_one_hot_embeddings: whether to use one-hot embeddings
    :type use_one_hot_embeddings: bool
    :return: total_loss, per_example_loss, logits for ner, pred_ids using
        viterbi
    :rtype: tuple
    """
    batch_size = tf.shape(input_ids)[0]
    bert_out = bert(bert_config, is_training, input_ids, input_mask,
                    segment_ids, use_one_hot_embeddings)
    y_pred = blstm(is_training, bert_out)

    if is_training:
        y_pred = layer_norm_and_dropout(y_pred, 0.5)
    else:
        # Normalise the BLSTM output here as well. This branch used to
        # normalise bert_out, which is never read again, so inference ran
        # without the layer norm that training applies to y_pred.
        y_pred = layer_norm(y_pred)

    # Prefer the statically known feature width (a plain int); fall back to
    # the dynamic shape tensor, as before, only when it is unknown.
    static_shape = y_pred.shape
    hidden_size = (static_shape.as_list()[-1]
                   if static_shape.ndims is not None else None)
    if hidden_size is None:
        hidden_size = tf.shape(y_pred)[-1]

    blstm_out = linear_layer(y_pred, hidden_size, num_labels, "linear")
    # NOTE(review): max_seq_length is not a parameter of this function; it is
    # presumably a module-level name -- confirm it is defined before use.
    crf_out = crf_layer(blstm_out, label_ids, batch_size, sequence_length,
                        num_labels, max_seq_length, "crf")
    return crf_out
def body(self, features):
    """Build the encoder/decoder graph and return ``(logits, losses)``.

    The encoder is always built through ``self.build_encoder``. What happens
    next depends on ``features``:

    * without a ``'targets'`` entry, decoding is delegated to
      ``self.greedy_decode_8steps`` and a slice of the encoder attention
      weights is added to the returned logits dict;
    * with ``'targets'``, a teacher-forced decoder is built: the decoder input
      is ``[cls_vector, theorem embedding, premise vectors shifted right]``
      plus positional embeddings, run through a causally masked transformer.
      Position 0 of the output predicts the theorem, the remaining positions
      predict the premises by dot product with the encoder sequence output.

    Side effect: when ``self.is_training`` is false, ``dropout_prob`` is set
    to 0.0 on ``self.hparams`` itself (the object is mutated in place).

    NOTE(review): the source this was taken from had its indentation
    flattened; the nesting of the ``variable_scope`` blocks below is a
    reconstruction -- confirm against the original file.

    Args:
        features: mapping of input tensors. ``'targets'`` (optional),
            ``'theorem'`` and ``'depth'`` are read here; the whole mapping is
            also passed to ``self.build_encoder``.

    Returns:
        A pair ``(logits, losses)`` of dicts. On the training path ``logits``
        has the keys ``theorem_logits``, ``theorem_labels``,
        ``premise_logits``, ``premise_labels`` and ``losses`` has
        ``training``, ``theorem_loss``, ``premise_loss``.
    """
    hparams = self.hparams
    if not self.is_training:
        # Disables dropout for evaluation/inference; mutates self.hparams.
        hparams.dropout_prob = 0.0

    with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
        # attention_weights: [batch, n_head, from_len, to_len]
        sequence_output, cls_vector, attention_weights = self.build_encoder(
            features)

    if 'targets' not in features:
        # No labels available: decode greedily instead of teacher forcing.
        assert self.hparams.dropout_prob == 0.0
        logits, losses = self.greedy_decode_8steps(cls_vector, sequence_output)
        # Keep only the attention rows of from-position 0.
        logits.update(attention_weights=attention_weights[:, :, 0, :])
        return logits, losses

    with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
        with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
            premise = features[
                'targets']  # [batch, premise_len=8] -bad naming:(
            # [batch, premise_len, hid_size]
            premise_vecs = premise_gather_nd(sequence_output, premise)

            batch_size = tf.shape(premise)[0]
            premise_len = premise.shape.as_list()[-1]

            theorem = features['theorem']  # batch, 1
            # [batch, 1, hid_size] and [num_theorems, hid_size]
            # (theorem_emb_table is not used further in this function.)
            theorem_vec, theorem_emb_table = modeling.embedding_lookup(
                input_ids=theorem,  # [batch, 1]
                vocab_size=hparams.num_theorems,
                embedding_size=hparams.hidden_size,
                initializer_range=hparams.initializer_range,
                word_embedding_name='theorem_embedding',
            )
            # NOTE(review): depth is read but never used below; the lookup
            # still requires the 'depth' key to be present in features.
            depth = features['depth']  # batch, 1

            # Teacher forcing: the last premise is dropped so that output
            # position i only sees the targets before it.
            decoder_input = tf.concat(
                [
                    cls_vector,  # [batch, 1, hid_size]
                    theorem_vec,  # [batch, 1, hid_size]
                    premise_vecs[:, :-1, :]  # [batch, premise_len-1, hid_size]
                ],
                axis=1)  # [batch, premise_len + 1, hid_size]
            decode_length = decoder_input.shape.as_list()[1]
            assert decode_length == premise_len + 1

            # [decode_length, hid_size]
            # NOTE(review): position ids run up to decode_length - 1, i.e.
            # premise_len, so max_premise presumably has to be at least
            # premise_len + 1 -- confirm against the hparams.
            pos_embedding, _ = modeling.embedding_lookup(
                input_ids=tf.range(decode_length),  # [decode_length]
                vocab_size=hparams.max_premise,  # >= premise_len
                embedding_size=hparams.hidden_size,
                initializer_range=hparams.initializer_range,
                word_embedding_name='positional_embedding',
            )
            pos_embedding = tf.reshape(
                pos_embedding, [1, decode_length, hparams.hidden_size])

            decoder_input = modeling.layer_norm_and_dropout(
                decoder_input +  # [batch, decode_length, hid_size]
                pos_embedding,  # [1, decode_length, hid_size]
                hparams.dropout_prob)  # [batch, decode_length, hid_size]

        with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE):
            causal_attention_mask = t2t_model.common_layers.ones_matrix_band_part(
                rows=decode_length,
                cols=decode_length,
                num_lower=-1,  # attend to everything before
                num_upper=0,  # attend to nothing after
                out_shape=[1, decode_length, decode_length
                           ])  # 1, decode_length, decode_length
            # [batch, decode_length, decode_length]
            causal_attention_mask = tf.tile(causal_attention_mask,
                                            [batch_size, 1, 1])

            all_decoder_layers = modeling.transformer_model(
                input_tensor=decoder_input,
                attention_mask=causal_attention_mask,
                hidden_size=hparams.hidden_size,
                num_hidden_layers=hparams.num_decode_layers,
                num_attention_heads=hparams.num_attention_heads,
                intermediate_size=hparams.intermediate_size,
                intermediate_act_fn=modeling.get_activation(
                    hparams.hidden_act),
                hidden_dropout_prob=hparams.dropout_prob,
                attention_probs_dropout_prob=hparams.dropout_prob,
                initializer_range=hparams.initializer_range,
                do_return_all_layers=True,
                attention_top_k=hparams.attention_top_k)

            # Each entry of all_decoder_layers is unpacked as a pair; only
            # the first element of the last entry is used.
            decoder_output, _ = all_decoder_layers[
                -1]  # [batch, dec_len, hid_size]
            theorem_feature = decoder_output[:, 0, :]  # [batch, hid_size]
            premise_feature = decoder_output[:, 1:, :]  # [batch, tar_len, hid_size]

    with tf.variable_scope('prediction', reuse=tf.AUTO_REUSE):
        theorem_logits = tf.keras.layers.Dense(  # [batch, num_theorems]
            name='theorem',
            units=hparams.num_theorems,
            use_bias=True,
            kernel_initializer=modeling.create_initializer(
                hparams.initializer_range))(theorem_feature)

        # Premises are scored by dot product against every encoder position.
        premise_logits = tf.matmul(
            a=premise_feature,  # [batch, premise_len, hid_size]
            b=sequence_output,  # [batch, sequence_len, hid_size]
            transpose_b=True,
        )  # [batch, premise_len, sequence_len]

        # [batch * premise_len, sequence_len]
        seq_len = premise_logits.shape.as_list()[-1]
        premise_logits = tf.reshape(premise_logits, [-1, seq_len])

        # Targets equal to 0 get weight 0 in the premise loss.
        premise_weights = tf.cast(premise > 0,
                                  tf.float32)  # [batch, prem_len]
        premise_weights = tf.reshape(premise_weights, [-1])  # [batch * prem_len]
        premise = tf.reshape(premise, [-1, 1])  # [batch * prem_len, 1]

    theorem_loss = tf.losses.sparse_softmax_cross_entropy(
        labels=theorem,  # [batch, 1]
        logits=theorem_logits  # [batch, num_theorems]
    )
    premise_loss = tf.losses.sparse_softmax_cross_entropy(
        labels=premise,  # [batch * premise_len, 1]
        logits=premise_logits,  # [batch * premise_len, sequence_len]
        weights=premise_weights  # [batch * premise_len]
    )

    logits = dict(theorem_logits=theorem_logits,
                  theorem_labels=theorem,
                  premise_logits=premise_logits,
                  premise_labels=premise)
    losses = dict(training=theorem_loss + premise_loss,
                  theorem_loss=theorem_loss,
                  premise_loss=premise_loss)
    return logits, losses
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model: BERT followed by a bi-attention head.

    The token-level BERT output is attended over in its original and in its
    reversed order, both results are concatenated and attended over once
    more, layer-normalised, mean-pooled over the sequence axis and projected
    through a tanh dense layer. A softmax classifier on top produces the
    logits and the cross-entropy loss.

    NOTE(review): the source this was taken from had its indentation
    flattened; the extent of the ``variable_scope`` blocks below is a
    reconstruction -- confirm against the original file, since it determines
    variable names in checkpoints.

    Args:
        bert_config: BERT configuration; ``hidden_dropout_prob``,
            ``hidden_size`` and ``initializer_range`` are read here.
        is_training: Python bool; enables dropout in the head.
        input_ids: forwarded to ``modeling.BertModel``.
        input_mask: forwarded to ``modeling.BertModel``.
        segment_ids: forwarded to ``modeling.BertModel`` as token type ids.
        labels: integer class ids, one-hot encoded with depth ``num_labels``.
        num_labels: number of output classes.
        use_one_hot_embeddings: forwarded to ``modeling.BertModel``.

    Returns:
        Tuple ``(loss, per_example_loss, logits, probabilities)``.
    """
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    # The head works on the token-level output, not on the pooled output.
    # (Earlier BiLSTM and CNN + attention heads that were kept here as
    # commented-out code have been dropped.)
    output_layer = model.get_sequence_output()

    # The caller's bert_config keeps its training-time dropout rate, so it has
    # to be gated on is_training here; otherwise the head applies dropout at
    # eval/predict time and predictions become non-deterministic.
    head_dropout_prob = bert_config.hidden_dropout_prob if is_training else 0.0

    # BERT + bi-attention: attend over the sequence and over its reverse.
    output_layer_reverse = output_layer[:, ::-1, :]
    with tf.variable_scope('att_fw'):
        att_output = modeling.attention_layer(from_tensor=output_layer,
                                              to_tensor=output_layer)
    with tf.variable_scope('att_bw'):
        att_output_reverse = modeling.attention_layer(
            from_tensor=output_layer_reverse,
            to_tensor=output_layer_reverse)
    att_output_mix = tf.concat([att_output, att_output_reverse], axis=2)
    with tf.variable_scope('att_final'):
        att_output_final = modeling.attention_layer(
            from_tensor=att_output_mix, to_tensor=att_output_mix)
    att_output_final = modeling.layer_norm_and_dropout(att_output_final,
                                                       head_dropout_prob)
    # Mean-pool over the sequence axis.
    att_output_final = tf.reduce_mean(att_output_final, axis=1)

    with tf.variable_scope("pooler"):
        # "Pool" the model by projecting the mean-pooled attention output
        # (not the first-token state as in stock BERT) through a tanh layer.
        last_token_tensor = att_output_final
        pooled_output = tf.layers.dense(
            last_token_tensor,
            bert_config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))

    output_layer = pooled_output
    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)