def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""
    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    input_type_ids = features["input_type_ids"]
    extract_indices = features["extract_indices"]

    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=input_type_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    if mode != tf.estimator.ModeKeys.PREDICT:
        raise ValueError("Only PREDICT modes are supported: %s" % (mode))

    tvars = tf.trainable_variables()
    scaffold_fn = None
    (assignment_map,
     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
         tvars, init_checkpoint)
    if use_tpu:

        def tpu_scaffold():
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
    else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

    all_layers = model.get_all_encoder_layers()

    predictions = {
        "unique_ids": unique_ids,
        "extract_indices": extract_indices,
    }

    for (i, layer_index) in enumerate(layer_indexes):
        predictions["layer_output_%d" % i] = all_layers[layer_index]

    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    return output_spec
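# A minimal usage sketch, assuming this `model_fn` comes out of a
# `model_fn_builder`-style closure that binds `bert_config`, `init_checkpoint`,
# `layer_indexes`, `use_tpu`, and `use_one_hot_embeddings` (that wrapper is not
# shown above). `_example_extract_features`, `run_config`, and `input_fn` are
# illustrative names, not part of the original source.
def _example_extract_features(model_fn, input_fn):
    run_config = tf.contrib.tpu.RunConfig(
        master=None,
        tpu_config=tf.contrib.tpu.TPUConfig(num_shards=8))
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,  # CPU/GPU fallback; the model_fn itself is TPU-ready
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=8)
    for result in estimator.predict(input_fn, yield_single_examples=True):
        unique_id = int(result["unique_ids"])
        layer_0 = result["layer_output_0"]  # [seq_len, hidden] activations
        yield unique_id, layer_0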
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings, reuse_flag=False):
    """Creates a multi-label classification model (one sigmoid per label)."""
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    output_layer = model.get_pooled_output()
    hidden_size = output_layer.shape[-1].value

    with tf.variable_scope("weights", reuse=reuse_flag):
        output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        output_bias = tf.get_variable(
            "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        # `is_training` is a boolean tensor here, so dropout is gated with
        # tf.cond rather than a Python-level `if`.
        def apply_dropout_last_layer(output_layer):
            return tf.nn.dropout(output_layer, keep_prob=0.9)

        def not_apply_dropout(output_layer):
            return output_layer

        output_layer = tf.cond(is_training,
                               lambda: apply_dropout_last_layer(output_layer),
                               lambda: not_apply_dropout(output_layer))

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)  # (batch, num_labels)

        # Multi-label: an independent sigmoid per label instead of a softmax.
        probabilities = tf.nn.sigmoid(logits)
        per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=labels, logits=logits)  # (batch, num_labels)
        loss_batch = tf.reduce_sum(per_example_loss, axis=1)  # (batch,)
        loss = tf.reduce_mean(loss_batch)  # scalar

    return loss, per_example_loss, logits, probabilities, model
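# A hedged construction sketch (not from the original source): wiring the
# multi-label model up with TF 1.x placeholders. `max_seq_length=128` and
# `num_labels=1999` are illustrative; `labels` must be a float multi-hot
# tensor because the loss is sigmoid cross-entropy, and `is_training` must be
# a boolean tensor because it feeds `tf.cond`.
def _example_multilabel_graph(bert_config):
    input_ids = tf.placeholder(tf.int32, [None, 128], name="input_ids")
    input_mask = tf.placeholder(tf.int32, [None, 128], name="input_mask")
    segment_ids = tf.placeholder(tf.int32, [None, 128], name="segment_ids")
    labels = tf.placeholder(tf.float32, [None, 1999], name="labels")
    is_training = tf.placeholder(tf.bool, [], name="is_training")
    return create_model(bert_config, is_training, input_ids, input_mask,
                        segment_ids, labels, num_labels=1999,
                        use_one_hot_embeddings=False)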
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 input_span_mask, use_one_hot_embeddings):
    """Creates a span-prediction (SQuAD-style) model."""
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    final_hidden = model.get_sequence_output()

    final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
    batch_size = final_hidden_shape[0]
    seq_length = final_hidden_shape[1]
    hidden_size = final_hidden_shape[2]

    output_weights = tf.get_variable(
        "cls/squad/output_weights", [2, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "cls/squad/output_bias", [2], initializer=tf.zeros_initializer())

    final_hidden_matrix = tf.reshape(final_hidden,
                                     [batch_size * seq_length, hidden_size])
    logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    logits = tf.reshape(logits, [batch_size, seq_length, 2])
    logits = tf.transpose(logits, [2, 0, 1])

    unstacked_logits = tf.unstack(logits, axis=0)
    (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])

    # Apply the output span mask: positions outside the allowed span receive a
    # large negative logit so they vanish after the softmax.
    adder = (1.0 - tf.cast(input_span_mask, tf.float32)) * -10000.0
    start_logits += adder
    end_logits += adder

    return (start_logits, end_logits)
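# A hedged sketch of the loss that usually accompanies these logits (the
# standard BERT SQuAD recipe; not shown in the snippet above).
# `start_positions` and `end_positions` are int tensors of gold answer
# indices, and `seq_length` is the static sequence length.
def _example_span_loss(start_logits, end_logits, start_positions,
                       end_positions, seq_length):
    def compute_loss(logits, positions):
        one_hot_positions = tf.one_hot(
            positions, depth=seq_length, dtype=tf.float32)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        return -tf.reduce_mean(
            tf.reduce_sum(one_hot_positions * log_probs, axis=-1))

    start_loss = compute_loss(start_logits, start_positions)
    end_loss = compute_loss(end_logits, end_positions)
    return (start_loss + end_loss) / 2.0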
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model."""
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # In the demo, we are doing a simple classification task on the entire
    # segment.
    #
    # If you want to use the token-level output, use
    # model.get_sequence_output() instead.
    output_layer = model.get_pooled_output()

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)
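# A hedged follow-up sketch (assumption, not in the original source): turning
# the returned `logits` into hard predictions and a simple accuracy tensor;
# `label_ids` is the int tensor of gold classes.
def _example_classifier_metrics(logits, label_ids):
    predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(predictions, label_ids), tf.float32))
    return predictions, accuracy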
def forward(self):
    model = bert_modeling.BertModel(
        config=self.bert_config,
        is_training=self.is_train,
        input_ids=self.input_ids,
        input_mask=self.input_mask,
        token_type_ids=self.segment_ids,
        use_one_hot_embeddings=self.use_one_hot_embeddings)

    self.tvars = tf.trainable_variables()
    tf.logging.info("init_checkpoint: %s", self.init_checkpoint)
    (self.assignment_map,
     _) = bert_modeling.get_assignment_map_from_checkpoint(
         self.tvars, self.init_checkpoint)
    tf.train.init_from_checkpoint(self.init_checkpoint, self.assignment_map)

    # Despite the name, this is the pooled [CLS] output, not the full
    # sequence output.
    self.sequence_output_layer = model.get_pooled_output()

    with tf.variable_scope("output_layer"):
        self.predict_layer_logits = tf.layers.dense(
            self.sequence_output_layer, units=14, name="prediction_layer")
        self.y_pred = tf.nn.softmax(self.predict_layer_logits, name="scores")
        self.predictions = tf.argmax(self.y_pred, axis=1, name="predictions")

    with tf.name_scope("loss"):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.predict_layer_logits, labels=self.y_true)
        self.loss = tf.reduce_mean(cross_entropy, name="loss")

    with tf.name_scope("accuracy"):
        correct_predictions = tf.equal(
            tf.argmax(self.y_pred, 1), tf.argmax(self.y_true, 1))
        self.acc = tf.reduce_mean(
            tf.cast(correct_predictions, "float"), name="acc")

    with tf.name_scope("optimize"):
        self.optim = bert_optimization.create_optimizer(
            self.loss, self.learning_rate, self.num_train_steps,
            self.num_warmup_steps, False)
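# A hedged training-loop sketch (assumption, not part of the original class):
# `create_optimizer` returns a train op, so a plain TF 1.x session loop drives
# it. `batches` and the feed tensors (`input_ids`, `input_mask`,
# `segment_ids`, `y_true`) are assumed to be placeholders defined elsewhere
# on the object.
def _example_train(model_obj, batches):
    with tf.Session() as sess:
        # init_from_checkpoint has already rewired the initializers, so this
        # both restores the BERT weights and initializes the new head.
        sess.run(tf.global_variables_initializer())
        for input_ids, input_mask, segment_ids, y_true in batches:
            _, loss, acc = sess.run(
                [model_obj.optim, model_obj.loss, model_obj.acc],
                feed_dict={
                    model_obj.input_ids: input_ids,
                    model_obj.input_mask: input_mask,
                    model_obj.segment_ids: segment_ids,
                    model_obj.y_true: y_true,
                })
            tf.logging.info("loss = %.4f, acc = %.4f", loss, acc)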
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a token-level (sequence labeling) model."""
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Token-level output: (batch, seq_len, hidden), not the pooled [CLS]
    # vector.
    output_layer = model.get_sequence_output()
    hidden_size = output_layer.shape[-1].value

    output_weight = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weight, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        # `ner_params` is a module-level config; `ner_params.labels_len` is
        # expected to equal `num_labels`.
        logits = tf.reshape(
            logits, [-1, ner_params.max_seq_length, ner_params.labels_len])

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_sum(per_example_loss)
        probabilities = tf.nn.softmax(logits, axis=-1)
        predict = tf.argmax(probabilities, axis=-1)

    return (loss, per_example_loss, logits, predict)
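# A hedged refinement sketch (assumption, not from the original source): the
# loss above also counts [PAD] positions. A common fix masks the per-token
# loss with `input_mask` before reducing:
def _example_masked_ner_loss(per_example_loss, input_mask):
    mask = tf.cast(input_mask, tf.float32)  # (batch, seq_len)
    masked = per_example_loss * mask        # zero out padding positions
    return tf.reduce_sum(masked) / (tf.reduce_sum(mask) + 1e-8)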
def RACL_BERT(self, bert_input_ids, bert_input_mask, bert_segment_ids,
              position_att):
    # Note: the BERT inputs are read from attributes on `self` rather than
    # from the positional arguments.
    bert_model = bert_modeling.BertModel(
        config=self.bert_config,
        is_training=False,
        input_ids=self.bert_input_ids,
        input_mask=self.bert_input_mask,
        token_type_ids=self.bert_segment_ids,
        use_one_hot_embeddings=False)
    bert_out = bert_model.get_sequence_output()

    # Since BERT acts as a shared trainable module across the tasks, the
    # shared fully-connected layer is not needed. Move the [CLS] vector to
    # the end so the remaining positions align with word order.
    inputs = tf.concat(
        [bert_out[:, 1:, :], tf.expand_dims(bert_out[:, 0, :], 1)], 1)

    batch_size = tf.shape(inputs)[0]
    mask256 = tf.tile(tf.expand_dims(self.word_mask, -1),
                      [1, 1, self.opt.filter_num])
    mask70 = tf.tile(tf.expand_dims(self.word_mask, 1),
                     [1, self.opt.max_sentence_len, 1])

    # Private features
    aspect_input, opinion_input, context_input = [], [], []
    aspect_prob_list, opinion_prob_list, senti_prob_list = [], [], []
    aspect_input.append(inputs)
    opinion_input.append(inputs)
    context_input.append(inputs)

    # We found that the SC task is more difficult than the AE and OE tasks.
    # Hence, we augment it with a memory-like mechanism that updates the
    # aspect query with the retrieved contexts. Refer to
    # https://www.aclweb.org/anthology/D16-1021/ for more details about the
    # memory network.
    query = []
    query.append(inputs)

    for hop in range(self.opt.hop_num):
        with tf.variable_scope('layers_{}'.format(hop)):
            # AE & OE convolution
            aspect_conv = tf.layers.conv1d(
                aspect_input[-1], self.opt.filter_num, self.opt.kernel_size,
                padding='SAME', activation=tf.nn.relu, name='aspect_conv')
            opinion_conv = tf.layers.conv1d(
                opinion_input[-1], self.opt.filter_num, self.opt.kernel_size,
                padding='SAME', activation=tf.nn.relu, name='opinion_conv')

            # Relation R1
            aspect_see_opinion = tf.matmul(
                tf.nn.l2_normalize(aspect_conv, -1),
                tf.nn.l2_normalize(opinion_conv, -1),
                adjoint_b=True)
            aspect_att_opinion = softmask_2d(aspect_see_opinion,
                                             self.word_mask)
            aspect_inter = tf.concat(
                [aspect_conv, tf.matmul(aspect_att_opinion, opinion_conv)], -1)

            opinion_see_aspect = tf.matmul(
                tf.nn.l2_normalize(opinion_conv, -1),
                tf.nn.l2_normalize(aspect_conv, -1),
                adjoint_b=True)
            opinion_att_aspect = softmask_2d(opinion_see_aspect,
                                             self.word_mask)
            opinion_inter = tf.concat(
                [opinion_conv, tf.matmul(opinion_att_aspect, aspect_conv)], -1)

            # AE & OE prediction
            aspect_p = layers.fully_connected(
                aspect_inter, self.opt.class_num, activation_fn=None,
                weights_initializer=self.Winit, biases_initializer=self.Winit,
                scope='aspect_p')
            opinion_p = layers.fully_connected(
                opinion_inter, self.opt.class_num, activation_fn=None,
                weights_initializer=self.Winit, biases_initializer=self.Winit,
                scope='opinion_p')

            # OE confidence.
            # A slight difference from the original paper: for propagating R3,
            # we calculate the confidence of each candidate opinion word.
            # Only when a word satisfies Prob[B, I] > Prob[O] in OE can it be
            # propagated to SC.
            confidence = tf.maximum(
                0., 1. - 2. * tf.nn.softmax(opinion_p, -1)[:, :, 0])
            opinion_propagate = tf.tile(
                tf.expand_dims(confidence, 1),
                [1, self.opt.max_sentence_len, 1]) * mask70 * position_att

            # SC convolution
            context_conv = tf.layers.conv1d(
                context_input[-1], self.opt.emb_dim, self.opt.kernel_size,
                padding='SAME', activation=tf.nn.relu, name='context_conv')

            # SC aspect-context attention
            word_see_context = tf.matmul(
                query[-1],
                tf.nn.l2_normalize(context_conv, -1),
                adjoint_b=True) * position_att
            word_att_context = softmask_2d(
                word_see_context, self.word_mask, scale=True)

            # Relations R2 & R3
            word_att_context += aspect_att_opinion + opinion_propagate
            context_inter = (
                query[-1] + tf.matmul(word_att_context, context_conv)
            )  # query + value
            query.append(context_inter)  # update the query

            # SC prediction
            senti_p = layers.fully_connected(
                context_inter, self.opt.class_num, activation_fn=None,
                weights_initializer=self.Winit, biases_initializer=self.Winit,
                scope='senti_p')

            # Stacking
            aspect_prob_list.append(tf.expand_dims(aspect_p, -1))
            opinion_prob_list.append(tf.expand_dims(opinion_p, -1))
            senti_prob_list.append(tf.expand_dims(senti_p, -1))

            # We use DropBlock to enhance the learning of the private features
            # for AE & OE & SC. Refer to
            # http://papers.nips.cc/paper/8271-dropblock-a-regularization-method-for-convolutional-networks
            # for more details.
            aspect_inter = tf.squeeze(
                self.drop_block1(inputs=tf.expand_dims(aspect_inter, -1),
                                 training=self.is_training), -1)
            opinion_inter = tf.squeeze(
                self.drop_block2(inputs=tf.expand_dims(opinion_inter, -1),
                                 training=self.is_training), -1)
            context_conv = tf.squeeze(
                self.drop_block3(inputs=tf.expand_dims(context_conv, -1),
                                 training=self.is_training), -1)

            aspect_input.append(aspect_inter)
            opinion_input.append(opinion_inter)
            context_input.append(context_conv)

    # Multi-layer shortcut: average the predictions over all hops.
    aspect_prob = tf.reduce_mean(tf.concat(aspect_prob_list, -1), -1)
    opinion_prob = tf.reduce_mean(tf.concat(opinion_prob_list, -1), -1)
    sentiment_prob = tf.reduce_mean(tf.concat(senti_prob_list, -1), -1)

    return aspect_prob, opinion_prob, sentiment_prob
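# A hedged joint-loss sketch (an assumption about how these outputs would be
# trained, not code from the RACL source): masked cross-entropy over the three
# averaged per-hop outputs, which are unnormalized scores despite the "prob"
# names. `aspect_y`, `opinion_y`, and `senti_y` are one-hot gold labels.
def _example_racl_loss(aspect_prob, opinion_prob, sentiment_prob,
                       aspect_y, opinion_y, senti_y, word_mask):
    def masked_xent(scores, y):
        log_p = tf.nn.log_softmax(scores, -1)
        per_token = -tf.reduce_sum(y * log_p, -1) * word_mask
        return tf.reduce_sum(per_token) / (tf.reduce_sum(word_mask) + 1e-8)

    return (masked_xent(aspect_prob, aspect_y) +
            masked_xent(opinion_prob, opinion_y) +
            masked_xent(sentiment_prob, senti_y))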