def __init__(self, config: configure_finetuning.FinetuningConfig, tasks,
             is_training, features, num_train_steps):
  # Create a shared transformer encoder
  bert_config = training_utils.get_bert_config(config)
  self.bert_config = bert_config
  if config.debug:
    bert_config.num_hidden_layers = 3
    bert_config.hidden_size = 144
    bert_config.intermediate_size = 144 * 4
    bert_config.num_attention_heads = 4
  assert config.max_seq_length <= bert_config.max_position_embeddings
  bert_model = modeling.BertModel(
      bert_config=bert_config,
      is_training=is_training,
      input_ids=features["input_ids"],
      input_mask=features["input_mask"],
      token_type_ids=features["segment_ids"],
      use_one_hot_embeddings=config.use_tpu,
      embedding_size=config.embedding_size)
  percent_done = (
      tf.cast(tf.train.get_or_create_global_step(), tf.float32) /
      tf.cast(num_train_steps, tf.float32))

  # Add specific tasks
  self.outputs = {"task_id": features["task_id"]}
  losses = []
  for task in tasks:
    with tf.variable_scope("task_specific/" + task.name,
                           reuse=tf.AUTO_REUSE):
      task_losses, task_outputs = task.get_prediction_module(
          bert_model, features, is_training, percent_done)

    # Adversarial training: perturb the token embeddings along the
    # (stop-gradient) loss gradient, re-encode with a second BertModel that
    # consumes the perturbed embeddings, and mix the clean and adversarial
    # losses 0.875 / 0.125.
    grad, = tf.gradients(task_losses, bert_model.token_embeddings)
    grad = tf.stop_gradient(grad)
    perturb = self._scale_l2(grad, 0.125)
    adv_token_embeddings = bert_model.token_embeddings + perturb
    bert_model_adv = modeling.BertModel(
        bert_config=bert_config,
        is_training=is_training,
        input_ids=features["input_ids"],
        input_mask=features["input_mask"],
        token_type_ids=features["segment_ids"],
        use_one_hot_embeddings=config.use_tpu,
        embedding_size=config.embedding_size,
        input_embeddings=adv_token_embeddings)
    with tf.variable_scope("task_specific/" + task.name,
                           reuse=tf.AUTO_REUSE):
      task_adv_losses, task_adv_outputs = task.get_prediction_module(
          bert_model_adv, features, is_training, percent_done)
    total_loss = 0.875 * task_losses + 0.125 * task_adv_losses
    losses.append(total_loss)
    self.outputs[task.name] = task_outputs
  self.loss = tf.reduce_sum(
      tf.stack(losses, -1) *
      tf.one_hot(features["task_id"], len(config.task_names)))
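
# `_scale_l2` is called above but not defined in this snippet. Below is a
# minimal sketch of the usual implementation of this helper (the pattern used
# in TensorFlow's adversarial-text code): rescale the gradient to a fixed L2
# norm so the perturbation magnitude is set by `norm_length` (0.125 above).
# This is an assumption about the helper, not code from this repository, and
# it is written as a free function rather than a method for brevity.
def _scale_l2(x, norm_length):
  # x: [batch, num_timesteps, hidden]. Divide by the per-example max first
  # for numerical stability, then normalize to unit L2 norm per example.
  alpha = tf.reduce_max(tf.abs(x), (1, 2), keepdims=True) + 1e-12
  l2_norm = alpha * tf.sqrt(
      tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2), keepdims=True) + 1e-6)
  x_unit = x / l2_norm
  return norm_length * x_unit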
def __init__(self, config: configure_finetuning.FinetuningConfig, tasks,
             is_training, features, num_train_steps):
  # Create a shared transformer encoder
  bert_config = training_utils.get_bert_config(config)
  self.bert_config = bert_config
  if config.debug:
    bert_config.num_hidden_layers = 3
    bert_config.hidden_size = 144
    bert_config.intermediate_size = 144 * 4
    bert_config.num_attention_heads = 4
  # Multiple-choice MRC: each example packs max_options_num * evidences_top_k
  # sequences, so flatten them into the batch dimension before encoding.
  if any(isinstance(x, qa_tasks.MQATask) for x in tasks):
    seq_len = config.max_seq_length
    assert seq_len <= bert_config.max_position_embeddings
    bs, total_len = modeling.get_shape_list(
        features["input_ids"], expected_rank=2)
    to_shape = [
        bs * config.max_options_num * config.evidences_top_k, seq_len
    ]
    bert_model = modeling.BertModel(
        bert_config=bert_config,
        is_training=is_training,
        input_ids=tf.reshape(features["input_ids"], to_shape),
        input_mask=tf.reshape(features["input_mask"], to_shape),
        token_type_ids=tf.reshape(features["segment_ids"], to_shape),
        use_one_hot_embeddings=config.use_tpu,
        embedding_size=config.embedding_size)
  else:
    assert config.max_seq_length <= bert_config.max_position_embeddings
    bert_model = modeling.BertModel(
        bert_config=bert_config,
        is_training=is_training,
        input_ids=features["input_ids"],
        input_mask=features["input_mask"],
        token_type_ids=features["segment_ids"],
        use_one_hot_embeddings=config.use_tpu,
        embedding_size=config.embedding_size)
  percent_done = (
      tf.cast(tf.train.get_or_create_global_step(), tf.float32) /
      tf.cast(num_train_steps, tf.float32))

  # Add specific tasks
  self.outputs = {"task_id": features["task_id"]}
  losses = []
  for task in tasks:
    with tf.variable_scope("task_specific/" + task.name):
      task_losses, task_outputs = task.get_prediction_module(
          bert_model, features, is_training, percent_done)
      losses.append(task_losses)
      self.outputs[task.name] = task_outputs
  self.loss = tf.reduce_sum(
      tf.stack(losses, -1) *
      tf.one_hot(features["task_id"], len(config.task_names)))
def __init__(self, config: configure_finetuning.FinetuningConfig, tasks,
             is_training, features, num_train_steps):
  # Create a shared transformer encoder
  bert_config = training_utils.get_bert_config(config)
  self.bert_config = bert_config
  assert config.max_seq_length <= bert_config.max_position_embeddings
  bert_model = modeling.BertModel(
      bert_config=bert_config,
      is_training=is_training,
      input_ids=features["input_ids"],
      input_mask=features["input_mask"],
      token_type_ids=features["segment_ids"],
      use_one_hot_embeddings=config.use_tpu,
      embedding_size=config.embedding_size)
  percent_done = (
      tf.cast(tf.train.get_or_create_global_step(), tf.float32) /
      tf.cast(num_train_steps, tf.float32))

  # Add specific tasks
  self.outputs = {"task_id": features["task_id"]}
  losses = []
  for task in tasks:
    with tf.variable_scope("task_specific/" + task.name):
      task_losses, task_outputs = task.get_prediction_module(
          bert_model, features, is_training, percent_done)
      losses.append(task_losses)
      self.outputs[task.name] = task_outputs
  # Every task head computes a loss for every example; stacking gives a
  # [batch, num_tasks] matrix, and the one-hot mask on task_id zeroes out all
  # but each example's own task before the sum. So this both sums and filters
  # by task id in one step.
  self.loss = tf.reduce_sum(
      tf.stack(losses, -1) *
      tf.one_hot(features["task_id"], len(config.task_names)))
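
# Worked example of the loss masking above, with hypothetical numbers: three
# tasks, a batch of two examples. The one-hot on task_id keeps exactly one
# per-task loss per row before the sum.
stacked = tf.constant([[0.5, 1.0, 2.0],   # example 0's loss under each task head
                       [0.3, 0.7, 0.9]])  # example 1's loss under each task head
task_id = tf.constant([2, 0])             # example 0 is task 2, example 1 is task 0
mask = tf.one_hot(task_id, depth=3)       # [[0., 0., 1.], [1., 0., 0.]]
loss = tf.reduce_sum(stacked * mask)      # 2.0 + 0.3 = 2.3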
def create_classification_model(bert_config, is_training, input_ids,
                                input_mask, segment_ids, labels, num_labels,
                                use_one_hot_embeddings, multi_label=False):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # This head classifies from the token-level output: each token is scored
  # with a shared vector W_1, and the per-position scores are then mapped to
  # the label space with W_2.
  output_layer = model.get_sequence_output()

  hidden_size = output_layer.shape[-1].value
  sequence_length = output_layer.shape[-2].value

  W_1 = tf.get_variable(
      'dense_W1', [hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  b_1 = tf.get_variable('dense_b1', [], initializer=tf.zeros_initializer())
  W_2 = tf.get_variable(
      'dense_W2', [sequence_length, num_labels],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  b_2 = tf.get_variable(
      'dense_b2', [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.reduce_sum(tf.multiply(output_layer, W_1), -1)  # [batch, seq_len]
    logits = tf.add(logits, b_1)
    input_mask = tf.cast(input_mask, tf.float32)
    logits = tf.multiply(logits, input_mask)  # zero out padding positions
    logits = tf.nn.relu(logits)
    logits = tf.nn.xw_plus_b(logits, W_2, b_2)  # [batch, num_labels]

    if multi_label:
      probabilities = tf.nn.sigmoid(logits)
      labels = tf.cast(labels, tf.float32)
      per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
          labels=labels, logits=logits)
    else:
      probabilities = tf.nn.softmax(logits, axis=-1)
      log_probs = tf.nn.log_softmax(logits, axis=-1)
      one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
      per_example_loss = -(one_hot_labels * log_probs)
    per_example_loss = tf.reduce_sum(per_example_loss, axis=-1)
    loss = tf.reduce_mean(per_example_loss, name='train_loss')

  return loss, per_example_loss, logits, probabilities
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  unique_ids = features["unique_ids"]
  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  input_type_ids = features["input_type_ids"]

  model = modeling.BertModel(
      config=bert_config,
      is_training=False,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=input_type_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  if mode != tf.estimator.ModeKeys.PREDICT:
    raise ValueError("Only PREDICT modes are supported: %s" % (mode))

  tvars = tf.trainable_variables()
  scaffold_fn = None
  (assignment_map,
   initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
       tvars, init_checkpoint)
  if use_tpu:

    def tpu_scaffold():
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
      return tf.train.Scaffold()

    scaffold_fn = tpu_scaffold
  else:
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  tf.logging.info("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                    init_string)

  all_layers = model.get_all_encoder_layers()

  predictions = {
      "unique_id": unique_ids,
  }
  for (i, layer_index) in enumerate(layer_indexes):
    predictions["layer_output_%d" % i] = all_layers[layer_index]

  output_spec = tf.estimator.tpu.TPUEstimatorSpec(
      mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
  return output_spec
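
# A minimal sketch of driving the feature-extraction model_fn above with a
# TPUEstimator. The shard count, batch size, and input_fn are hypothetical;
# this mirrors the usual extract_features-style setup rather than code shown
# in this snippet.
run_config = tf.estimator.tpu.RunConfig(
    master=None,
    tpu_config=tf.estimator.tpu.TPUConfig(num_shards=8))
estimator = tf.estimator.tpu.TPUEstimator(
    use_tpu=False,
    model_fn=model_fn,
    config=run_config,
    predict_batch_size=32)
for result in estimator.predict(input_fn, yield_single_examples=True):
  unique_id = int(result["unique_id"])
  layer_0 = result["layer_output_0"]  # [seq_length, hidden_size] per example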
def bert_module_fn(is_training):
  """Spec function for a token embedding module."""
  input_ids = tf.placeholder(
      shape=[None, None], dtype=tf.int32, name="input_ids")
  input_mask = tf.placeholder(
      shape=[None, None], dtype=tf.int32, name="input_mask")
  token_type = tf.placeholder(
      shape=[None, None], dtype=tf.int32, name="segment_ids")

  bert_config = training_utils.get_bert_config(config)
  model = modeling.BertModel(
      bert_config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=token_type,
      use_one_hot_embeddings=use_tpu,
      embedding_size=config.embedding_size)

  seq_output = model.sequence_output
  pool_output = model.pooled_output

  vocab_file = tf.constant(
      value=vocab_path, dtype=tf.string, name="vocab_file")
  lower_case = tf.constant(do_lower_case)
  tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)

  input_map = {
      "input_ids": input_ids,
      "input_mask": input_mask,
      "segment_ids": token_type
  }
  output_map = {"pooled_output": pool_output, "sequence_output": seq_output}
  output_info_map = {"vocab_file": vocab_file, "do_lower_case": lower_case}

  hub.add_signature(name="tokens", inputs=input_map, outputs=output_map)
  hub.add_signature(
      name="tokenization_info", inputs={}, outputs=output_info_map)
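
# A minimal sketch of exporting the module_fn above with tensorflow_hub. The
# export directory and checkpoint path are hypothetical; the tags_and_args
# pattern builds both a training and an inference graph variant of the module.
tags_and_args = [({"train"}, {"is_training": True}),
                 (set(), {"is_training": False})]
spec = hub.create_module_spec(bert_module_fn, tags_and_args=tags_and_args)
spec.export("exported_bert_module",        # hypothetical export directory
            checkpoint_path="model.ckpt")  # hypothetical checkpoint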
def create_sequence_binary_tagging_model(bert_config, is_training, input_ids,
                                         input_mask, segment_ids, labels,
                                         num_labels, use_one_hot_embeddings):
  """Sequence tagging model.

  When num_labels == 2, the fine-tuning layers can be simpler than in
  `create_sequence_tagging_model`: a single weight vector scores each token
  and the tags are learned with a per-token sigmoid.
  """
  if num_labels != 2:
    raise ValueError("num_labels must be 2. If not, "
                     "create_sequence_tagging_model should be used.")
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # This is a token-level task, so use the sequence output rather than the
  # pooled output.
  output_layer = model.get_sequence_output()

  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.reduce_sum(tf.multiply(output_layer, output_weights), -1)
    logits = tf.add(logits, output_bias)  # [batch, seq_len]
    probabilities = tf.sigmoid(logits)
    input_mask = tf.cast(input_mask, tf.float32)
    probabilities = tf.multiply(probabilities, input_mask)
    labels = tf.cast(labels, dtype=tf.float32)
    per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=labels, logits=logits)
    per_example_loss = tf.multiply(per_example_loss, input_mask)  # mask padding
    per_example_loss = tf.reduce_sum(per_example_loss, axis=-1)
    loss = tf.reduce_mean(per_example_loss, name='train_loss')

  return loss, per_example_loss, logits, probabilities
def create_sequence_tagging_model(bert_config, is_training, input_ids,
                                  input_mask, segment_ids, labels, num_labels,
                                  use_one_hot_embeddings):
  """Creates a sequence tagging model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # This is a token-level task, so use the sequence output rather than the
  # pooled output.
  output_layer = model.get_sequence_output()

  hidden_size = output_layer.shape[-1].value
  sequence_length = output_layer.shape[-2].value

  output_weights = tf.get_variable(
      "output_weights", [hidden_size, num_labels],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(
        tf.reshape(output_layer, [-1, hidden_size]),
        output_weights)  # [batch_size * sequence_length, num_labels]
    logits = tf.reshape(
        logits, [-1, sequence_length,
                 num_labels])  # [batch_size, sequence_length, num_labels]
    logits = tf.add(logits, output_bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    input_mask = tf.cast(input_mask,
                         tf.float32)  # [batch_size, sequence_length]
    probabilities = tf.multiply(
        probabilities,
        tf.expand_dims(input_mask,
                       axis=-1))  # [batch_size, sequence_length, num_labels]

    labels = tf.one_hot(
        labels, depth=num_labels,
        dtype=tf.float32)  # [batch_size, sequence_length, num_labels]
    per_example_loss = -tf.multiply(
        log_probs, labels)  # [batch_size, sequence_length, num_labels]
    per_example_loss = tf.reduce_sum(
        per_example_loss, axis=-1)  # [batch_size, sequence_length]
    per_example_loss = tf.multiply(per_example_loss, input_mask)
    per_example_loss = tf.reduce_sum(
        per_example_loss, axis=-1)  # [batch_size]
    loss = tf.reduce_mean(per_example_loss, name='train_loss')

  return loss, per_example_loss, logits, probabilities
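
# A minimal sketch of wiring create_sequence_tagging_model into a graph. The
# config path, sequence length, and label count are hypothetical; note the
# sequence length must be static because the head reshapes by it.
bert_config = modeling.BertConfig.from_json_file("bert_config.json")  # assumed path
input_ids = tf.placeholder(tf.int32, [None, 128], name="input_ids")
input_mask = tf.placeholder(tf.int32, [None, 128], name="input_mask")
segment_ids = tf.placeholder(tf.int32, [None, 128], name="segment_ids")
labels = tf.placeholder(tf.int32, [None, 128], name="labels")
loss, per_example_loss, logits, probabilities = create_sequence_tagging_model(
    bert_config, True, input_ids, input_mask, segment_ids, labels,
    num_labels=9, use_one_hot_embeddings=False)  # e.g. 9 BIO tags, assumed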
def _build_transformer(self, inputs: pretrain_data.Inputs, is_training,
                       bert_config=None, name="electra", reuse=False,
                       **kwargs):
  """Build a transformer encoder network."""
  if bert_config is None:
    bert_config = self._bert_config
  with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
    return modeling.BertModel(
        bert_config=bert_config,
        is_training=is_training,
        input_ids=inputs.input_ids,
        input_mask=inputs.input_mask,
        token_type_ids=inputs.segment_ids,
        use_one_hot_embeddings=self._config.use_tpu,
        scope=name,
        **kwargs)
def build_transformer(config: configure_pretraining.PretrainingConfig,
                      inputs: pretrain_data.Inputs, is_training, bert_config,
                      reuse=False, **kwargs):
  """Build a transformer encoder network."""
  with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
    return modeling.BertModel(
        bert_config=bert_config,
        is_training=is_training,
        input_ids=inputs.input_ids,
        input_mask=inputs.input_mask,
        token_type_ids=inputs.segment_ids,
        use_one_hot_embeddings=config.use_tpu,
        **kwargs)
def create_model(self, bert_config, is_training, input_ids, input_mask,
                 segment_ids, labels, num_labels, use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # This classifies the entire segment, so use the pooled [CLS] output; for
  # token-level output, use model.get_sequence_output() instead.
  output_layer = model.get_pooled_output()
  print('output_layer: {}'.format(output_layer.shape))

  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

  return (loss, per_example_loss, logits, probabilities)
def _build_transformer(self, name, inputs: pretrain_data.Inputs, is_training,
                       use_fp16=False, bert_config=None, **kwargs):
  """Build a transformer encoder network."""
  if bert_config is None:
    bert_config = self._bert_config
  return modeling.BertModel(
      bert_config=bert_config,
      is_training=is_training,
      input_ids=inputs.input_ids,
      input_mask=inputs.input_mask,
      token_type_ids=inputs.segment_ids,
      use_one_hot_embeddings=self._config.use_tpu,
      scope=name,
      use_fp16=use_fp16,
      **kwargs)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 use_one_hot_embeddings):
  """Builds the encoder and returns its pooled output for classification."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # This returns the pooled [CLS] representation for whole-segment
  # classification; for token-level output, use model.get_sequence_output()
  # instead. The caller attaches its own head.
  output_layer = model.get_pooled_output()
  hidden_size = output_layer.shape[-1].value
  return output_layer, hidden_size
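
# A minimal sketch (hypothetical placeholders, config path, and label count)
# of attaching a custom head to the pooled output returned by create_model
# above:
bert_config = modeling.BertConfig.from_json_file("bert_config.json")  # assumed path
input_ids = tf.placeholder(tf.int32, [None, 128], name="input_ids")
input_mask = tf.placeholder(tf.int32, [None, 128], name="input_mask")
segment_ids = tf.placeholder(tf.int32, [None, 128], name="segment_ids")
pooled_output, hidden_size = create_model(
    bert_config, True, input_ids, input_mask, segment_ids,
    use_one_hot_embeddings=False)
logits = tf.layers.dense(
    pooled_output, 2,  # assumed binary task
    kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))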
def create_model(self):
  input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length],
                                       self.vocab_size)

  input_mask = None
  if self.use_input_mask:
    input_mask = BertModelTest.ids_tensor(
        [self.batch_size, self.seq_length], vocab_size=2)

  token_type_ids = None
  if self.use_token_type_ids:
    token_type_ids = BertModelTest.ids_tensor(
        [self.batch_size, self.seq_length], self.type_vocab_size)

  config = modeling.BertConfig(
      vocab_size=self.vocab_size,
      hidden_size=self.hidden_size,
      num_hidden_layers=self.num_hidden_layers,
      num_attention_heads=self.num_attention_heads,
      intermediate_size=self.intermediate_size,
      hidden_act=self.hidden_act,
      hidden_dropout_prob=self.hidden_dropout_prob,
      attention_probs_dropout_prob=self.attention_probs_dropout_prob,
      max_position_embeddings=self.max_position_embeddings,
      type_vocab_size=self.type_vocab_size,
      initializer_range=self.initializer_range)

  model = modeling.BertModel(
      config=config,
      is_training=self.is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=token_type_ids,
      scope=self.scope)

  outputs = {
      "embedding_output": model.get_embedding_output(),
      "sequence_output": model.get_sequence_output(),
      "pooled_output": model.get_pooled_output(),
      "all_encoder_layers": model.get_all_encoder_layers(),
  }
  return outputs
def main():
  tf.set_random_seed(1234)
  np.random.seed(0)

  batch_size = 1
  tf_datatype = tf.int32
  np_datatype = np.int32
  iterations = 10

  features_ph = {}
  features_ph["input_ids"] = tf.placeholder(
      dtype=tf_datatype, shape=[batch_size, 128], name="input_ids")
  features_ph["input_mask"] = tf.placeholder(
      dtype=tf_datatype, shape=[batch_size, 128], name="input_mask")
  features_ph["token_type_ids"] = tf.placeholder(
      dtype=tf_datatype, shape=[batch_size, 128], name="token_type_ids")

  # Note: rand() draws floats in [0, 1), so casting to int32 yields all
  # zeros; that is adequate here because the inputs are only used for timing.
  features_data = {}
  features_data["input_ids"] = np.random.rand(batch_size,
                                              128).astype(np_datatype)
  features_data["input_mask"] = np.random.rand(batch_size,
                                               128).astype(np_datatype)
  features_data["token_type_ids"] = np.random.rand(batch_size,
                                                   128).astype(np_datatype)
  features_feed_dict = {
      features_ph[key]: features_data[key] for key in features_ph
  }

  finetuning_config = configure_finetuning.FinetuningConfig("ConvBert", "./")
  bert_config = training_utils.get_bert_config(finetuning_config)
  bert_model = modeling.BertModel(
      bert_config=bert_config,
      is_training=False,
      input_ids=features_ph["input_ids"],
      input_mask=features_ph["input_mask"],
      token_type_ids=features_ph["token_type_ids"])

  graph_outputs = bert_model.get_sequence_output()
  outputs_names = graph_outputs.name
  print("graph output: ", graph_outputs)

  outputs_names_with_port = outputs_names.split(",")
  outputs_names_without_port = [
      name.split(":")[0] for name in outputs_names_with_port
  ]
  run_op_list = list(outputs_names_without_port)
  inputs_names_with_port = [features_ph[key].name for key in features_ph]

  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True
  with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())

    # Warm up, then time the TF graph.
    for _ in range(iterations):
      sess.run(run_op_list, feed_dict=features_feed_dict)
    a = datetime.now()
    for _ in range(iterations):
      tf_result = sess.run(run_op_list, feed_dict=features_feed_dict)
    b = datetime.now()
    tf_time_sum = (b - a).total_seconds()
    tf_time = "[INFO] TF execution time: " + str(
        tf_time_sum * 1000 / iterations) + " ms"
    print(tf_time)

    # Freeze the graph and save it for conversion.
    frozen_graph = tf.graph_util.convert_variables_to_constants(
        sess, sess.graph_def, outputs_names_without_port)
    with open("ConvBert.pb", "wb") as ofile:
      ofile.write(frozen_graph.SerializeToString())

  # Convert the frozen graph to ONNX with tf2onnx.
  onnx_model_file = "ConvBert.onnx"
  command = ("python3 -m tf2onnx.convert --input ConvBert.pb --output %s "
             "--fold_const --opset 12 --verbose" % onnx_model_file)
  command += " --inputs " + ",".join(inputs_names_with_port)
  command += " --outputs " + ",".join(outputs_names_with_port)
  print(command)
  os.system(command)

  # Benchmark the ONNX model with TensorRT.
  command = "trtexec --onnx=ConvBert.onnx --verbose"
  print(command)
  os.system(command)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  tf.logging.info("*** Features ***")
  for name in sorted(features.keys()):
    tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  segment_ids = features["segment_ids"]
  masked_lm_positions = features["masked_lm_positions"]
  masked_lm_ids = features["masked_lm_ids"]
  masked_lm_weights = features["masked_lm_weights"]
  next_sentence_labels = features["next_sentence_labels"]

  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  (masked_lm_loss, masked_lm_example_loss,
   masked_lm_log_probs) = get_masked_lm_output(
       bert_config, model.get_sequence_output(), model.get_embedding_table(),
       masked_lm_positions, masked_lm_ids, masked_lm_weights)

  (next_sentence_loss, next_sentence_example_loss,
   next_sentence_log_probs) = get_next_sentence_output(
       bert_config, model.get_pooled_output(), next_sentence_labels)

  total_loss = masked_lm_loss + next_sentence_loss

  tvars = tf.trainable_variables()

  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    (assignment_map, initialized_variable_names
    ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    if use_tpu:

      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  tf.logging.info("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                    init_string)

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op = optimization.create_optimizer(total_loss, learning_rate,
                                             num_train_steps,
                                             num_warmup_steps, use_tpu)
    output_spec = tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn)
  elif mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                  masked_lm_weights, next_sentence_example_loss,
                  next_sentence_log_probs, next_sentence_labels):
      """Computes the loss and accuracy of the model."""
      masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                       [-1, masked_lm_log_probs.shape[-1]])
      masked_lm_predictions = tf.argmax(
          masked_lm_log_probs, axis=-1, output_type=tf.int32)
      masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
      masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
      masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
      masked_lm_accuracy = tf.metrics.accuracy(
          labels=masked_lm_ids,
          predictions=masked_lm_predictions,
          weights=masked_lm_weights)
      masked_lm_mean_loss = tf.metrics.mean(
          values=masked_lm_example_loss, weights=masked_lm_weights)

      next_sentence_log_probs = tf.reshape(
          next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
      next_sentence_predictions = tf.argmax(
          next_sentence_log_probs, axis=-1, output_type=tf.int32)
      next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
      next_sentence_accuracy = tf.metrics.accuracy(
          labels=next_sentence_labels,
          predictions=next_sentence_predictions)
      next_sentence_mean_loss = tf.metrics.mean(
          values=next_sentence_example_loss)

      return {
          "masked_lm_accuracy": masked_lm_accuracy,
          "masked_lm_loss": masked_lm_mean_loss,
          "next_sentence_accuracy": next_sentence_accuracy,
          "next_sentence_loss": next_sentence_mean_loss,
      }

    eval_metrics = (metric_fn, [
        masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
        masked_lm_weights, next_sentence_example_loss,
        next_sentence_log_probs, next_sentence_labels
    ])
    output_spec = tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn)
  else:
    raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

  return output_spec
def create_model(self):
  self.input_y = tf.placeholder(
      dtype=tf.float32, shape=[None, 10, 4], name='input_y')
  self.input_y2 = tf.placeholder(
      dtype=tf.float32, shape=[None, n_sub, 4], name='input_y2')
  self.dropout_keep_prob = tf.placeholder(
      dtype=tf.float32, name='dropout_keep_prob')
  self.output_keep_prob = tf.placeholder(
      dtype=tf.float32, name='output_keep_prob')
  self.input_ids = tf.placeholder(
      dtype=tf.int32, shape=[None, 190], name='input_ids')
  self.mask_ids = tf.placeholder(
      dtype=tf.int32, shape=[None, 190], name='mask_ids')
  self.type_ids = tf.placeholder(
      dtype=tf.int32, shape=[None, 190], name='type_ids')
  self.is_training = tf.placeholder(dtype=tf.bool, name='is_training')

  if self.main_feature.lower() in ['word', 'char']:
    self.input_x = tf.placeholder(
        dtype=tf.int32, shape=[None, self.max_len], name='input_x')
    self.word_embedding = tf.get_variable(
        initializer=self.embedding, name='word_embedding')
    self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x)
    self.word_encoding = tf.nn.dropout(self.word_encoding,
                                       self.dropout_keep_prob)
  elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']:
    self.input_x = tf.placeholder(
        dtype=tf.int32, shape=[None, self.max_len + 2], name='input_x')
    if self.main_feature == 'elmo_word':
      options_file = self.config.elmo_word_options_file
      weight_file = self.config.elmo_word_weight_file
      embed_file = self.config.elmo_word_embed_file
    elif self.main_feature == 'elmo_char':
      options_file = self.config.elmo_char_options_file
      weight_file = self.config.elmo_char_weight_file
      embed_file = self.config.elmo_char_embed_file
    elif self.main_feature == 'elmo_qiuqiu':
      options_file = self.config.elmo_qiuqiu_options_file
      weight_file = self.config.elmo_qiuqiu_weight_file
      embed_file = self.config.elmo_qiuqiu_embed_file

    self.bilm = BidirectionalLanguageModel(
        options_file,
        weight_file,
        use_character_inputs=False,
        embedding_weight_file=embed_file,
        max_batch_size=self.batch_size)
    bilm_embedding_op = self.bilm(self.input_x)
    bilm_embedding = weight_layers('output', bilm_embedding_op, l2_coef=0.0)
    self.word_encoding = bilm_embedding['weighted_op']
    self.word_encoding = tf.nn.dropout(self.word_encoding,
                                       self.dropout_keep_prob)
  else:
    exit('wrong feature')

  self.layer_embedding = tf.get_variable(
      shape=[10, self.hidden_dim], name='layer_embedding')

  self.forward = self.LSTM()
  self.backward = self.LSTM()
  self.forward2 = self.GRU()
  self.backward2 = self.GRU()

  # BERT encoder: average the first four encoder layers as the word encoding.
  bert_config = modeling.BertConfig.from_json_file(
      self.config.BERT_CONFIG_FILES)
  bert_model = modeling.BertModel(
      config=bert_config,
      is_training=self.is_training,
      input_ids=self.input_ids,
      input_mask=self.mask_ids,
      token_type_ids=self.type_ids)
  print('bert config hidden dropout ---', bert_config.hidden_dropout_prob)
  print('bert config attention dropout ---',
        bert_config.attention_probs_dropout_prob)
  all_layer_output = bert_model.get_all_encoder_layers()
  self.word_encoding = (all_layer_output[0] + all_layer_output[1] +
                        all_layer_output[2] + all_layer_output[3]) / 4

  with tf.variable_scope('sentence_encode'):
    all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(
        self.forward, self.backward, self.word_encoding, dtype=tf.float32)
    output_sentence = tf.concat(axis=2, values=all_output_words)

  with tf.variable_scope('sentence_encode2'):
    all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(
        self.forward2, self.backward2, output_sentence, dtype=tf.float32)
    output_sentence = tf.concat(axis=2, values=all_output_words)

  output_sentence = tf.layers.dense(
      output_sentence, self.hidden_dim, activation=tf.nn.tanh)

  # Additive attention between each of the 10 aspect (layer) embeddings and
  # every token of the sentence encoding.
  sentence_reshape = tf.reshape(output_sentence,
                                [-1, 1, self.max_len, self.hidden_dim])
  sentence_reshape_tile = tf.tile(
      sentence_reshape, [1, 10, 1, 1])  # replicate the sentence 10 times
  layer_reshape = tf.reshape(self.layer_embedding,
                             [1, 10, 1, self.hidden_dim])
  layer_reshape_tile = tf.tile(layer_reshape,
                               [self.batch_size, 1, self.max_len, 1])
  embed_concat = tf.reshape(
      tf.concat(axis=3, values=[sentence_reshape_tile, layer_reshape_tile]),
      [-1, 2 * self.hidden_dim])

  self.att_w = tf.get_variable(
      shape=[2 * self.hidden_dim, self.hidden_dim], name='att_w')
  self.att_b = tf.get_variable(shape=[self.hidden_dim], name='att_b')
  self.att_v = tf.get_variable(shape=[self.hidden_dim, 1], name='att_v')

  score = tf.reshape(
      tf.matmul(
          tf.nn.tanh(tf.matmul(embed_concat, self.att_w) + self.att_b),
          self.att_v), [-1, 10, self.max_len])
  alpha = tf.nn.softmax(score, axis=2)
  layer_sentence = tf.matmul(alpha, output_sentence)

  layer_reshape2 = tf.reshape(self.layer_embedding, [1, 10, self.hidden_dim])
  layer_reshape2_tile = tf.tile(layer_reshape2, [self.batch_size, 1, 1])
  layer_sentence = tf.concat(
      axis=2, values=[layer_sentence, layer_reshape2_tile])
  layer_sentence = tf.reshape(layer_sentence, [-1, 2 * self.hidden_dim])

  layer_sentence = tf.layers.dense(
      layer_sentence, self.hidden_dim, activation=tf.nn.relu)
  layer_sentence = tf.nn.dropout(layer_sentence, self.dropout_keep_prob)

  self.logits = tf.layers.dense(layer_sentence, 4, activation=None)
  y_ = tf.nn.softmax(self.logits, axis=1)
  self.prob = tf.reshape(y_, [-1, 10, 4])
  self.prediction = tf.argmax(self.prob, 2, name="prediction")

  if not self.config.balance:
    self.loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits, labels=tf.reshape(self.input_y, [-1, 4])))
    self.loss += tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits, labels=tf.reshape(self.input_y2, [-1, 4])))
  else:
    # Manually weighted cross entropy to counter the class imbalance
    # (class 0 dominates, so the minority classes are up-weighted).
    class0_weight = 1  # weight for class 0
    class1_weight = 3  # weight for class 1
    class2_weight = 3  # weight for class 2
    class3_weight = 3  # weight for class 3
    y = tf.reshape(self.input_y, [-1, 4])
    self.loss = tf.reduce_mean(
        -class0_weight * (y[:, 0] * tf.log(y_[:, 0])) -
        class1_weight * (y[:, 1] * tf.log(y_[:, 1])) -
        class2_weight * (y[:, 2] * tf.log(y_[:, 2])) -
        class3_weight * (y[:, 3] * tf.log(y_[:, 3])))

  return self