def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  unique_ids = features["unique_ids"]
  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  input_type_ids = features["input_type_ids"]
  extract_indices = features["extract_indices"]

  model = modeling.AlbertModel(
      config=bert_config,
      is_training=False,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=input_type_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  if mode != tf.estimator.ModeKeys.PREDICT:
    raise ValueError("Only PREDICT mode is supported: %s" % (mode))

  tvars = tf.trainable_variables()
  scaffold_fn = None
  (assignment_map,
   initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
       tvars, init_checkpoint)
  if use_tpu:

    def tpu_scaffold():
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
      return tf.train.Scaffold()

    scaffold_fn = tpu_scaffold
  else:
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  tf.logging.info("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                    init_string)

  all_layers = model.get_all_encoder_layers()

  predictions = {
      "unique_ids": unique_ids,
      "extract_indices": extract_indices,
  }
  for (i, layer_index) in enumerate(layer_indexes):
    predictions["layer_output_%d" % i] = all_layers[layer_index]

  output_spec = tf.contrib.tpu.TPUEstimatorSpec(
      mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
  return output_spec
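# A minimal sketch of consuming the feature-extraction `model_fn` above with
# TPUEstimator.predict. The names `run_config` and `input_fn` and the batch
# size are assumptions for illustration; use_tpu=False falls back to CPU/GPU.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=False,
    model_fn=model_fn,
    config=run_config,
    predict_batch_size=8)
for result in estimator.predict(input_fn, yield_single_examples=True):
  unique_id = int(result["unique_ids"])
  # One [seq_length, hidden_size] array per entry in `layer_indexes`.
  layers = [result["layer_output_%d" % i] for i in range(len(layer_indexes))]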
def create_model(albert_config, is_training, input_ids, input_mask,
                 segment_ids, labels, num_labels, use_one_hot_embeddings,
                 task_name):
  """Creates a classification model."""
  model = modeling.AlbertModel(
      config=albert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # In the demo, we are doing a simple classification task on the entire
  # segment.
  #
  # If you want to use the token-level output, use model.get_sequence_output()
  # instead.
  output_layer = model.get_pooled_output()

  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, rate=0.1)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    if task_name != "sts-b":
      probabilities = tf.nn.softmax(logits, axis=-1)
      predictions = tf.argmax(probabilities, axis=-1, output_type=tf.int32)
      log_probs = tf.nn.log_softmax(logits, axis=-1)
      one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
      per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    else:
      probabilities = logits
      logits = tf.squeeze(logits, [-1])
      predictions = logits
      per_example_loss = tf.square(logits - labels)
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, probabilities, logits, predictions)
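# A hypothetical wiring sketch for `create_model` above: placeholder inputs
# for a 3-way classification task. The config path, the sequence length of
# 128, and the task name are illustrative assumptions.
albert_config = modeling.AlbertConfig.from_json_file("albert_config.json")
input_ids = tf.placeholder(tf.int32, [None, 128], name="input_ids")
input_mask = tf.placeholder(tf.int32, [None, 128], name="input_mask")
segment_ids = tf.placeholder(tf.int32, [None, 128], name="segment_ids")
labels = tf.placeholder(tf.int32, [None], name="labels")
(loss, per_example_loss, probabilities, logits, predictions) = create_model(
    albert_config, is_training=True, input_ids=input_ids,
    input_mask=input_mask, segment_ids=segment_ids, labels=labels,
    num_labels=3, use_one_hot_embeddings=False, task_name="mnli")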
def _create_model_from_scratch(albert_config, is_training, input_ids,
                               input_mask, segment_ids,
                               use_one_hot_embeddings):
  """Creates an ALBERT model from scratch (as opposed to hub)."""
  model = modeling.AlbertModel(
      config=albert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)
  output_layer = model.get_pooled_output()
  return output_layer
def _create_model_from_scratch(albert_config, is_training, input_ids,
                               input_mask, segment_ids,
                               use_one_hot_embeddings, use_einsum):
  """Creates an ALBERT model from scratch/config."""
  model = modeling.AlbertModel(
      config=albert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings,
      use_einsum=use_einsum)
  return (model.get_pooled_output(), model.get_sequence_output())
def bert_embedding(self):
  # Load the ALBERT embedding.
  albert_config = modeling.AlbertConfig.from_json_file(
      self.config.bert_config_path)  # Path to the config file.
  model = modeling.AlbertModel(
      config=albert_config,
      is_training=True,
      input_ids=self.input_ids,
      input_mask=self.input_mask,
      token_type_ids=self.segment_ids,
      use_one_hot_embeddings=False)
  embedding = model.get_sequence_output()
  return embedding
def create_model(albert_config, is_training, input_ids, input_mask,
                 segment_ids, labels, num_labels, num_choices,
                 use_one_hot_embeddings):
  """Creates a classification model."""
  output_layer = modeling.AlbertModel(
      config=albert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings).get_pooled_output()

  hidden_size = output_layer.shape[-1].value
  print('HIDDEN: ', hidden_size)

  softmax_weights = tf.get_variable(
      "softmax_weights", [hidden_size, 1],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  print('WEIGHT: ', softmax_weights.shape)

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, softmax_weights)
    print('LOGITS: ', logits.shape)
    logits = tf.reshape(logits, (-1, num_choices))
    print(logits.shape)

    probabilities = tf.nn.softmax(logits, axis=-1)
    predictions = tf.argmax(probabilities, axis=-1, output_type=tf.int32)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    print('PROB: ', log_probs.shape)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    print('ONE: ', one_hot_labels.shape)

    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

  return (loss, per_example_loss, probabilities, logits, predictions)
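# A tiny NumPy sanity check of the reshape used in the multiple-choice head
# above (illustrative numbers, not from the original file): per-sequence
# scores of shape [batch * num_choices, 1] become [batch, num_choices] rows of
# choice scores, which is why the one-hot labels must use a depth equal to
# num_choices.
import numpy as np

batch, num_choices = 2, 4
scores = np.arange(batch * num_choices, dtype=np.float32).reshape(-1, 1)
per_example = scores.reshape(-1, num_choices)  # mirrors tf.reshape above
assert per_example.shape == (batch, num_choices)
# Row i now holds the scores of the num_choices candidates for example i.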
def module_fn(is_training):
  """Module function."""
  input_ids = tf.placeholder(tf.int32, [None, None], "input_ids")
  input_mask = tf.placeholder(tf.int32, [None, None], "input_mask")
  segment_ids = tf.placeholder(tf.int32, [None, None], "segment_ids")
  mlm_positions = tf.placeholder(tf.int32, [None, None], "mlm_positions")

  albert_config = modeling.AlbertConfig.from_json_file(
      os.path.join(FLAGS.albert_directory, "albert_config.json"))
  model = modeling.AlbertModel(
      config=albert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=False)

  mlm_logits = get_mlm_logits(model, albert_config, mlm_positions)

  assert tf.gfile.Exists(FLAGS.vocab_path)
  vocab_file = tf.constant(
      value=FLAGS.vocab_path, dtype=tf.string, name="vocab_file")

  # By adding `vocab_file` to the ASSET_FILEPATHS collection, TF-Hub will
  # rewrite this tensor so that this asset is portable.
  tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)

  hub.add_signature(
      name="tokens",
      inputs=dict(
          input_ids=input_ids,
          input_mask=input_mask,
          segment_ids=segment_ids),
      outputs=dict(
          sequence_output=model.get_sequence_output(),
          pooled_output=model.get_pooled_output()))

  hub.add_signature(
      name="mlm",
      inputs=dict(
          input_ids=input_ids,
          input_mask=input_mask,
          segment_ids=segment_ids,
          mlm_positions=mlm_positions),
      outputs=dict(
          sequence_output=model.get_sequence_output(),
          pooled_output=model.get_pooled_output(),
          mlm_logits=mlm_logits))

  hub.add_signature(
      name="tokenization_info",
      inputs={},
      outputs=dict(
          vocab_file=vocab_file,
          do_lower_case=tf.constant(FLAGS.do_lower_case)))
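# A minimal usage sketch, assuming the module built by `module_fn` above has
# been exported to `export_path` (an illustrative name): load it with TF-Hub
# and run the "tokens" signature.
albert_module = hub.Module(export_path, trainable=False)
tokens_input_ids = tf.placeholder(tf.int32, [None, None])
tokens_input_mask = tf.placeholder(tf.int32, [None, None])
tokens_segment_ids = tf.placeholder(tf.int32, [None, None])
albert_outputs = albert_module(
    inputs=dict(
        input_ids=tokens_input_ids,
        input_mask=tokens_input_mask,
        segment_ids=tokens_segment_ids),
    signature="tokens",
    as_dict=True)
pooled_output = albert_outputs["pooled_output"]      # [batch, hidden_size]
sequence_output = albert_outputs["sequence_output"]  # [batch, seq_len, hidden]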
def build_model(sess):
  """Builds the ALBERT graph and initializes it from the checkpoint."""
  input_ids = tf.placeholder(tf.int32, [None, None], "input_ids")
  input_mask = tf.placeholder(tf.int32, [None, None], "input_mask")
  segment_ids = tf.placeholder(tf.int32, [None, None], "segment_ids")
  mlm_positions = tf.placeholder(tf.int32, [None, None], "mlm_positions")

  albert_config_path = os.path.join(FLAGS.albert_directory,
                                    "albert_config.json")
  albert_config = modeling.AlbertConfig.from_json_file(albert_config_path)
  model = modeling.AlbertModel(
      config=albert_config,
      is_training=False,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=False)

  get_mlm_logits(model.get_sequence_output(), albert_config, mlm_positions,
                 model.get_embedding_table())
  get_sentence_order_logits(model.get_pooled_output(), albert_config)

  checkpoint_path = os.path.join(FLAGS.albert_directory, FLAGS.checkpoint_name)
  tvars = tf.trainable_variables()
  (assignment_map,
   initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
       tvars, checkpoint_path)

  tf.logging.info("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                    init_string)
  tf.train.init_from_checkpoint(checkpoint_path, assignment_map)
  init = tf.global_variables_initializer()
  sess.run(init)
  return sess
def create_model(self):
  input_ids = AlbertModelTest.ids_tensor([self.batch_size, self.seq_length],
                                         self.vocab_size)

  input_mask = None
  if self.use_input_mask:
    input_mask = AlbertModelTest.ids_tensor(
        [self.batch_size, self.seq_length], vocab_size=2)

  token_type_ids = None
  if self.use_token_type_ids:
    token_type_ids = AlbertModelTest.ids_tensor(
        [self.batch_size, self.seq_length], self.type_vocab_size)

  config = modeling.AlbertConfig(
      vocab_size=self.vocab_size,
      embedding_size=self.embedding_size,
      hidden_size=self.hidden_size,
      num_hidden_layers=self.num_hidden_layers,
      num_attention_heads=self.num_attention_heads,
      intermediate_size=self.intermediate_size,
      hidden_act=self.hidden_act,
      hidden_dropout_prob=self.hidden_dropout_prob,
      attention_probs_dropout_prob=self.attention_probs_dropout_prob,
      max_position_embeddings=self.max_position_embeddings,
      type_vocab_size=self.type_vocab_size,
      initializer_range=self.initializer_range)

  model = modeling.AlbertModel(
      config=config,
      is_training=self.is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=token_type_ids,
      scope=self.scope)

  outputs = {
      "embedding_output": model.get_embedding_output(),
      "sequence_output": model.get_sequence_output(),
      "pooled_output": model.get_pooled_output(),
      "all_encoder_layers": model.get_all_encoder_layers(),
  }
  return outputs
def __init__(self, albert_config, num_labels, seq_length, init_checkpoint):
  self.albert_config = albert_config
  self.num_labels = num_labels
  self.seq_length = seq_length
  self.input_ids = tf.placeholder(
      tf.int32, [None, self.seq_length], name='input_ids')
  self.input_mask = tf.placeholder(
      tf.int32, [None, self.seq_length], name='input_mask')
  self.segment_ids = tf.placeholder(
      tf.int32, [None, self.seq_length], name='segment_ids')
  self.labels = tf.placeholder(tf.int32, [None], name='labels')
  self.is_training = tf.placeholder(tf.bool, shape=[], name='is_training')
  # self.learning_rate = tf.placeholder(tf.float32, shape=[], name='learn_rate')

  self.model = modeling.AlbertModel(
      config=self.albert_config,
      is_training=self.is_training,
      input_ids=self.input_ids,
      input_mask=self.input_mask,
      token_type_ids=self.segment_ids)

  tvars = tf.trainable_variables()
  initialized_variable_names = {}
  if init_checkpoint:
    (assignment_map, initialized_variable_names
    ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  logging.info("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    logging.info("  name = %s, shape = %s%s", var.name, var.shape, init_string)

  self.inference()
def create_model(albert_config, is_training, input_ids, input_mask,
                 segment_ids, input_cdc_ids, age, sex_ids, labels, num_labels,
                 use_one_hot_embeddings):
  """Creates a classification model."""
  if not FLAGS.cdc_only:
    model = modeling.AlbertModel(
        config=albert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # In the demo, we are doing a simple classification task on the entire
    # segment.
    #
    # If you want to use the token-level output, use
    # model.get_sequence_output() instead.
    if FLAGS.use_pooled_output:
      tf.logging.info("using pooled output")
      output_albert_layer = model.get_pooled_output()
    else:
      tf.logging.info("using meaned output")
      output_albert_layer = tf.reduce_mean(model.get_sequence_output(), axis=1)

  with tf.variable_scope('cdc'):
    with tf.variable_scope("embedding"):
      embedding_table = tf.get_variable(
          name="embedding_table",
          shape=[FLAGS.cdc_vocab_size, FLAGS.cdc_embedding_size],
          initializer=modeling.create_initializer())
      embedded = tf.nn.embedding_lookup(embedding_table, input_cdc_ids)
      mask = tf.not_equal(input_cdc_ids, 0)
      embed_average = tf.keras.layers.GlobalAveragePooling1D()(embedded, mask)
      embed_max = tf.keras.layers.GlobalMaxPooling1D()(embedded)
      concat_max_average = tf.concat([embed_average, embed_max], axis=-1)

    # concat_sex_age = tf.concat([average, age, sex_ids], axis=-1)
    #
    # with tf.variable_scope("dense_1"):
    #   input_size = concat_sex_age.shape[-1].value
    #   output_size = 2 * FLAGS.cdc_embedding_size
    #
    #   W = tf.get_variable(name="kernel",
    #                       shape=[input_size, output_size],
    #                       initializer=modeling.create_initializer())
    #   b = tf.get_variable(name="bias",
    #                       shape=[output_size],
    #                       initializer=tf.zeros_initializer)
    #   dense_1 = tf.matmul(concat_sex_age, W)
    #   dense_1 = tf.nn.bias_add(dense_1, b)
    #   dense_1 = tf.nn.relu(dense_1)
    #
    # with tf.variable_scope("dense_2"):
    #   input_size = dense_1.shape[-1].value
    #   output_size = FLAGS.cdc_embedding_size
    #   W = tf.get_variable(name="kernel",
    #                       shape=[input_size, output_size],
    #                       initializer=modeling.create_initializer())
    #   b = tf.get_variable(name="bias",
    #                       shape=[output_size],
    #                       initializer=tf.zeros_initializer)
    #   dense_2 = tf.matmul(dense_1, W)
    #   dense_2 = tf.nn.bias_add(dense_2, b)
    #   dense_2 = tf.nn.relu(dense_2)

    output_cdc_layer = tf.concat([age, sex_ids, concat_max_average], axis=-1)

  # Concatenate the output_layer with other features.
  if FLAGS.cdc_only:
    output_layer = output_cdc_layer
  else:
    output_layer = tf.concat([output_albert_layer, output_cdc_layer], axis=-1)

  hidden_size = output_layer.shape[-1].value

  with tf.variable_scope("output"):
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

  return (loss, per_example_loss, probabilities, predictions)
def __init__(self, bert_config, tokenizer):
  _graph = tf.Graph()
  with _graph.as_default():
    self.X = tf.placeholder(tf.int32, [None, None])
    self.top_p = tf.placeholder(tf.float32, None)
    self.top_k = tf.placeholder(tf.int32, None)
    self.k = tf.placeholder(tf.int32, None)
    self.temperature = tf.placeholder(tf.float32, None)
    self.indices = tf.placeholder(tf.int32, [None, None])
    self.MASK = tf.placeholder(tf.int32, [None, None])
    self._tokenizer = tokenizer

    self.model = modeling.AlbertModel(
        config=bert_config,
        is_training=False,
        input_ids=self.X,
        input_mask=self.MASK,
        use_one_hot_embeddings=False)
    self.logits = self.model.get_pooled_output()
    input_tensor = self.model.get_sequence_output()
    output_weights = self.model.get_embedding_table()

    with tf.variable_scope('cls/predictions'):
      with tf.variable_scope('transform'):
        input_tensor = tf.layers.dense(
            input_tensor,
            units=bert_config.embedding_size,
            activation=modeling.get_activation(bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))
        input_tensor = modeling.layer_norm(input_tensor)

      output_bias = tf.get_variable(
          'output_bias',
          shape=[bert_config.vocab_size],
          initializer=tf.zeros_initializer())
      logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
      self._logits = tf.nn.bias_add(logits, output_bias)
      self._log_softmax = tf.nn.log_softmax(self._logits, axis=-1)

    logits = tf.gather_nd(self._logits, self.indices)
    logits = logits / self.temperature

    def nucleus():
      return top_p_logits(logits, self.top_p)

    def select_k():
      return top_k_logits(logits, self.top_k)

    logits = tf.cond(self.top_p > 0, nucleus, select_k)
    self.samples = tf.multinomial(
        logits, num_samples=self.k, output_dtype=tf.int32)

    self._sess = tf.InteractiveSession()
    self._sess.run(tf.global_variables_initializer())
    var_lists = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='bert')
    cls = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='cls')
    self._saver = tf.train.Saver(var_list=var_lists + cls)
    attns = _extract_attention_weights(bert_config.num_hidden_layers,
                                       tf.get_default_graph())
    self.attns = attns
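# `top_p_logits` and `top_k_logits` are helpers defined elsewhere in this
# repo. For reference, a common TF1 formulation of nucleus (top-p) filtering,
# essentially the one from OpenAI's GPT-2 sample.py, looks like the sketch
# below; the repo's own helper may differ in details.
def top_p_logits_sketch(logits, p):
  """Masks logits outside the smallest set whose probability mass >= p."""
  logits_sort = tf.sort(logits, direction='DESCENDING')
  probs_sort = tf.nn.softmax(logits_sort)
  # Probability mass strictly before each position in the sorted row.
  probs_sums = tf.cumsum(probs_sort, axis=1, exclusive=True)
  # Keep sorted logits while the preceding mass is still below p.
  logits_masked = tf.where(probs_sums < p, logits_sort,
                           tf.ones_like(logits_sort) * 1000)
  # Smallest logit still inside the nucleus, per row.
  min_logits = tf.reduce_min(logits_masked, axis=1, keepdims=True)
  return tf.where(logits < min_logits,
                  tf.ones_like(logits, dtype=logits.dtype) * -1e10,
                  logits)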
def __init__(self):
  self.config = Config()  # Configuration parameters.
  # Placeholders.
  self.input_ids = tf.placeholder(
      tf.int32, shape=[None, self.config.seq_length], name='input_ids')
  self.input_masks = tf.placeholder(
      tf.int32, shape=[None, self.config.seq_length], name='input_masks')
  self.segment_ids = tf.placeholder(
      tf.int32, shape=[None, self.config.seq_length], name='segment_ids')
  self.label_ids = tf.placeholder(
      tf.int32, shape=[None, self.config.seq_length], name='label_ids')
  self.input_length = tf.placeholder(
      shape=[None], dtype=tf.int32, name='input-length')  # Length of each input text.
  self.input_keep_prob = tf.placeholder(
      dtype=tf.float32, name='input-keep-prob')  # Keep probability for dropout.

  # Load the ALBERT configuration.
  bert_config = modeling.AlbertConfig.from_json_file(
      self.config.bert_config_file)
  # Build the ALBERT network.
  self.model = modeling.AlbertModel(
      config=bert_config,
      is_training=self.config.is_training,
      input_ids=self.input_ids,
      input_mask=self.input_masks,
      token_type_ids=self.segment_ids,
      use_one_hot_embeddings=False)
  # Initialize the network above from the pretrained checkpoint.
  tvars = tf.trainable_variables()
  (assignment_map, initialized_variable_names
  ) = modeling.get_assignment_map_from_checkpoint(
      tvars, self.config.initial_checkpoint)
  tf.train.init_from_checkpoint(
      self.config.initial_checkpoint, assignment_map=assignment_map)

  # Sequence output (token embeddings), dim: (batch_size, seq_length, 384).
  self.sequence_output = self.model.get_sequence_output()

  if self.config.is_bilstm:  # Whether to add a Bi-LSTM layer.
    # Bi-LSTM / Bi-GRU.
    cell_fw = self.get_rnn(self.config.rnn_type)  # Forward cell.
    cell_bw = self.get_rnn(self.config.rnn_type)  # Backward cell.
    outputs, states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell_fw,
        cell_bw=cell_bw,
        inputs=self.sequence_output,
        dtype=tf.float32)
    # Concatenate the forward and backward outputs,
    # dim: (batch_size, max_length, 2 * hidden_dim).
    outputs = tf.concat(values=outputs, axis=2)
    # Note: tf.layers.dropout takes a drop *rate*, not a keep probability.
    outputs = tf.layers.dropout(
        inputs=outputs, rate=1.0 - self.input_keep_prob)
  else:
    outputs = self.sequence_output

  # Output layer, dim: (batch_size, max_length, num_classes).
  self.logits = tf.layers.dense(
      inputs=outputs, units=self.config.num_classes, name='logits')

  # Whether to use a CRF layer.
  if self.config.crf:
    log_likelihood, self.transition_params = crf.crf_log_likelihood(
        inputs=self.logits,
        tag_indices=self.label_ids,
        sequence_lengths=self.input_length)
    self.loss = -tf.reduce_mean(log_likelihood)
    # Decode the predictions.
    self.predict, self.viterbi_score = crf.crf_decode(
        potentials=self.logits,
        transition_params=self.transition_params,
        sequence_length=self.input_length)
  else:
    # Cross-entropy loss; the labels are integer ids, so use the sparse form.
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.label_ids, logits=self.logits)
    mask = tf.sequence_mask(lengths=self.input_length)
    losses = tf.boolean_mask(cross_entropy, mask=mask)
    self.loss = tf.reduce_mean(losses)
    # Predicted tag per token (argmax over the class dimension).
    self.predict = tf.argmax(
        tf.nn.softmax(self.logits), axis=-1, name='predict')

  # Optimizer.
  self.optimizer = tf.train.AdamOptimizer(
      learning_rate=self.config.learning_rate).minimize(loss=self.loss)
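# A hypothetical inference sketch for the tagger above, assuming `tagger` is
# an instance of this class and `sess` is a tf.Session whose variables have
# been initialized; `ids`, `masks`, `segs`, and `lengths` are illustrative
# NumPy arrays. With the CRF branch enabled, `tagger.predict` already holds
# the Viterbi decode, so prediction is a single session run.
tags = sess.run(
    tagger.predict,
    feed_dict={
        tagger.input_ids: ids,          # [batch, seq_length] token ids
        tagger.input_masks: masks,      # [batch, seq_length]
        tagger.segment_ids: segs,       # [batch, seq_length]
        tagger.input_length: lengths,   # true (unpadded) sequence lengths
        tagger.input_keep_prob: 1.0,    # disable dropout at inference time
    })  # -> [batch, seq_length] predicted label ids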
def __init__(self, albert_config, num_labels, seq_length, init_checkpoint):
  self.albert_config = albert_config
  self.num_labels = num_labels
  self.seq_length = seq_length
  self.tower_grads = []
  self.losses = []
  self.input_ids = tf.placeholder(
      tf.int32, [None, self.seq_length], name='input_ids')
  self.input_mask = tf.placeholder(
      tf.int32, [None, self.seq_length], name='input_mask')
  self.segment_ids = tf.placeholder(
      tf.int32, [None, self.seq_length], name='segment_ids')
  self.labels = tf.placeholder(tf.int32, [None], name='labels')
  self.batch_size = tf.placeholder(tf.int32, shape=[], name='batch_size')
  self.is_training = tf.placeholder(tf.bool, shape=[], name='is_training')
  print(self.batch_size)
  self.gpu_step = self.batch_size // gpu_nums

  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
  # Implements linear decay of the learning rate.
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = ((1.0 - is_warmup) * learning_rate +
                     is_warmup * warmup_learning_rate)

  optimizer = optimization.AdamWeightDecayOptimizer(
      learning_rate=learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  with tf.variable_scope(tf.get_variable_scope()) as outer_scope:
    pred = []
    label = []
    for d in range(gpu_nums):
      with tf.device("/gpu:%s" % d), tf.name_scope("%s_%s" % ("tower", d)):
        self.model = modeling.AlbertModel(
            config=self.albert_config,
            is_training=self.is_training,
            input_ids=self.input_ids[d * self.gpu_step:(d + 1) * self.gpu_step],
            input_mask=self.input_mask[d * self.gpu_step:(d + 1) * self.gpu_step],
            token_type_ids=self.segment_ids[d * self.gpu_step:(d + 1) * self.gpu_step])
        print("GPU:", d)

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        if init_checkpoint:
          (assignment_map, initialized_variable_names
          ) = modeling.get_assignment_map_from_checkpoint(
              tvars, init_checkpoint)
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        logging.info("**** Trainable Variables ****")
        for var in tvars:
          init_string = ""
          if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
          logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                       init_string)

        output_layer = self.model.get_pooled_output()
        logging.info(output_layer)
        # `self.is_training` is a placeholder, so a Python `==` comparison
        # would never trigger the branch; gate the dropout with tf.cond.
        output_layer = tf.cond(
            self.is_training,
            lambda: tf.nn.dropout(output_layer, keep_prob=0.9),
            lambda: output_layer)

        match_1 = tf.strided_slice(output_layer, [0], [self.gpu_step], [2])
        match_2 = tf.strided_slice(output_layer, [1], [self.gpu_step], [2])
        match = tf.concat([match_1, match_2], 1)

        self.logits = tf.layers.dense(
            match, self.num_labels, name='fc', reuse=tf.AUTO_REUSE)
        logging.info(self.logits)
        self.r_labels = tf.strided_slice(
            self.labels[d * self.gpu_step:(d + 1) * self.gpu_step], [0],
            [self.gpu_step], [2])
        logging.info(self.r_labels)
        self.r_labels = tf.expand_dims(self.r_labels, -1)
        logging.info(self.r_labels)

        # Cast the int32 labels to float to match the regression logits.
        self.loss = tf.losses.mean_squared_error(
            tf.cast(self.r_labels, tf.float32), self.logits)

        tvars = tf.trainable_variables()
        grads = tf.gradients(self.loss, tvars)
        (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
        self.tower_grads.append(list(zip(grads, tvars)))
        self.losses.append(self.loss)
        label.append(self.r_labels)
        pred.append(self.logits)
      outer_scope.reuse_variables()

  with tf.name_scope("apply_gradients"), tf.device("/cpu:0"):
    gradients = self.average_gradients(self.tower_grads)
    train_op = optimizer.apply_gradients(gradients, global_step=global_step)
    new_global_step = global_step + 1
    self.train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    self.losses = tf.reduce_mean(self.losses)
    self.pred = tf.concat(pred, 0)
    self.label = tf.concat(label, 0)
    logging.info(self.pred)
    logging.info(self.label)
def module_fn(is_training):
  """Module function."""
  input_ids = tf.placeholder(tf.int32, [None, None], "input_ids")
  input_mask = tf.placeholder(tf.int32, [None, None], "input_mask")
  segment_ids = tf.placeholder(tf.int32, [None, None], "segment_ids")
  mlm_positions = tf.placeholder(tf.int32, [None, None], "mlm_positions")

  albert_config_path = os.path.join(FLAGS.albert_directory,
                                    "albert_config.json")
  albert_config = modeling.AlbertConfig.from_json_file(albert_config_path)
  model = modeling.AlbertModel(
      config=albert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=False,
      use_einsum=FLAGS.use_einsum)

  mlm_logits = get_mlm_logits(model, albert_config, mlm_positions)
  # sop_log_probs = get_sop_log_probs(model, albert_config)

  vocab_model_path = os.path.join(FLAGS.albert_directory, "30k-clean.model")
  vocab_file_path = os.path.join(FLAGS.albert_directory, "30k-clean.vocab")

  config_file = tf.constant(
      value=albert_config_path, dtype=tf.string, name="config_file")
  vocab_model = tf.constant(
      value=vocab_model_path, dtype=tf.string, name="vocab_model")
  # This is only for visualization purposes.
  vocab_file = tf.constant(
      value=vocab_file_path, dtype=tf.string, name="vocab_file")

  # By adding `config_file`, `vocab_model`, and `vocab_file`
  # to the ASSET_FILEPATHS collection, TF-Hub will
  # rewrite this tensor so that this asset is portable.
  tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file)
  tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_model)
  tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)

  hub.add_signature(
      name="tokens",
      inputs=dict(
          input_ids=input_ids,
          input_mask=input_mask,
          segment_ids=segment_ids),
      outputs=dict(
          sequence_output=model.get_sequence_output(),
          pooled_output=model.get_pooled_output()))

  # change
  # hub.add_signature(
  #     name="sop",
  #     inputs=dict(
  #         input_ids=input_ids, input_mask=input_mask,
  #         segment_ids=segment_ids),
  #     outputs=dict(
  #         sequence_output=model.get_sequence_output(),
  #         pooled_output=model.get_pooled_output(),
  #         sop_log_probs=sop_log_probs))

  hub.add_signature(
      name="mlm",
      inputs=dict(
          input_ids=input_ids,
          input_mask=input_mask,
          segment_ids=segment_ids,
          mlm_positions=mlm_positions),
      outputs=dict(
          sequence_output=model.get_sequence_output(),
          pooled_output=model.get_pooled_output(),
          mlm_logits=mlm_logits))

  hub.add_signature(
      name="tokenization_info",
      inputs={},
      outputs=dict(
          vocab_file=vocab_model,
          do_lower_case=tf.constant(FLAGS.do_lower_case)))
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  tf.logging.info("*** Features ***")
  for name in sorted(features.keys()):
    tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  segment_ids = features["segment_ids"]
  masked_lm_positions = features["masked_lm_positions"]
  masked_lm_ids = features["masked_lm_ids"]
  masked_lm_weights = features["masked_lm_weights"]
  # Note: We keep this feature name `next_sentence_labels` to be compatible
  # with the original data created by lanzhzh@. However, in the ALBERT case
  # it does represent sentence_order_labels.
  sentence_order_labels = features["next_sentence_labels"]

  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  model = modeling.AlbertModel(
      config=albert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  (masked_lm_loss, masked_lm_example_loss,
   masked_lm_log_probs) = get_masked_lm_output(
       albert_config, model.get_sequence_output(), model.get_embedding_table(),
       masked_lm_positions, masked_lm_ids, masked_lm_weights)

  (sentence_order_loss, sentence_order_example_loss,
   sentence_order_log_probs) = get_sentence_order_output(
       albert_config, model.get_pooled_output(), sentence_order_labels)

  total_loss = masked_lm_loss + sentence_order_loss

  tvars = tf.trainable_variables()

  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    tf.logging.info("number of hidden group %d to initialize",
                    albert_config.num_hidden_groups)
    num_of_initialize_group = 1
    if FLAGS.init_from_group0:
      num_of_initialize_group = albert_config.num_hidden_groups
      if albert_config.net_structure_type > 0:
        num_of_initialize_group = albert_config.num_hidden_layers
    (assignment_map, initialized_variable_names
    ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint,
                                                    num_of_initialize_group)
    if use_tpu:

      def tpu_scaffold():
        for gid in range(num_of_initialize_group):
          tf.logging.info("initialize the %dth layer", gid)
          tf.logging.info(assignment_map[gid])
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map[gid])
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      for gid in range(num_of_initialize_group):
        tf.logging.info("initialize the %dth layer", gid)
        tf.logging.info(assignment_map[gid])
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map[gid])

  tf.logging.info("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                    init_string)

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op = optimization.create_optimizer(total_loss, learning_rate,
                                             num_train_steps, num_warmup_steps,
                                             use_tpu, optimizer, poly_power,
                                             start_warmup_step)

    output_spec = contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn)
  elif mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(*args):
      """Computes the loss and accuracy of the model."""
      (masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
       masked_lm_weights, sentence_order_example_loss,
       sentence_order_log_probs, sentence_order_labels) = args[:7]

      masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                       [-1, masked_lm_log_probs.shape[-1]])
      masked_lm_predictions = tf.argmax(
          masked_lm_log_probs, axis=-1, output_type=tf.int32)
      masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
      masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
      masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
      masked_lm_accuracy = tf.metrics.accuracy(
          labels=masked_lm_ids,
          predictions=masked_lm_predictions,
          weights=masked_lm_weights)
      masked_lm_mean_loss = tf.metrics.mean(
          values=masked_lm_example_loss, weights=masked_lm_weights)
      metrics = {
          "masked_lm_accuracy": masked_lm_accuracy,
          "masked_lm_loss": masked_lm_mean_loss,
      }

      sentence_order_log_probs = tf.reshape(
          sentence_order_log_probs, [-1, sentence_order_log_probs.shape[-1]])
      sentence_order_predictions = tf.argmax(
          sentence_order_log_probs, axis=-1, output_type=tf.int32)
      sentence_order_labels = tf.reshape(sentence_order_labels, [-1])
      sentence_order_accuracy = tf.metrics.accuracy(
          labels=sentence_order_labels,
          predictions=sentence_order_predictions)
      sentence_order_mean_loss = tf.metrics.mean(
          values=sentence_order_example_loss)
      metrics.update({
          "sentence_order_accuracy": sentence_order_accuracy,
          "sentence_order_loss": sentence_order_mean_loss
      })
      return metrics

    metric_values = [
        masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
        masked_lm_weights, sentence_order_example_loss,
        sentence_order_log_probs, sentence_order_labels
    ]

    eval_metrics = (metric_fn, metric_values)

    output_spec = contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn)
  else:
    raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

  return output_spec
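# A minimal sketch of driving the pretraining `model_fn` above. The names
# `run_config`, `train_input_fn`, and the batch sizes are assumptions for
# illustration; TPUEstimator also runs on CPU/GPU when use_tpu=False.
estimator = contrib_tpu.TPUEstimator(
    use_tpu=use_tpu,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=32,
    eval_batch_size=8)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)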
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 token_label_ids, predicate_matrix_ids, num_token_labels,
                 num_predicate_labels, use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.AlbertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # We "pool" the model by simply taking the hidden state corresponding
  # to the first token: float Tensor of shape [batch_size, hidden_size].
  # model_pooled_output = model.get_pooled_output()

  # """Gets final hidden layer of encoder.
  #
  # Returns:
  #   float Tensor of shape [batch_size, seq_length, hidden_size]
  #   corresponding to the final hidden of the transformer encoder.
  # """
  sequence_bert_encode_output = model.get_sequence_output()
  if is_training:
    sequence_bert_encode_output = tf.nn.dropout(
        sequence_bert_encode_output, keep_prob=0.9)

  with tf.variable_scope("predicate_head_select_loss"):
    bert_sequence_length = sequence_bert_encode_output.shape[-2].value
    # Shape [batch_size, sequence_length, sequence_length,
    # num_predicate_labels].
    predicate_score_matrix = getHeadSelectionScores(
        encode_input=sequence_bert_encode_output,
        hidden_size_n1=100,
        label_number=num_predicate_labels)
    predicate_head_probabilities = tf.nn.sigmoid(predicate_score_matrix)
    # predicate_head_prediction = tf.argmax(predicate_head_probabilities, axis=3)
    predicate_head_predictions_round = tf.round(predicate_head_probabilities)
    predicate_head_predictions = tf.cast(predicate_head_predictions_round,
                                         tf.int32)
    # Shape [batch_size, sequence_length, sequence_length].
    predicate_matrix = tf.reshape(
        predicate_matrix_ids, [-1, bert_sequence_length, bert_sequence_length])
    # Shape [batch_size, sequence_length, sequence_length,
    # num_predicate_labels].
    gold_predicate_matrix_one_hot = tf.one_hot(
        predicate_matrix, depth=num_predicate_labels, dtype=tf.float32)
    # Shape [batch_size, sequence_length, sequence_length,
    # num_predicate_labels].
    predicate_sigmoid_cross_entropy_with_logits = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=predicate_score_matrix, labels=gold_predicate_matrix_one_hot)
    # Scalar.
    predicate_head_select_loss = tf.reduce_sum(
        predicate_sigmoid_cross_entropy_with_logits)
    # return predicate_head_probabilities, predicate_head_predictions, predicate_head_select_loss

  with tf.variable_scope("token_label_loss"):
    bert_encode_hidden_size = sequence_bert_encode_output.shape[-1].value
    token_label_output_weight = tf.get_variable(
        "token_label_output_weights",
        [num_token_labels, bert_encode_hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    token_label_output_bias = tf.get_variable(
        "token_label_output_bias", [num_token_labels],
        initializer=tf.zeros_initializer())
    sequence_bert_encode_output = tf.reshape(sequence_bert_encode_output,
                                             [-1, bert_encode_hidden_size])
    token_label_logits = tf.matmul(
        sequence_bert_encode_output, token_label_output_weight,
        transpose_b=True)
    token_label_logits = tf.nn.bias_add(token_label_logits,
                                        token_label_output_bias)
    token_label_logits = tf.reshape(
        token_label_logits, [-1, FLAGS.max_seq_length, num_token_labels])
    token_label_log_probs = tf.nn.log_softmax(token_label_logits, axis=-1)
    token_label_one_hot_labels = tf.one_hot(
        token_label_ids, depth=num_token_labels, dtype=tf.float32)
    token_label_per_example_loss = -tf.reduce_sum(
        token_label_one_hot_labels * token_label_log_probs, axis=-1)
    token_label_loss = tf.reduce_sum(token_label_per_example_loss)
    token_label_probabilities = tf.nn.softmax(token_label_logits, axis=-1)
    token_label_predictions = tf.argmax(token_label_probabilities, axis=-1)
    # return (token_label_loss, token_label_per_example_loss,
    #         token_label_logits, token_label_predict)

  loss = predicate_head_select_loss + token_label_loss
  return (loss, predicate_head_select_loss, predicate_head_probabilities,
          predicate_head_predictions, token_label_loss,
          token_label_per_example_loss, token_label_logits,
          token_label_predictions)
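# An illustrative post-processing step (not part of the original file): turn
# the rounded sigmoid output `predicate_head_predictions`, of shape
# [batch, seq_len, seq_len, num_predicate_labels], into candidate
# (batch, token_i, token_j, predicate) tuples with NumPy, assuming label id 0
# is the null predicate. Whether i or j is the head is a modeling convention
# of the head-selection scorer, so treat the ordering here as illustrative.
import numpy as np

def decode_head_selection(pred_matrix, null_label_id=0):
  """Returns (batch, token_i, token_j, predicate) tuples for nonzero cells."""
  triples = []
  for b, i, j, p in zip(*np.nonzero(pred_matrix)):
    if p != null_label_id:
      triples.append((int(b), int(i), int(j), int(p)))
  return triples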