def gather_indexes(sequence_tensor, positions):
    """Selects the vectors at the given positions from each sequence of a batch.

    The rank-3 input is flattened to a 2-D matrix of rows, the per-example
    positions are shifted by the row offset at which each example starts, and
    the selected rows are fetched with a single `tf.gather`.

    Args:
      sequence_tensor: Rank-3 tensor (rank is asserted by `get_shape_list`),
        treated as [batch_size, seq_length, width].
      positions: Integer tensor of positions inside each sequence; presumably
        [batch_size, num_positions] so that it broadcasts against the
        [batch_size, 1] offsets -- TODO confirm against callers.

    Returns:
      A 2-D tensor with one row of size `width` per requested position.
    """
    batch_size, seq_length, width = modeling.get_shape_list(
        sequence_tensor, expected_rank=3)

    # Row index at which every example begins once the batch is flattened.
    example_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
    example_starts = tf.reshape(example_starts, [-1, 1])

    row_ids = tf.reshape(positions + example_starts, [-1])
    rows = tf.reshape(sequence_tensor, [batch_size * seq_length, width])
    return tf.gather(rows, row_ids)
def gather_indexes(self):
    """Selects the vectors of `self.input_tensor` at `self.masked_lm_positions`.

    Flattens the rank-3 `self.input_tensor` into rows, offsets the per-example
    positions by the row at which each example starts, and gathers those rows.

    Returns:
      A 2-D tensor with one row (of the input's last dimension) per position.
    """
    batch_size, seq_length, width = modeling.get_shape_list(
        self.input_tensor, expected_rank=3)

    # Row index at which every example begins once the batch is flattened.
    example_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
    example_starts = tf.reshape(example_starts, [-1, 1])

    row_ids = tf.reshape(self.masked_lm_positions + example_starts, [-1])
    rows = tf.reshape(self.input_tensor, [batch_size * seq_length, width])
    return tf.gather(rows, row_ids)
def compute_joint_mlp_logits(sequence, max_span_length):
    """Scores every (start, start + offset) span with a small joint MLP.

    Args:
      sequence: Rank-3 tensor (asserted), treated as
        [batch_size, seq_length, hidden_size].
      max_span_length: Number of end offsets scored for each start position.

    Returns:
      A tensor reshaped to [batch_size, seq_length, max_span_length] holding
      one logit per (start position, end offset) pair.
    """
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        sequence, expected_rank=3)
    projection_size = hidden_size  # This seems to be a reasonable setting.
    num_spans = seq_length * max_span_length

    # NOTE(review): SOURCE indentation was lost; all layers are assumed to live
    # inside the "joint_span" variable scope -- confirm against checkpoints.
    with tf.variable_scope("joint_span"):
        projected = tf.layers.dense(
            sequence,
            projection_size * 2,
            activation=None,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            name="projection")
        start_half, end_half = tf.split(projected, 2, axis=-1)

        # 1. Repeat every start representation once per candidate offset.
        # TODO(danielandor): Use the mask to compute an optimal span list.
        start_rows = tf.reshape(
            start_half, [batch_size * seq_length, 1, projection_size])
        start_tiled = tf.tile(start_rows, [1, max_span_length, 1])
        starts = tf.reshape(
            start_tiled, [batch_size, num_spans, projection_size])

        # 2. Build band-diagonal end indices (start + offset) and fetch the
        # matching end representations with a batched gather.
        start_ids = tf.expand_dims(tf.range(seq_length), 1)
        offset_ids = tf.expand_dims(tf.range(max_span_length), 0)
        band = start_ids + offset_ids  # broadcasts to a 2-D grid
        band.shape.assert_is_compatible_with((seq_length, max_span_length))
        end_ids = tf.tile(tf.reshape(band, [1, num_spans]), [batch_size, 1])
        end_ids = tf.minimum(end_ids, seq_length - 1)  # clip at sequence end
        ends = tf.batch_gather(end_half, end_ids)

        # 3. Combine starts and ends; the bias is already in the projection.
        ends.shape.assert_is_compatible_with(starts.shape)
        joint = modeling.gelu(starts + ends)
        joint = contrib_layers.layer_norm(joint)
        span_logits = tf.layers.dense(
            joint,
            1,
            activation=None,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            name="logits")

    return tf.reshape(span_logits, [batch_size, seq_length, max_span_length])
def mask_joint_logits(input_mask, start_end_logits):
    """Pushes logits of spans that touch masked-out positions to a large negative.

    For span offset `i` the mask is the input mask multiplied by itself shifted
    left 1..i times, so a span is kept only if all positions it covers are
    unmasked.

    Args:
      input_mask: 2-D mask tensor (it is sliced as `[:, 1:]`); presumably
        [batch_size, seq_length] with 1 for real tokens -- TODO confirm.
      start_end_logits: Rank-3 float logits; the last dimension must be
        statically known because it drives a Python loop.

    Returns:
      `start_end_logits` with 1e6 subtracted wherever the span mask is 0.
    """
    num_offsets = modeling.get_shape_list(start_end_logits, expected_rank=3)[2]

    per_offset = tf.TensorArray(
        input_mask.dtype, size=num_offsets, dynamic_size=False)
    running_mask = input_mask
    for offset in range(num_offsets):
        per_offset = per_offset.write(offset, running_mask)
        # The permitted span length is determined by the existing mask
        # combined with its being shifted up by one.
        shifted = tf.pad(running_mask[:, 1:], [[0, 0], [0, 1]])
        running_mask = running_mask * shifted

    # Stacked as [offset, batch, seq]; reorder to [batch, seq, offset].
    span_mask = tf.transpose(per_offset.stack(), [1, 2, 0])
    span_mask.shape.assert_is_compatible_with(start_end_logits.shape)

    penalty = 1e6 * tf.cast(1 - span_mask, tf.float32)
    return start_end_logits - penalty
def _decode_record(record, name_to_features):
    """Parses one serialized record into a dict of model-ready tensors.

    Decodes and pads the JPEG to IMG_HEIGHT x IMG_WIDTH, optionally extracts
    the bounding-box crops and rescales the box coordinates, and casts every
    int64 feature to int32.

    Args:
      record: A serialized `tf.Example`.
      name_to_features: Feature spec passed to `tf.parse_single_example`.

    Returns:
      The parsed example dict with "image" replaced by a float tensor of shape
      [IMG_HEIGHT, IMG_WIDTH, 3] and, depending on flags, "bboxes" added and
      "bbox_pos" rescaled.
    """
    example = tf.parse_single_example(record, name_to_features)

    # Decode the JPEG bytes and convert to a float image.
    pixels = tf.image.decode_jpeg(example["image"], channels=3)
    pixels.set_shape([None, None, 3])
    image_float = tf.image.convert_image_dtype(pixels, tf.float32)
    padded = tf.image.resize_image_with_pad(image_float, IMG_HEIGHT, IMG_WIDTH)
    example["image"] = tf.reshape(padded, [IMG_HEIGHT, IMG_WIDTH, 3])

    if FLAGS.use_bboxes:
        example["bbox_pos"] = tf.to_int32(example["bbox_pos"])
        # One crop per bounding-box slot, taken from the un-resized image.
        example["bboxes"] = tf.stack([
            parse_bounding_box(IMG_HEIGHT, IMG_WIDTH, image_float,
                               example["bbox_pos"][idx, :])
            for idx in range(FLAGS.max_num_bboxes)
        ])

        # NOTE(review): SOURCE indentation was lost; this branch is assumed to
        # be nested under `use_bboxes` because it relies on the int32 cast
        # above -- confirm.
        if FLAGS.use_bbox_position:
            # Rescale box coordinates from the original image size to the
            # resized image size.
            y, x, bbox_height, bbox_width = tf.unstack(
                example["bbox_pos"], axis=1)
            orig_height, orig_width = modeling.get_shape_list(image_float)[:2]
            rescaled = tf.stack([
                IMG_HEIGHT * y / orig_height,
                IMG_WIDTH * x / orig_width,
                IMG_HEIGHT * bbox_height / orig_height,
                IMG_WIDTH * bbox_width / orig_width
            ], 1)
            example["bbox_pos"] = tf.cast(rescaled, dtype=tf.int32)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for key in list(example.keys()):
        if example[key].dtype == tf.int64:
            example[key] = tf.to_int32(example[key])

    return example
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 label_ids, num_labels):
    """Builds a per-token classifier on top of the BERT sequence output.

    Args:
      bert_config: Configuration passed to `modeling.BertModel`.
      is_training: When true, dropout with keep_prob=0.9 is applied to the
        sequence output.
      input_ids: Token ids fed to BERT.
      input_mask: Mask fed to BERT; also multiplied into the per-token loss.
      segment_ids: Token type ids fed to BERT.
      label_ids: Integer labels, one-hot encoded to depth `num_labels`;
        presumably one label per token -- TODO confirm.
      num_labels: Number of output classes.

    Returns:
      Tuple `(loss, per_example_loss, logits, probabilities, predictions)`.
      `loss` is the mean of the masked per-token loss over all positions.
    """
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids)

    token_states = bert.get_sequence_output()
    hidden_size = modeling.get_shape_list(token_states, expected_rank=3)[2]

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            token_states = tf.nn.dropout(token_states, keep_prob=0.9)

        projected = tf.matmul(token_states, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(projected, output_bias)

        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)

        one_hot_labels = tf.one_hot(
            label_ids, depth=num_labels, dtype=tf.float32)
        token_nll = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        # Zero the loss at positions where the input mask is 0.
        per_example_loss = token_nll * tf.cast(input_mask, tf.float32)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities, predictions)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 use_one_hot_embeddings):
    """Builds a span-extraction model with an extra has-answer classifier.

    The span head is the same as the SQuAD head (start and end logits per
    token); a second linear layer on the pooled output classifies whether the
    paragraph contains an answer to the question.

    Args:
      bert_config: Configuration passed to `modeling.BertModel`.
      is_training: Forwarded to `modeling.BertModel`.
      input_ids: Token ids fed to BERT.
      input_mask: Mask fed to BERT.
      segment_ids: Token type ids fed to BERT.
      use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

    Returns:
      Tuple `(start_logits, end_logits, has_answer_logits)`; the first two are
      [batch_size, seq_length], the last has two columns (YES, NO).
    """
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Span head: identical to the SQuAD model.
    final_hidden = bert.get_sequence_output()
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        final_hidden, expected_rank=3)

    # To initialize from a SQuAD-trained BERT QA checkpoint, replace 'emr' by
    # 'squad' in the two variable names below.
    output_weights = tf.get_variable(
        "cls/emr/output_weights", [2, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "cls/emr/output_bias", [2], initializer=tf.zeros_initializer())

    token_rows = tf.reshape(
        final_hidden, [batch_size * seq_length, hidden_size])
    span_logits = tf.matmul(token_rows, output_weights, transpose_b=True)
    span_logits = tf.nn.bias_add(span_logits, output_bias)
    span_logits = tf.reshape(span_logits, [batch_size, seq_length, 2])
    # Move the start/end axis first so it can be unstacked into two tensors.
    span_logits = tf.transpose(span_logits, [2, 0, 1])
    start_logits, end_logits = tf.unstack(span_logits, axis=0)[:2]

    # Has-answer head on the pooled output.
    pooled = bert.get_pooled_output()
    pooled_size = pooled.shape[-1].value
    has_answer_types = 2  # YES, NO
    has_answer_output_weights = tf.get_variable(
        "has_answer_output_weights", [has_answer_types, pooled_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    has_answer_output_bias = tf.get_variable(
        "has_answer_output_bias", [has_answer_types],
        initializer=tf.zeros_initializer())

    has_answer_logits = tf.nn.bias_add(
        tf.matmul(pooled, has_answer_output_weights, transpose_b=True),
        has_answer_output_bias)

    return (start_logits, end_logits, has_answer_logits)
def create_cnn_model(is_training, token_embeddings, config, batch_size,
                     segment_ids=None, name="CNN"):
    """Builds a stack of conv layers followed by a start/end logit projection.

    `is_training`, `segment_ids` and `name` are accepted but not used by this
    function.

    Args:
      is_training: Unused.
      token_embeddings: Rank-3 tensor (asserted), treated as
        [batch_size, seq_length, hidden_size].
      config: Object providing parallel lists `filter_shapes`, `pool_shapes`
        and `channels_out`, one entry per conv layer.
      batch_size: Batch size used for all reshapes (the static value is taken
        from this argument rather than from the input shape).
      segment_ids: Unused.
      name: Unused.

    Returns:
      Tuple `(start_logits, end_logits)`, each [batch_size, seq_length].
    """
    shape = get_shape_list(token_embeddings, expected_rank=3)
    seq_length, hidden_size = shape[1], shape[2]
    channels_in = 1

    # Add a trailing channel axis so the embeddings look like a 1-channel image.
    conv = tf.reshape(
        token_embeddings, [batch_size, seq_length, hidden_size, channels_in])

    # NOTE(review): `channels_in` stays 1 for every layer; whether that is
    # right for layers after the first depends on `_conv_layer` -- confirm.
    for layer_idx, filter_shape in enumerate(config.filter_shapes):
        conv = _conv_layer(conv,
                           filter_shape,
                           config.pool_shapes[layer_idx],
                           channels_in,
                           config.channels_out[layer_idx],
                           'convfilter{}'.format(layer_idx))

    # Flattened feature width per token: product of the last two static dims.
    w_out = (conv.shape[-1] * conv.shape[-2]).value
    n_positions = 2  # start and end logits

    wd1 = tf.Variable(
        tf.truncated_normal([w_out, n_positions], stddev=0.03), name='wd1')
    bd1 = tf.Variable(
        tf.truncated_normal([n_positions], stddev=0.01), name='bd1')

    token_rows = tf.reshape(conv, [batch_size * seq_length, w_out])
    logits = tf.nn.bias_add(tf.matmul(token_rows, wd1), bd1)
    logits = tf.reshape(logits, [batch_size, seq_length, n_positions])
    # Move the start/end axis first so it can be unstacked into two tensors.
    logits = tf.transpose(logits, [2, 0, 1])
    start_logits, end_logits = tf.unstack(logits, axis=0)[:2]

    return (start_logits, end_logits)
def build_block_attn_single_arg(self, scope=None):
    """Creates the input placeholders and the BERT encoding for the single-arg pipeline.

    Defines `self.arg`, `self.label`, `self.conn` and `self.arg_attn_mask`
    placeholders, runs BERT over `self.arg`, then folds every two consecutive
    batch rows into one row of twice the sequence length, storing the results
    in `self.bert_arg` and `self.bert_mask_concat`.

    The batch size fed at run time must therefore be even.

    Args:
      scope: Unused; kept for interface compatibility.
    """
    tf.logging.debug("Building single arg pipeline in BERT Embedding")

    self.arg = tf.placeholder(
        tf.int32, [None, self.max_arg_length], name='arg')
    self.label = tf.placeholder(tf.int32, [None], name="label")
    # TODO: max_len for conn???
    self.conn = tf.placeholder(
        tf.int32, [None, self.max_arg_length], name="conn")
    self.arg_attn_mask = tf.placeholder(
        tf.int32, [None, self.max_arg_length], name="arg_attn_mask")

    # Every token belongs to segment 0.
    segment_ids = tf.zeros_like(self.arg, dtype=tf.int32)

    bert_model = modeling.BertModel(
        config=self.bert_config,
        is_training=self.is_training,
        input_ids=self.arg,
        input_mask=self.arg_attn_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=False,
        scope='bert')
    bert_arg = bert_model.get_sequence_output()

    input_shape = modeling.get_shape_list(bert_arg, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    # Use floor division: `/` is true division under Python 3 and would put a
    # float (or a float64 tensor) into the reshape target, which tf.reshape
    # rejects because shapes must be integers.
    half_batch = batch_size // 2

    self.bert_arg = tf.reshape(
        bert_arg, shape=[half_batch, seq_length * 2, width])
    self.bert_mask_concat = tf.reshape(
        self.arg_attn_mask, shape=[half_batch, seq_length * 2])
def run_one_hot_embeddings(one_hot_input_ids, config):
    """Embeds one-hot token inputs with BERT's word embedding table only.

    The table is fetched (or created) as `bert/embeddings/word_embeddings`
    with AUTO_REUSE, and the lookup is a matmul against the one-hot (or soft)
    input instead of a gather.

    Args:
      one_hot_input_ids: Tensor whose last dimension is `config.vocab_size`.
      config: Object providing `vocab_size`, `hidden_size` and
        `initializer_range`.

    Returns:
      Tuple `(output, embedding_table)` where `output` has the input's leading
      dimensions followed by `config.hidden_size`.
    """
    bert_scope = tf.variable_scope("bert", reuse=tf.compat.v1.AUTO_REUSE)
    with bert_scope, tf.variable_scope("embeddings"):
        # branched from modeling.embedding_lookup
        embedding_table = tf.get_variable(
            name="word_embeddings",
            shape=[config.vocab_size, config.hidden_size],
            initializer=modeling.create_initializer(config.initializer_range))

        one_hot_rows = tf.reshape(one_hot_input_ids, [-1, config.vocab_size])
        embedded_rows = tf.matmul(one_hot_rows, embedding_table)

        # Restore the leading dims, swapping vocab_size for hidden_size.
        leading_dims = modeling.get_shape_list(one_hot_input_ids)[0:-1]
        output = tf.reshape(
            embedded_rows, leading_dims + [config.hidden_size])

    return (output, embedding_table)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 use_one_hot_embeddings):
    """Builds the SQuAD-style span model: start and end logits for each token.

    Args:
      bert_config: Configuration passed to `modeling.BertModel`.
      is_training: Forwarded to `modeling.BertModel`.
      input_ids: Token ids fed to BERT.
      input_mask: Mask fed to BERT.
      segment_ids: Token type ids fed to BERT.
      use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

    Returns:
      Tuple `(start_logits, end_logits)`, each [batch_size, seq_length].
    """
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    final_hidden = bert.get_sequence_output()
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        final_hidden, expected_rank=3)

    output_weights = tf.get_variable(
        "cls/squad/output_weights", [2, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "cls/squad/output_bias", [2], initializer=tf.zeros_initializer())

    # Apply the 2-way linear layer to every token at once.
    token_rows = tf.reshape(
        final_hidden, [batch_size * seq_length, hidden_size])
    logits = tf.nn.bias_add(
        tf.matmul(token_rows, output_weights, transpose_b=True), output_bias)

    logits = tf.reshape(logits, [batch_size, seq_length, 2])
    # Move the start/end axis first so it can be unstacked into two tensors.
    logits = tf.transpose(logits, [2, 0, 1])
    start_logits, end_logits = tf.unstack(logits, axis=0)[:2]

    return (start_logits, end_logits)
def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   size_per_head,
                   initializer,
                   activation,
                   name=None):
    """A dense layer with 3D kernel.

    The kernel is stored as a 2-D variable of shape
    [last_dim, num_attention_heads * size_per_head] and reshaped to 3-D before
    use, so the variable layout matches a regular dense layer.

    Args:
      input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
      num_attention_heads: Number of attention heads.
      size_per_head: The size per attention head.
      initializer: Kernel initializer.
      activation: Activation function, or None for a linear layer.
      name: The name scope of this layer. When None, a uniquified default
        scope name ("dense_layer_3d") is used.

    Returns:
      float logits Tensor of shape
      [batch, seq_length, num_attention_heads, size_per_head].
    """
    last_dim = modeling.get_shape_list(input_tensor)[-1]
    flat_width = num_attention_heads * size_per_head

    # `tf.variable_scope(None)` raises unless `default_name` is given, which
    # made the `name=None` default unusable. `default_name` is ignored when a
    # name is supplied, so existing callers are unaffected.
    with tf.variable_scope(name, default_name="dense_layer_3d"):
        w = tf.get_variable(
            name="kernel",
            shape=[last_dim, flat_width],
            initializer=initializer)
        w = tf.reshape(w, [last_dim, num_attention_heads, size_per_head])
        b = tf.get_variable(
            name="bias",
            shape=[flat_width],
            initializer=tf.zeros_initializer())
        b = tf.reshape(b, [num_attention_heads, size_per_head])
        # a: batch, b: sequence, c: input width, d: head, e: per-head width.
        ret = tf.einsum("abc,cde->abde", input_tensor, w)
        ret += b

    if activation is None:
        return ret
    return activation(ret)
def create_fully_connected_model(is_training, token_embeddings, config,
                                 batch_size, segment_ids):
    """Builds a single linear start/end head directly on the token embeddings.

    `is_training` is accepted but not used by this function.

    Args:
      is_training: Unused.
      token_embeddings: Rank-3 tensor (asserted), treated as
        [batch_size, seq_length, hidden_size].
      config: Object providing the boolean `mask_questions`.
      batch_size: Batch size used for all reshapes.
      segment_ids: Passed to `mask_questions_batch` when
        `config.mask_questions` is set.

    Returns:
      Tuple `(start_logits, end_logits)`, each [batch_size, seq_length].
    """
    shape = get_shape_list(token_embeddings, expected_rank=3)
    seq_length, hidden_size = shape[1], shape[2]
    n_positions = 2  # start and end logits

    output_weights = tf.get_variable(
        "cls/squad/output_weights", [n_positions, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "cls/squad/output_bias", [n_positions],
        initializer=tf.zeros_initializer())

    if config.mask_questions:
        token_embeddings = mask_questions_batch(
            token_embeddings, segment_ids, hidden_size)

    # Apply the linear layer to every token at once.
    token_rows = tf.reshape(
        token_embeddings, [batch_size * seq_length, hidden_size])
    logits = tf.nn.bias_add(
        tf.matmul(token_rows, output_weights, transpose_b=True), output_bias)

    logits = tf.reshape(logits, [batch_size, seq_length, n_positions])
    # Move the start/end axis first so it can be unstacked into two tensors.
    logits = tf.transpose(logits, [2, 0, 1])
    start_logits, end_logits = tf.unstack(logits, axis=0)[:2]

    return (start_logits, end_logits)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Encodes the whole image (and optionally each bounding box) with a ResNet
    hub module, places those vectors at token positions, feeds them together
    with the text to `B2T2Model`, and builds a TRAIN or PREDICT
    `TPUEstimatorSpec`.

    This function reads several names from an enclosing scope that is not
    visible here: `bert_config`, `use_one_hot_embeddings`, `init_checkpoint`,
    `use_tpu`, `learning_rate`, `num_train_steps` and `num_warmup_steps`.

    Args:
      features: Dict of input tensors. Keys read here: "label", "image",
        "input_ids", "input_mask", "segment_ids", and depending on flags
        "bboxes", "bbox_pos", "bbox_idx", "masked_lm_weights",
        "masked_lm_positions", "masked_lm_ids", "img_id", "annot_id",
        "choice_id".
      labels: Unused; the label is read from `features["label"]`.
      mode: A `tf_estimator.ModeKeys` value; only TRAIN and PREDICT are
        supported.
      params: Dict that must contain "batch_size".

    Returns:
      A `contrib_tpu.TPUEstimatorSpec` for the requested mode.

    Raises:
      ValueError: If `mode` is neither TRAIN nor PREDICT.
    """

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))

    label = features["label"]

    is_training = (mode == tf_estimator.ModeKeys.TRAIN)

    # Image encoder shared by the global image and the bounding-box crops.
    module = hub.Module(
        "https://tfhub.dev/google/imagenet/resnet_v2_152/feature_vector/3",
        trainable=FLAGS.trainable_resnet)
    batch_size = params["batch_size"]
    height, width = hub.get_expected_image_size(module)

    # First extract a global image vector.
    image_vector = module(tf.reshape(features["image"],
                                     [batch_size, height, width, 3]),
                          signature="image_feature_vector",
                          as_dict=True)["default"]
    image_vector = tf.reshape(image_vector,
                              [batch_size, 1, FLAGS.image_vector_dim])

    # The global image vector is added in the position of the [IMAGE] token,
    # which comes right after the [CLS] token.
    image_embeddings = tf.concat([
        tf.zeros([batch_size, 1, FLAGS.image_vector_dim]), image_vector,
        tf.zeros(
            [batch_size, FLAGS.max_seq_length - 2, FLAGS.image_vector_dim])
    ], axis=1)

    if FLAGS.use_bboxes:
        # Then extract an image vector for each of the bounding boxes (at most
        # FLAGS.max_num_bboxes).
        boxes_vectors = module(tf.reshape(
            features["bboxes"],
            [batch_size * FLAGS.max_num_bboxes, height, width, 3]),
                               signature="image_feature_vector",
                               as_dict=True)["default"]
        boxes_vectors = tf.reshape(
            boxes_vectors,
            [batch_size, FLAGS.max_num_bboxes, FLAGS.image_vector_dim])

        if FLAGS.use_bbox_position:
            tf.logging.info("Embedding bbox position with 56 positions.")

            # Position embedding for bbox location.
            def _make_position_embedding_table(scope):
                # 56 is 224 / 4
                # Each table row is a quarter of the image vector width, so
                # the four corner embeddings concatenate back to full width.
                return tf.get_variable(
                    scope, [56, FLAGS.image_vector_dim // 4],
                    initializer=tf.truncated_normal_initializer(
                        stddev=0.02))

            position_x_embeds = _make_position_embedding_table(
                "position_x_embeddings")
            position_y_embeds = _make_position_embedding_table(
                "position_y_embeddings")

            # bbox_pos features are top left corner in image, and height and
            # width.
            # Integer-divide by 4 to bucket coordinates into the 56 table rows.
            bbox_pos = features["bbox_pos"] // 4
            y1 = tf.one_hot(bbox_pos[:, :, 0], 56)
            x1 = tf.one_hot(bbox_pos[:, :, 1], 56)
            y2 = tf.one_hot(bbox_pos[:, :, 0] + bbox_pos[:, :, 2], 56)
            x2 = tf.one_hot(bbox_pos[:, :, 1] + bbox_pos[:, :, 3], 56)
            # One-hot times table == embedding lookup for both corners at once.
            bbox_x_embeds = tf.einsum("bixc,cd->bixd",
                                      tf.stack([x1, x2], axis=2),
                                      position_x_embeds)
            bbox_y_embeds = tf.einsum("biyc,cd->biyd",
                                      tf.stack([y1, y2], axis=2),
                                      position_y_embeds)

            # [batch_size, max_num_bboxes, image_vector_size]
            bbox_pos_embeds = tf.concat([
                tf.reshape(bbox_x_embeds,
                           [batch_size, FLAGS.max_num_bboxes, -1]),
                tf.reshape(bbox_y_embeds,
                           [batch_size, FLAGS.max_num_bboxes, -1])
            ], -1)

            boxes_vectors += bbox_pos_embeds

        # Now place the image vectors of each bounding box in the position of
        # each special token that references that bounding box. The letters in
        # the einsum mean the following:
        # b: batch element index
        # t: token index
        # i: bounding box index
        # d: depth of image representation
        image_embeddings += tf.einsum(
            "bti,bid->btd",
            tf.one_hot(features["bbox_idx"], FLAGS.max_num_bboxes),
            boxes_vectors)

    with tf.variable_scope("bert") as scope:
        # This is just like a regular BERT model, but the given image
        # embeddings are added to the input word piece embeddings.
        model = B2T2Model(config=bert_config,
                          is_training=is_training,
                          input_ids=features["input_ids"],
                          image_embeddings=image_embeddings,
                          input_mask=features["input_mask"],
                          token_type_ids=features["segment_ids"],
                          use_one_hot_embeddings=use_one_hot_embeddings,
                          scope=scope)

    # NOTE(review): SOURCE indentation was lost; the classification variables
    # below are assumed to be created outside the "bert" scope -- confirm
    # against existing checkpoints.
    output_weights = tf.get_variable(
        "output_weights", [FLAGS.num_output_labels, bert_config.hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias", [FLAGS.num_output_labels],
                                  initializer=tf.zeros_initializer())

    # Linear classifier on the pooled output.
    logits = tf.matmul(model.get_pooled_output(),
                       output_weights,
                       transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(
             tvars, init_checkpoint)
        if use_tpu:

            # On TPU the checkpoint restore is deferred to the scaffold
            # function handed to the estimator spec.
            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

    output_spec = None
    if mode == tf_estimator.ModeKeys.TRAIN:
        # The loss is the sum of whichever loss terms are enabled by flags;
        # it stays the Python float 0.0 if neither flag is set.
        loss = 0.0
        if FLAGS.negative_loss:
            tf.logging.info("Using negative loss.")
            loss += tf.losses.sparse_softmax_cross_entropy(
                label, logits, reduction=tf.losses.Reduction.MEAN)

        if FLAGS.mask_lm_loss:
            tf.logging.info("Using mask LM loss.")
            # Don't use mask LM for negative captions.
            # Multiplying by the label zeroes the weights where label == 0.
            masked_lm_label_weights = features["masked_lm_weights"]
            masked_lm_label_weights *= tf.expand_dims(
                tf.cast(label, tf.float32), -1)
            (masked_lm_loss, _, _) = run_pretraining.get_masked_lm_output(
                bert_config, model.get_sequence_output(),
                model.get_embedding_table(), features["masked_lm_positions"],
                features["masked_lm_ids"], masked_lm_label_weights)
            loss += masked_lm_loss

        train_op = optimization.create_optimizer(loss, learning_rate,
                                                 num_train_steps,
                                                 num_warmup_steps, use_tpu)

        output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode,
                                                   loss=loss,
                                                   train_op=train_op,
                                                   scaffold_fn=scaffold_fn)
    elif mode == tf_estimator.ModeKeys.PREDICT:
        predictions = {
            # Image, annotation and choice identifiers.
            "img_id": features["img_id"],
            "annot_id": features["annot_id"],
            # Gold label.
            "label": label,
        }

        if FLAGS.mask_lm_loss:
            # We don't care about masked_lm_weights for prediction.
            _, _, masked_lm_log_probs = run_pretraining.get_masked_lm_output(
                bert_config, model.get_sequence_output(),
                model.get_embedding_table(), features["masked_lm_positions"],
                features["masked_lm_ids"], features["masked_lm_weights"])
            # [batch_size * max_preds_per_seq]
            masked_lm_preds = tf.argmax(masked_lm_log_probs,
                                        axis=-1,
                                        output_type=tf.int32)
            # Note: this rebinds `batch_size` from the feature's shape.
            batch_size, max_preds_per_seq = modeling.get_shape_list(
                features["masked_lm_positions"])
            masked_lm_preds = tf.reshape(masked_lm_preds,
                                         [batch_size, max_preds_per_seq])
            predictions.update({
                "input_ids": features["input_ids"],
                "masked_lm_ids": features["masked_lm_ids"],
                "masked_lm_preds": masked_lm_preds
            })

        if FLAGS.has_choice_id:
            predictions["choice_id"] = features["choice_id"]

        # Emit either the raw logits or the arg-max label, never both.
        if FLAGS.do_output_logits:
            predictions["output_logits"] = logits
        else:
            predictions["output_label"] = tf.argmax(logits, axis=-1)

        output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode,
                                                   predictions=predictions,
                                                   scaffold_fn=scaffold_fn)
    else:
        raise ValueError("Only TRAIN and PREDICT modes are supported: %s" %
                         (mode))

    return output_spec
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 span_encoding, max_answer_length, use_one_hot_embeddings):
    """Builds a span model with a selectable span head plus an answer-type head.

    Args:
      bert_config: Configuration passed to `modeling.BertModel`.
      is_training: Forwarded to `modeling.BertModel`; also enables a static
        shape check in the "concat-mlp" branch.
      input_ids: Token ids fed to BERT.
      input_mask: Mask fed to BERT and used to mask joint span logits.
      segment_ids: Token type ids fed to BERT.
      span_encoding: "independent" for separate start/end logits, or
        "concat-mlp" for joint (start, offset) logits.
      max_answer_length: Number of span offsets scored in "concat-mlp" mode.
      use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

    Returns:
      Tuple `(start_logits, end_logits, answer_type_logits)`. In "concat-mlp"
      mode `start_logits` holds the joint span logits and `end_logits` is a
      dummy zero vector of length batch_size.

    Raises:
      ValueError: If `span_encoding` is not one of the two supported values.
    """
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Get the logits for the start and end predictions.
    final_hidden = bert.get_sequence_output()
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        final_hidden, expected_rank=3)

    if span_encoding not in ("independent", "concat-mlp"):
        raise ValueError("Unknown span_encoding: %s" % span_encoding)

    if span_encoding == "independent":
        output_weights = tf.get_variable(
            "cls/coqa/output_weights", [2, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        output_bias = tf.get_variable(
            "cls/coqa/output_bias", [2], initializer=tf.zeros_initializer())

        token_rows = tf.reshape(
            final_hidden, [batch_size * seq_length, hidden_size])
        span_logits = tf.matmul(token_rows, output_weights, transpose_b=True)
        span_logits = tf.nn.bias_add(span_logits, output_bias)
        span_logits = tf.reshape(span_logits, [batch_size, seq_length, 2])
        start_logits, end_logits = tf.unstack(span_logits, axis=2)
    else:  # "concat-mlp"
        with tf.variable_scope("coqa"):
            if is_training:
                # The batch size can be variable during inference.
                final_hidden.shape.assert_is_compatible_with(
                    (batch_size, seq_length, hidden_size))
            joint_logits = compute_joint_mlp_logits(
                final_hidden, max_answer_length)
            start_logits = mask_joint_logits(input_mask, joint_logits)
            end_logits = tf.zeros([batch_size], dtype=tf.float32)  # dummy

    # Get the logits for the answer type prediction.
    # TODO(epitler): Try variants here.
    pooled = bert.get_pooled_output()
    pooled_size = pooled.shape[-1].value

    num_answer_types = 5  # YES, NO, UNKNOWN, EXTRACTIVE, ABSTRACTIVE
    answer_type_output_weights = tf.get_variable(
        "answer_type_output_weights", [num_answer_types, pooled_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    answer_type_output_bias = tf.get_variable(
        "answer_type_output_bias", [num_answer_types],
        initializer=tf.zeros_initializer())

    answer_type_logits = tf.nn.bias_add(
        tf.matmul(pooled, answer_type_output_weights, transpose_b=True),
        answer_type_output_bias)

    return (start_logits, end_logits, answer_type_logits)
def create_mask_model(bert_config, is_training, input_ids, input_mask,
                      segment_ids, mask_positions, use_one_hot_embeddings):
    """Builds a span model that adds a mask-position feature and one extra layer.

    The BERT sequence output is widened by 12 columns (the mask-position flag
    followed by 11 zero columns), passed through a single additional
    transformer layer, and projected to start/end logits. A separate linear
    layer on the pooled output predicts the answer type.

    Args:
      bert_config: Configuration passed to `modeling.BertModel`; also supplies
        the hyper-parameters of the extra transformer layer.
      is_training: Forwarded to `modeling.BertModel`.
      input_ids: Token ids fed to BERT.
      input_mask: Mask fed to BERT and used for the attention mask.
      segment_ids: Token type ids fed to BERT.
      mask_positions: Per-token values reshaped to one column and cast to
        float; presumably a 0/1 flag of shape [batch_size, seq_length] --
        TODO confirm.
      use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

    Returns:
      Tuple `(start_logits, end_logits, answer_type_logits)`.
    """
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Get the logits for the start and end predictions.
    final_hidden = bert.get_sequence_output()
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        final_hidden, expected_rank=3)

    # Number of columns appended to every token vector: the mask-position
    # feature plus zero padding.
    extra_width = 12
    num_tokens = batch_size * seq_length
    widened_size = hidden_size + extra_width

    output_weights = tf.get_variable(
        "cls/nq/output_weights", [2, widened_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "cls/nq/output_bias", [2], initializer=tf.zeros_initializer())

    token_rows = tf.reshape(final_hidden, [num_tokens, hidden_size])

    # Build the extra columns: [mask_position, 0, 0, ..., 0].
    position_column = tf.cast(
        tf.reshape(mask_positions, [num_tokens, 1]), dtype=tf.float32)
    zero_columns = tf.zeros([num_tokens, extra_width - 1], dtype=tf.float32)
    extra_columns = tf.concat([position_column, zero_columns], axis=-1)

    widened_rows = tf.concat([token_rows, extra_columns], axis=-1)
    widened = tf.reshape(widened_rows, [batch_size, seq_length, widened_size])

    attention_mask = modeling.create_attention_mask_from_input_mask(
        input_ids, input_mask)

    # One extra transformer layer over the widened representation.
    all_encoder_layers = modeling.transformer_model(
        input_tensor=widened,
        attention_mask=attention_mask,
        hidden_size=bert_config.hidden_size + extra_width,  # input hidden size
        num_hidden_layers=1,
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(bert_config.hidden_act),
        hidden_dropout_prob=bert_config.hidden_dropout_prob,
        attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
        initializer_range=bert_config.initializer_range,
        do_return_all_layers=True)

    encoded_rows = tf.reshape(
        all_encoder_layers[-1], [num_tokens, widened_size])

    span_logits = tf.matmul(encoded_rows, output_weights, transpose_b=True)
    span_logits = tf.nn.bias_add(span_logits, output_bias)
    span_logits = tf.reshape(span_logits, [batch_size, seq_length, 2])
    # Move the start/end axis first so it can be unstacked into two tensors.
    span_logits = tf.transpose(span_logits, [2, 0, 1])
    start_logits, end_logits = tf.unstack(span_logits, axis=0)[:2]

    # Get the logits for the answer type prediction.
    pooled = bert.get_pooled_output()
    pooled_size = pooled.shape[-1].value

    num_answer_types = 5  # YES, NO, UNKNOWN, SHORT, LONG
    answer_type_output_weights = tf.get_variable(
        "answer_type_output_weights", [num_answer_types, pooled_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    answer_type_output_bias = tf.get_variable(
        "answer_type_output_bias", [num_answer_types],
        initializer=tf.zeros_initializer())

    answer_type_logits = tf.nn.bias_add(
        tf.matmul(pooled, answer_type_output_weights, transpose_b=True),
        answer_type_output_bias)

    return (start_logits, end_logits, answer_type_logits)
def create_contextualized_cnn_model(is_training, token_embeddings, config,
                                    batch_size, segment_ids=None,
                                    name="deconv_model"):
    """Builds a model that convolves each sample with filters generated from it.

    A filter-generator CNN produces per-sample filters from the (optionally
    downsized) embeddings; these are applied to the paragraph representation
    with a per-sample 1-D convolution, and the result is split in half to
    produce start and end logits.

    Author's note from the original: impossible to train in practice (observed
    examples/sec: 0.152973). May need a closer look to figure out if something
    is wrong, or a custom implementation.
    See https://arxiv.org/pdf/1603.07285.pdf

    `name` is accepted but not used by this function.

    Args:
      is_training: Forwarded to `apply_conv_layers`.
      token_embeddings: Rank-3 tensor (asserted).
      config: Object providing `cnn_downsize`, `filter_generator`,
        `filter_generator_pooling` (dict with 'type', 'size', 'strides') and
        `mask_questions`.
      batch_size: Batch size used for reshapes (not read from the input shape,
        which does not work during prediction).
      segment_ids: Passed to `mask_questions_batch` when
        `config.mask_questions` is set.
      name: Unused.

    Returns:
      Tuple `(start_logits, end_logits)`, each [batch_size, seq_length].
    """
    print('batch_size: %d' % batch_size)

    seq_length = get_shape_list(token_embeddings, expected_rank=3)[1]

    # Optional downsizing CNN; skipped when no filter shapes are configured.
    downsized_input = token_embeddings
    if len(config.cnn_downsize.filter_shapes) > 0:
        downsized_input = apply_conv_layers(
            is_training,
            token_embeddings,
            config.cnn_downsize,
            dropout_rate=0.,
            name='contextualized_cnn/downsizer',
        )

    # The batch dimension is not checked: that does not work during prediction.
    # NOTE(review): SOURCE indentation was lost; the asserts are assumed to run
    # unconditionally -- confirm.
    assert downsized_input.shape[1].value == seq_length
    downsized_channels_out = downsized_input.shape[-1].value

    if config.mask_questions:
        paragraphs = mask_questions_batch(
            downsized_input, segment_ids, downsized_channels_out)
    else:
        paragraphs = downsized_input

    # The filters are generated from the unmasked (downsized) input.
    filters = apply_conv_layers(
        is_training,
        downsized_input,
        config.filter_generator,
        name='contextualized_cnn/filter_generator',
        dropout_rate=0.)

    # The batch dimension is not checked: that does not work during prediction.
    assert filters.shape[1].value == seq_length
    n_filters = int(filters.shape[2].value / downsized_channels_out)
    # Filters are split evenly between the start and end heads.
    assert (n_filters % 2) == 0

    pooling_cfg = config.filter_generator_pooling
    pooling_size = [pooling_cfg['size'], 1, 1]
    pooling_strides = [pooling_cfg['strides'], 1, 1]
    pooling_type = config.filter_generator_pooling['type']
    # Any other pooling type leaves the filters unpooled.
    if pooling_type == 'max':
        filters = tf.nn.max_pool1d(
            filters, pooling_size, pooling_strides, 'VALID')
    elif pooling_type == 'avg':
        filters = tf.nn.avg_pool1d(
            filters, pooling_size, pooling_strides, 'VALID')

    contextualized = apply_per_sample_conv1d(
        paragraphs, filters, n_filters, batch_size)

    start_features, end_features = tf.split(contextualized, 2, axis=2)
    feature_channels_out = n_filters // 2

    def compute_logits(features):
        # Linear projection of each token's features to a single logit.
        wd1 = tf.Variable(
            tf.truncated_normal([feature_channels_out, 1], stddev=0.03),
            name='wd1')
        bd1 = tf.Variable(tf.truncated_normal([1], stddev=0.01), name='bd1')
        projected = tf.nn.bias_add(tf.matmul(features, wd1), bd1)
        return tf.reshape(projected, [batch_size, seq_length])

    start_logits = compute_logits(start_features)
    end_logits = compute_logits(end_features)

    return (start_logits, end_logits)
def _create_model(self, mode, input_ids, input_mask, segment_ids, labels,
                  slot_labels, labels_mask, drop_keep_prob, entity_type_ids,
                  sequence_lengths):
  """Creates a LaserTagger model with joint intent and slot heads.

  BERT's sequence output is combined with a weighted entity-type embedding,
  run through a bidirectional LSTM, and projected to intent logits (from the
  concatenated final LSTM states) and per-token slot logits (from the
  layer-normalised LSTM outputs).

  Args:
    mode: a `tf.estimator.ModeKeys` value; no loss is built for PREDICT.
    input_ids, input_mask, segment_ids: forwarded to `modeling.BertModel`.
    labels: intent labels for the sparse softmax cross entropy.
    slot_labels: per-token slot labels for the sparse softmax cross entropy.
    labels_mask: summed over axis 1 to normalise the slot loss.
    drop_keep_prob: keep probability of the two training-time dropouts.
    entity_type_ids: reshaped to `[batch * seq, entity_type_num]` and cast
      to float (presumably a multi-hot entity-type indicator -- confirm).
    sequence_lengths: forwarded to `tf.nn.bidirectional_dynamic_rnn`.

  Returns:
    Tuple `(loss, per_example_slot_loss, pred_intent, pred_slot, batch_size,
    entity_type_emb, impact_weight_matrix, entity_type_ids_matrix,
    final_layer, slot_logits)`; the first two are None in PREDICT mode.
  """
  training = mode == tf.estimator.ModeKeys.TRAIN

  bert = modeling.BertModel(
      config=self._config,
      is_training=training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=self._use_one_hot_embeddings)
  final_layer = bert.get_sequence_output()
  if training:
    # I.e., 0.1 dropout
    final_layer = tf.nn.dropout(final_layer, keep_prob=drop_keep_prob)

  # Incorporate the entity-type information.
  batch_size, seq_length = modeling.get_shape_list(input_ids)
  hidden_size = self._config.hidden_size
  init_bound = self._config.initializer_range * 100
  self.entity_type_embedding = tf.get_variable(
      name="entity_type_embedding",
      shape=(self.entity_type_num, hidden_size),
      dtype=tf.float32,
      trainable=True,
      initializer=tf.random_uniform_initializer(
          -init_bound, init_bound, seed=20))

  # NOTE(review): the extent of the init_scope could not be recovered from
  # the source layout; only the constant is created inside it here -- confirm.
  with tf.init_scope():
    impact_weight_init = tf.constant(
        1.0 / self.entity_type_num,
        dtype=tf.float32,
        shape=(1, self.entity_type_num))
  # Impact weight of each entity type, initialised uniformly.
  self.impact_weight = tf.Variable(
      impact_weight_init, dtype=tf.float32, name="impact_weight")

  impact_weight_matrix = tf.tile(
      self.impact_weight, multiples=[batch_size * seq_length, 1])
  flat_entity_ids = tf.cast(
      tf.reshape(entity_type_ids,
                 [batch_size * seq_length, self.entity_type_num]),
      dtype=tf.float32)
  entity_type_ids_matrix = tf.multiply(flat_entity_ids, impact_weight_matrix)
  entity_type_emb = tf.matmul(entity_type_ids_matrix,
                              self.entity_type_embedding)
  # The entity embedding is added to (not concatenated with) the BERT output.
  final_layer = final_layer + tf.reshape(
      entity_type_emb, [batch_size, seq_length, hidden_size])
  # TODO: 0.7071067811865476 is sqrt(2) / 2.

  if training:
    final_layer = tf.nn.dropout(final_layer, keep_prob=drop_keep_prob)

  rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(
      cell_fw=LSTMCell(self.lstm_hidden_size),
      cell_bw=LSTMCell(self.lstm_hidden_size),
      inputs=final_layer,
      sequence_length=sequence_lengths,
      dtype=tf.float32)
  output_fw_seq, output_bw_seq = rnn_outputs
  (c_fw, _), (c_bw, _) = rnn_states

  # Sentence-level summary for the intent head.
  final_hidden = tf.concat([c_fw, c_bw], axis=-1)
  # Token-level features for the slot head.
  layer_matrix = tf.contrib.layers.layer_norm(
      inputs=tf.concat([output_fw_seq, output_bw_seq], axis=-1),
      begin_norm_axis=-1,
      begin_params_axis=-1)

  intent_logits = tf.layers.dense(
      final_hidden,
      self._num_tags,
      kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
      name="output_projection")
  slot_logits = tf.layers.dense(
      layer_matrix,
      self.num_slot_tags,
      kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
      name="slot_projection")

  with tf.variable_scope("loss"):
    loss = None
    per_example_intent_loss = None
    per_example_slot_loss = None
    if mode != tf.estimator.ModeKeys.PREDICT:
      per_example_intent_loss = (
          tf.nn.sparse_softmax_cross_entropy_with_logits(
              labels=labels, logits=intent_logits))
      token_slot_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=slot_labels, logits=slot_logits)
      # Summed token losses divided by the number of labelled tokens.
      per_example_slot_loss = tf.truediv(
          tf.reduce_sum(token_slot_loss, axis=1),
          tf.cast(tf.reduce_sum(labels_mask, axis=1), tf.float32))
      # (A CRF-based slot loss was tried here previously and is disabled.)
      weighted_loss = (self.intent_ratio * per_example_intent_loss +
                       self.slot_ratio * per_example_slot_loss)
      loss = tf.reduce_mean(weighted_loss)

    pred_intent = tf.cast(tf.argmax(intent_logits, axis=-1), tf.int32)
    pred_slot = tf.cast(tf.argmax(slot_logits, axis=-1), tf.int32)
    return (loss, per_example_slot_loss, pred_intent, pred_slot, batch_size,
            entity_type_emb, impact_weight_matrix, entity_type_ids_matrix,
            final_layer, slot_logits)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 use_one_hot_embeddings):
  """Creates span (start/end) and answer-type heads on top of XLM-RoBERTa.

  Args:
    bert_config: unused; kept for signature compatibility.
    is_training: forwarded to the encoder as `training`.
    input_ids: forwarded to the encoder as `inputs`.
    input_mask: forwarded to the encoder as `attention_mask`.
    segment_ids: forwarded to the encoder as `token_type_ids`.
    use_one_hot_embeddings: unused; kept for signature compatibility.

  Returns:
    `(start_logits, end_logits, answer_type_logits)`. The first two come from
    a 2-way projection of the encoder's first output, reshaped to
    `[batch_size, seq_length, 2]` and unstacked; the last is a 5-way
    projection of the encoder's second output.
  """
  encoder = TFXLMRobertaModel.from_pretrained('xlm_model', from_pt=True)
  outputs = encoder(
      inputs=input_ids,
      attention_mask=input_mask,
      token_type_ids=segment_ids,
      training=is_training)

  # --- Start / end position logits -----------------------------------------
  final_hidden = outputs[0]
  batch_size, seq_length, hidden_size = bert_modeling.get_shape_list(
      final_hidden, expected_rank=3)

  span_weights = tf.get_variable(
      "cls/tydi/output_weights", [2, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  span_bias = tf.get_variable(
      "cls/tydi/output_bias", [2], initializer=tf.zeros_initializer())

  flat_hidden = tf.reshape(final_hidden,
                           [batch_size * seq_length, hidden_size])
  span_logits = tf.nn.bias_add(
      tf.matmul(flat_hidden, span_weights, transpose_b=True), span_bias)
  span_logits = tf.reshape(span_logits, [batch_size, seq_length, 2])
  # Move the start/end axis to the front so it can be unstacked.
  span_logits = tf.transpose(span_logits, [2, 0, 1])
  start_logits, end_logits = tf.unstack(span_logits, axis=0)[:2]

  # --- Answer type logits ----------------------------------------------------
  pooled = outputs[1]
  pooled_size = pooled.shape[-1]
  num_answer_types = 5  # YES, NO, UNKNOWN, PASSAGE, MINIMAL
  type_weights = tf.get_variable(
      "answer_type_output_weights", [num_answer_types, pooled_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  type_bias = tf.get_variable(
      "answer_type_output_bias", [num_answer_types],
      initializer=tf.zeros_initializer())
  answer_type_logits = tf.nn.bias_add(
      tf.matmul(pooled, type_weights, transpose_b=True), type_bias)

  return start_logits, end_logits, answer_type_logits
def build_attn_layer(self,
                     input_tensor,
                     attn_mask_concat,
                     layer_attn_type,
                     num_attention_heads=1,
                     size_per_head=512,
                     attention_probs_dropout_prob=0.1,
                     initializer_range=0.02,
                     do_return_2d_tensor=False):
  """Builds one attention block over a pair of concatenated arguments.

  `input_tensor` is treated as two equal-length halves (arg1, arg2) laid out
  along axis 1. Depending on `layer_attn_type`:

    * "self":  one attention over the whole concatenated sequence.
    * "inter": arg1 attends to arg2 and arg2 attends to arg1.
    * anything else: each argument attends to itself.

  In the two split cases the per-argument results are concatenated on axis 1.

  Args:
    input_tensor: rank-3 tensor; axis 1 is the concatenated sequence.
    attn_mask_concat: mask for the concatenated sequence, sliced on axis 1
      the same way as `input_tensor`.
    layer_attn_type: attention flavour, see above.
    num_attention_heads, size_per_head, attention_probs_dropout_prob,
      initializer_range, do_return_2d_tensor: forwarded unchanged to
      `modeling.attention_layer`.

  Returns:
    The attention output tensor.
  """
  # TODO (May 5): To capture each softmax output, will need a modified
  # `attention_layer`
  shape = modeling.get_shape_list(input_tensor, expected_rank=3)
  batch_size, total_seq_length = shape[0], shape[1]
  arg_seq_length = int(total_seq_length / 2)

  def attend(source, target, mask, length):
    """Runs `modeling.attention_layer` with this block's shared settings."""
    return modeling.attention_layer(
        from_tensor=source,
        to_tensor=target,
        attention_mask=mask,
        num_attention_heads=num_attention_heads,
        size_per_head=size_per_head,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        initializer_range=initializer_range,
        do_return_2d_tensor=do_return_2d_tensor,
        batch_size=batch_size,
        from_seq_length=length,
        to_seq_length=length)

  if layer_attn_type == "self":
    full_mask = modeling.create_attention_mask_from_input_mask(
        input_tensor, attn_mask_concat)
    return attend(input_tensor, input_tensor, full_mask, total_seq_length)

  # Split the concatenated sequence (and its mask) into the two arguments.
  arg1 = input_tensor[:, :arg_seq_length, :]
  arg2 = input_tensor[:, arg_seq_length:, :]
  arg1_attn_mask = modeling.create_attention_mask_from_input_mask(
      arg1, attn_mask_concat[:, :arg_seq_length])
  arg2_attn_mask = modeling.create_attention_mask_from_input_mask(
      arg2, attn_mask_concat[:, arg_seq_length:])

  if layer_attn_type == "inter":
    # Each argument attends to the other; the mask follows the target.
    with tf.variable_scope("arg1_arg2"):
      first = attend(arg1, arg2, arg2_attn_mask, arg_seq_length)
    with tf.variable_scope("arg2_arg1"):
      second = attend(arg2, arg1, arg1_attn_mask, arg_seq_length)
  else:
    # Each argument attends to itself.
    with tf.variable_scope("arg1_arg1"):
      first = attend(arg1, arg1, arg1_attn_mask, arg_seq_length)
    with tf.variable_scope("arg2_arg2"):
      second = attend(arg2, arg2, arg2_attn_mask, arg_seq_length)

  return tf.concat([first, second], axis=1)
def create_original_varmisuse_model(
    bert_config,
    is_training,
    enable_sequence_masking,
    input_ids,
    input_mask,
    segment_ids,
    candidate_mask,
    target_mask,
    error_location_mask,
    use_one_hot_embeddings,
    multi_head_count=2,
):
  """Creates a two-headed pointer model on top of BERT.

  Pointer probabilities are an attention vector over program tokens:

    M                  = tanh(Y * Wy_extend + h_extend * Wh_extend)
    multi_headed_alpha = softmax(M * w_extend)

  with Y the BERT sequence output `[batch, seq, hidden]`, h the pooled output
  `[batch, hidden]` (tiled over the sequence), Wy and Wh `[hidden, hidden]`,
  and w `[hidden, multi_head_count]`; the `*_extend` weights are tiled over
  the batch so the products can be batched matmuls.

  Args:
    bert_config, is_training, input_ids, input_mask, segment_ids,
      use_one_hot_embeddings: forwarded to `modeling.BertModel`.
    enable_sequence_masking: if truthy, the sequence output is multiplied by
      `candidate_mask` before entering the pointer network.
    candidate_mask: `[batch, seq]` mask; also multiplied into the logits.
    target_mask: labels for the repair head.
    error_location_mask: labels for the localization head.
    multi_head_count: number of pointer heads; the result is unstacked into
      exactly two heads (location, repair).

  Returns:
    `(loss, per_example_loss, logits_masked, probabilities)`.
  """
  bert = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  final_sequence = bert.get_sequence_output()
  batch_size, sequence_length, hidden_size = modeling.get_shape_list(
      final_sequence, expected_rank=3)
  cls_output = bert.get_pooled_output()

  def pointer_weight(name, shape):
    """Creates one Xavier-initialised float32 pointer-network weight."""
    return tf.get_variable(
        name,
        shape=shape,
        dtype=tf.float32,
        initializer=contrib.layers.xavier_initializer())

  wy = pointer_weight("Wy", [hidden_size, hidden_size])
  wh = pointer_weight("Wh", [hidden_size, hidden_size])
  w = pointer_weight("w", [hidden_size, multi_head_count])

  def tile_over_batch(weight):
    """Adds a leading batch axis to `weight` and tiles it `batch_size` times."""
    return tf.tile(tf.expand_dims(weight, 0), [batch_size, 1, 1])

  # Dimensions: [batch_size, hidden_size, hidden_size]
  wy_extend = tile_over_batch(wy)
  wh_extend = tile_over_batch(wh)
  # Dimensions: [batch_size, sequence_length, hidden_size]
  cls_output_extend = tf.tile(
      tf.expand_dims(cls_output, 1), [1, sequence_length, 1])

  candidate_mask_expanded = tf.expand_dims(candidate_mask, 2)

  pointer_input = final_sequence
  if enable_sequence_masking:
    # Mask sequence using `candidate_mask`.
    hidden_mask = tf.tile(candidate_mask_expanded, [1, 1, hidden_size])
    pointer_input = tf.multiply(final_sequence, tf.to_float(hidden_mask))
  m = tf.tanh(
      tf.matmul(pointer_input, wy_extend) +
      tf.matmul(cls_output_extend, wh_extend))

  # Dimension: [batch_size, hidden_size, multi_head_count]
  w_extend = tile_over_batch(w)
  # Dimension: [batch_size, sequence_length, multi_head_count]
  logits = tf.matmul(m, w_extend)

  # Mask logits using `candidate_mask` (multiplicatively, per head).
  head_mask = tf.tile(candidate_mask_expanded, [1, 1, multi_head_count])
  logits_masked = tf.multiply(logits, tf.to_float(head_mask))

  # Softmax over the sequence axis, independently for each head.
  probabilities = tf.nn.softmax(logits_masked, axis=1)
  location_probabilities, repair_probabilities = tf.unstack(
      probabilities, axis=2)

  def pointer_loss(label_mask, probs):
    """Negative log-probability mass on the labelled positions."""
    clipped_log = tf.log(tf.clip_by_value(probs, 1e-10, 1.0))
    return -tf.reduce_sum(
        tf.multiply(tf.to_float(label_mask), clipped_log), axis=1)

  localization_loss = pointer_loss(error_location_mask,
                                   location_probabilities)
  repair_loss = pointer_loss(target_mask, repair_probabilities)
  per_example_loss = localization_loss + repair_loss
  loss = tf.reduce_mean(per_example_loss)

  return loss, per_example_loss, logits_masked, probabilities
def __init__(self,
             config,
             is_training,
             input_ids,
             image_embeddings,
             input_mask=None,
             token_type_ids=None,
             use_one_hot_embeddings=False,
             scope=None):
  """Builds a BERT encoder whose input embeddings include image features.

  Args:
    config: `BertConfig` instance; a deep copy is used, so the caller's
      object is never modified.
    is_training: bool. When false, both dropout probabilities on the copied
      config are forced to 0.0.
    input_ids: int32 Tensor of shape [batch_size, seq_length].
    image_embeddings: float32 Tensor of shape [batch_size, seq_length, depth];
      projected to `config.hidden_size` with a tanh dense layer and added to
      the text embeddings.
    input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      Defaults to all ones.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Defaults to all zeros.
    use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
      embeddings or tf.embedding_lookup() for the word embeddings.
    scope: (optional) variable scope. Defaults to "bert".

  Raises:
    ValueError: The config is invalid or one of the input tensor shapes
      is invalid.
  """
  config = copy.deepcopy(config)
  if not is_training:
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0

  batch_size, text_seq_length = modeling.get_shape_list(
      input_ids, expected_rank=2)

  if input_mask is None:
    input_mask = tf.ones(
        shape=[batch_size, text_seq_length], dtype=tf.int32)
  if token_type_ids is None:
    token_type_ids = tf.zeros(
        shape=[batch_size, text_seq_length], dtype=tf.int32)

  with tf.variable_scope(scope, default_name="bert"):
    with tf.variable_scope("embeddings"):
      # Word-piece embedding lookup.
      word_embeddings, self.embedding_table = modeling.embedding_lookup(
          input_ids=input_ids,
          vocab_size=config.vocab_size,
          embedding_size=config.hidden_size,
          initializer_range=config.initializer_range,
          word_embedding_name="word_embeddings",
          use_one_hot_embeddings=use_one_hot_embeddings)
      self.embedding_output = word_embeddings

      # Positional and token-type embeddings, then layer norm and dropout.
      self.embedding_output = modeling.embedding_postprocessor(
          input_tensor=self.embedding_output,
          use_token_type=True,
          token_type_ids=token_type_ids,
          token_type_vocab_size=config.type_vocab_size,
          token_type_embedding_name="token_type_embeddings",
          use_position_embeddings=True,
          position_embedding_name="position_embeddings",
          initializer_range=config.initializer_range,
          max_position_embeddings=config.max_position_embeddings,
          dropout_prob=config.hidden_dropout_prob)

      # Project the image embeddings to the hidden size and add them on top
      # of the already post-processed text embeddings.
      # NOTE(review): assumed to sit inside the "embeddings" scope; the
      # source layout does not show the nesting -- confirm before relying on
      # the variable name of this dense layer.
      projected_images = tf.layers.dense(
          image_embeddings,
          config.hidden_size,
          activation=tf.tanh,
          kernel_initializer=modeling.create_initializer(
              config.initializer_range))
      self.embedding_output = self.embedding_output + projected_images

    with tf.variable_scope("encoder"):
      # Turn the 2D [batch_size, seq_length] mask into the 3D
      # [batch_size, seq_length, seq_length] mask used on attention scores.
      attention_mask = modeling.create_attention_mask_from_input_mask(
          self.embedding_output, input_mask)

      # Stacked transformer; every layer's output is kept.
      self.all_encoder_layers = modeling.transformer_model(
          input_tensor=self.embedding_output,
          attention_mask=attention_mask,
          hidden_size=config.hidden_size,
          num_hidden_layers=config.num_hidden_layers,
          num_attention_heads=config.num_attention_heads,
          intermediate_size=config.intermediate_size,
          intermediate_act_fn=modeling.get_activation(config.hidden_act),
          hidden_dropout_prob=config.hidden_dropout_prob,
          attention_probs_dropout_prob=config.attention_probs_dropout_prob,
          initializer_range=config.initializer_range,
          do_return_all_layers=True)

    # `sequence_output` shape = [batch_size, seq_length, hidden_size].
    self.sequence_output = self.all_encoder_layers[-1]

    # The pooler reduces [batch_size, seq_length, hidden_size] to
    # [batch_size, hidden_size] for segment-level classification tasks.
    with tf.variable_scope("pooler"):
      # "Pooling" is simply the hidden state of the first token, passed
      # through a tanh dense layer (assumed to have been pre-trained).
      first_token_tensor = tf.squeeze(
          self.sequence_output[:, 0:1, :], axis=1)
      self.pooled_output = tf.layers.dense(
          first_token_tensor,
          config.hidden_size,
          activation=tf.tanh,
          kernel_initializer=modeling.create_initializer(
              config.initializer_range))
def __init__(
    self,
    bert_config,
    char_config,
    is_training,
    # is_evaluation,
    input_token_ids,
    input_char_ids,
    labels,
    num_labels,
    use_char_representation=True,
    input_mask=None,
    segment_ids=None,
    use_one_hot_embeddings=False,  # True when running with TPU acceleration
    scope=None):
  """Builds a BERT (+ optional char-CNN) encoder followed by a CRF layer.

  Args:
    bert_config: config forwarded to `modeling.BertModel`.
    char_config: mapping with the keys 'char_embed_dim', 'filters',
      'alphabet_size', 'activations', 'n_highway', 'projection_dim' and
      'char_dropout_rate'. Only read when `use_char_representation` is true.
    is_training: whether the estimator is running in train mode.
    input_token_ids: token ids fed to BERT.
    input_char_ids: character ids fed to the char-CNN.
    labels: gold labels, forwarded to the CRF.
    num_labels: number of labels; used for the CRF transition matrix.
    use_char_representation: if true, the char-CNN highway output is
      concatenated to the BERT sequence output on the last axis.
    input_mask: (optional) mask over `input_token_ids`. When None, an
      all-ones int32 mask of the same shape is used, which is the same
      default `modeling.BertModel` applies internally.
    segment_ids: (optional) BERT segment ids. Of little use here because
      NER works on a single segment, so BERT's default of all zeros is fine.
    use_one_hot_embeddings: whether to use one-hot embeddings (TPU).
    scope: unused.

  Raises:
    ValueError: if the token and char representations disagree on the
      number of time steps.
  """
  if input_mask is None:
    # The CRF needs sequence lengths, which are derived from the mask below;
    # without this default, `tf.reduce_sum(None, ...)` would fail even
    # though the signature advertises `input_mask=None`.
    token_ids_shape = modeling.get_shape_list(input_token_ids,
                                              expected_rank=2)
    input_mask = tf.ones(
        shape=[token_ids_shape[0], token_ids_shape[1]], dtype=tf.int32)

  self.bert_model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_token_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)
  self.token_output = self.bert_model.get_sequence_output()

  if use_char_representation:
    char_embed_dim = char_config['char_embed_dim']
    filters = char_config['filters']
    alphabet_size = char_config['alphabet_size']
    activations = char_config['activations']
    n_highway = char_config['n_highway']
    projection_dim = char_config['projection_dim']
    # NOTE(review): 1.0 outside training suggests this value is consumed as
    # a keep probability despite its name -- confirm in CharRepresentation.
    char_dropout_rate = (
        char_config['char_dropout_rate'] if is_training else 1.0)

    self.charcnn_model = CharRepresentation(
        char_input=input_char_ids,
        alphabet_size=alphabet_size,
        filters=filters,
        projection_dim=projection_dim,
        char_embed_dim=char_embed_dim,
        activations=activations,
        n_highway=n_highway,
        dropout_rate=char_dropout_rate)
    self.char_output = self.charcnn_model.get_highway_output()

    # Both representations must cover the same number of time steps before
    # they can be concatenated feature-wise.
    token_shape = modeling.get_shape_list(self.token_output,
                                          expected_rank=3)
    char_shape = modeling.get_shape_list(self.char_output, expected_rank=3)
    if token_shape[1] != char_shape[1]:
      raise ValueError(
          "The time steps of token representation (%d) is not the same as char representation (%d) "
          % (token_shape[1], char_shape[1]))

    self.final_output = tf.concat(
        [self.token_output, self.char_output], axis=-1)
  else:
    tf.logging.info(
        "****************BERT representation only***************")
    self.final_output = self.token_output

  # Number of unmasked positions per example.
  sequence_lengths = tf.reduce_sum(input_mask, axis=-1)
  self.crf = CRF(
      input=self.final_output,
      labels=labels,
      num_labels=num_labels,
      lengths=sequence_lengths,
      is_training=is_training,
      # is_evaluation is intentionally not passed: the estimator's evaluate
      # mode still needs the loss to be returned.
  )
for feature in features: feature.unique_id = feature.example_index + feature.context_index + feature.doc_span_index output_fn(feature) return num_spans_to_ids def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model to classify position of start and end token""" model = modeling.BertModel(config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable("cls/emr/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable("cls/emr/output_bias", [2], initializer=tf.zeros_initializer()) # to be able to use checkpoints from SQUAD trained BERT QA model, replace 'emr' by 'squad' in names of above variables final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias)
def encode_concat_context(self,
                          input_tensor,
                          segment_ids,
                          segment_vocab_size=16,
                          max_position_embeddings=512,
                          hidden_dropout_prob=0.1,
                          initializer_range=0.02,
                          use_segment_ids=False,
                          use_position_embedding=False):
  """Adds optional segment/position embeddings, then layer norm + dropout.

  See `embedding_postprocessor` defined in `bert/modeling.py`.

  Args:
    input_tensor: rank-3 tensor `[batch_size, seq_length, width]`.
    segment_ids: ids looked up in the segment table; only read when
      `use_segment_ids` is true.
    segment_vocab_size: number of rows of the segment embedding table.
    max_position_embeddings: number of rows of the position embedding table.
    hidden_dropout_prob: forwarded to `modeling.layer_norm_and_dropout`.
    initializer_range: forwarded to `modeling.create_initializer`.
    use_segment_ids: whether to add learned segment embeddings.
    use_position_embedding: whether to add learned position embeddings.

  Returns:
    Tensor produced by `modeling.layer_norm_and_dropout` on the summed
    embeddings.
  """
  batch_size, seq_length, width = modeling.get_shape_list(
      input_tensor, expected_rank=3)
  output = input_tensor

  if use_segment_ids:
    segment_table = tf.get_variable(
        name="segment_embeddings",
        shape=[segment_vocab_size, width],
        initializer=modeling.create_initializer(initializer_range))
    # One-hot matmul lookup over the flattened ids.
    one_hot_ids = tf.one_hot(
        tf.reshape(segment_ids, [-1]), depth=segment_vocab_size)
    segment_embeddings = tf.reshape(
        tf.matmul(one_hot_ids, segment_table),
        [batch_size, seq_length, width])
    output = output + segment_embeddings

  if use_position_embedding:
    position_table = tf.get_variable(
        name="position_embeddings",
        shape=[max_position_embeddings, width],
        initializer=modeling.create_initializer(initializer_range))
    # The table is learned for positions [0, max_position_embeddings - 1]
    # while the current sequence only covers [0, seq_length - 1]; shorter
    # sequences train faster, so just slice off the rows that are needed.
    position_embeddings = tf.slice(position_table, [0, 0],
                                   [seq_length, -1])

    # Only the last two dimensions (`seq_length`, `width`) matter; all
    # leading dimensions (typically just the batch) are broadcast.
    rank = len(output.shape.as_list())
    broadcast_shape = [1] * (rank - 2) + [seq_length, width]
    output = output + tf.reshape(position_embeddings, broadcast_shape)

  return modeling.layer_norm_and_dropout(output, hidden_dropout_prob)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 num_labels, use_one_hot_embeddings, membership_features_str):
  """Creates a 2-way membership classifier on top of BERT features.

  BERT itself is always built with `is_training=False`. A standard
  `num_labels`-way classification head is built on the pooled output, and a
  second linear layer maps a selected feature set to two membership logits.

  Args:
    bert_config, input_ids, input_mask, segment_ids, use_one_hot_embeddings:
      forwarded to `modeling.BertModel`.
    is_training: only controls the 0.1 dropout on the pooled output.
    num_labels: number of classes of the classification head.
    membership_features_str: which features feed the membership layer:
      "last_plus_logits" (flattened sequence output, pooled output, logits),
      "last" (flattened sequence output only) or
      "logits" (pooled output and logits).

  Returns:
    `(membership_logits, [membership_weights, membership_bias])`. The
    variables are returned because only they are meant to be optimised.

  Raises:
    ValueError: if `membership_features_str` is not one of the three
      supported values.
  """
  supported_feature_sets = ("last_plus_logits", "last", "logits")
  if membership_features_str not in supported_feature_sets:
    # Previously an unknown value only surfaced later as an unbound-variable
    # error on `membership_features`.
    raise ValueError(
        "Unsupported membership_features_str: %r (expected one of %s)" %
        (membership_features_str, ", ".join(supported_feature_sets)))

  model = modeling.BertModel(
      config=bert_config,
      is_training=False,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # Segment-level (pooled) and token-level outputs.
  output_layer = model.get_pooled_output()
  final_hidden = model.get_sequence_output()

  final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
  batch_size = final_hidden_shape[0]
  seq_length = final_hidden_shape[1]
  # The pooled output's width is what sizes the head and the flattening.
  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  # NOTE(review): everything below is kept inside the "loss" scope, as in the
  # BERT classifier this derives from; the source layout does not show where
  # the scope ends -- confirm against existing checkpoints' variable names.
  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    hidden_stacked = tf.reshape(final_hidden,
                                [batch_size, seq_length * hidden_size])

    # Choose the input feature set for the membership classifier.
    if membership_features_str == "last_plus_logits":
      membership_features = tf.concat(
          [hidden_stacked, output_layer, logits], axis=1)
    elif membership_features_str == "last":
      membership_features = hidden_stacked
    else:  # "logits"
      membership_features = tf.concat([output_layer, logits], axis=1)

    num_membership_features = modeling.get_shape_list(
        membership_features, expected_rank=2)[1]
    membership_weights = tf.get_variable(
        "cls/membership/weights", [2, num_membership_features],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    membership_bias = tf.get_variable(
        "cls/membership/bias", [2], initializer=tf.zeros_initializer())

    membership_logits = tf.matmul(
        membership_features, membership_weights, transpose_b=True)
    membership_logits = tf.nn.bias_add(membership_logits, membership_bias)

    # Return the weights since we only want to optimize them.
    return membership_logits, [membership_weights, membership_bias]
def model_fn(features, labels, mode, params, global_step):  # pylint: disable=unused-argument
  """Builds the model and its training loss for one (eager-mode) step.

  Args:
    features: mapping with "input_ids", "input_mask", "segment_ids",
      "start_positions", "end_positions" and "answer_types".
    labels: unused.
    mode: a `tf.estimator.ModeKeys` value; only TRAIN is supported.
    params: unused.
    global_step: current step; the checkpoint is only loaded when it is 0.

  Returns:
    `(tvars, total_loss)`: the trainable variables and the mean of the
    start-position, end-position and answer-type losses.

  Raises:
    ValueError: if `mode` is not TRAIN. This function has no result for any
      other mode, so it now fails explicitly instead of hitting an undefined
      `total_loss`.
  """
  if mode != tf.estimator.ModeKeys.TRAIN:
    raise ValueError("Only TRAIN mode is supported: %s" % (mode))

  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  segment_ids = features["segment_ids"]

  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  (start_logits, end_logits, answer_type_logits) = create_model(
      bert_config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  tvars = tf.trainable_variables()
  print('tvars', tvars)

  initialized_variable_names = {}
  scaffold_fn = None
  # Only initialize from the checkpoint on the first step in eager mode.
  if tf.equal(global_step, 0):
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = bert_modeling.get_assignment_map_from_checkpoint(
          tvars, init_checkpoint)
      if use_tpu:

        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  logging.info("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                 init_string)

  seq_length = bert_modeling.get_shape_list(input_ids)[1]

  def one_hot_cross_entropy(logits, targets, depth):
    """Mean softmax cross entropy of `logits` against integer `targets`."""
    one_hot_targets = tf.one_hot(targets, depth=depth, dtype=tf.float32)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    return -tf.reduce_mean(
        tf.reduce_sum(one_hot_targets * log_probs, axis=-1))

  start_positions = features["start_positions"]
  end_positions = features["end_positions"]
  answer_types = features["answer_types"]

  # Position losses range over the sequence, the label loss over the
  # answer-type enum.
  start_loss = one_hot_cross_entropy(start_logits, start_positions,
                                     seq_length)
  end_loss = one_hot_cross_entropy(end_logits, end_positions, seq_length)
  answer_type_loss = one_hot_cross_entropy(answer_type_logits, answer_types,
                                           len(data.AnswerType))

  total_loss = (start_loss + end_loss + answer_type_loss) / 3.0
  return tvars, total_loss
def build_attn_layers(self,
                      input_tensor,
                      attn_mask_concat,
                      intermediate_size=2048,
                      intermediate_act_fn=modeling.gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  """Stacks `self.num_hidden_layers` attention + feed-forward layers.

  See `transformer_model` / `attention_layer` defined in `bert/modeling.py`.
  `self.attention_type` is split on "_": with three parts the first two
  alternate by layer parity, otherwise the first part is used for every
  layer.

  Args:
    input_tensor: rank-3 tensor `[batch, arg_length, BERT_hidden_size]`, for
      example `[64, 128, 768]` with the default hparams.
    attn_mask_concat: mask forwarded to `self.build_attn_layer`.
    intermediate_size: width of the feed-forward "intermediate" layer.
    intermediate_act_fn: activation of the intermediate layer only.
    hidden_dropout_prob: dropout after the dense projections; forced to 0.0
      when `self.is_training` is false.
    attention_probs_dropout_prob: attention dropout; forced to 0.0 when
      `self.is_training` is false.
    initializer_range: forwarded to `modeling.create_initializer`.
    do_return_all_layers: return every layer's output instead of the last.

  Returns:
    The last layer's output reshaped to the input shape, or a list with one
    such tensor per layer when `do_return_all_layers` is true.
  """
  if not self.is_training:
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

  size_per_head = int(self.hidden_size / self.num_attention_heads)
  input_shape = modeling.get_shape_list(input_tensor, expected_rank=3)

  attn_types = self.attention_type.split("_")
  alternate_by_layer = len(attn_types) == 3

  prev_output = input_tensor
  all_layer_outputs = []
  for layer_idx in range(self.num_hidden_layers):
    with tf.variable_scope(f"layer_{layer_idx}"):
      layer_input = prev_output

      type_index = layer_idx % 2 if alternate_by_layer else 0
      layer_attn_type = attn_types[type_index]
      tf.logging.info(
          f"{layer_attn_type.capitalize()} Attention at {layer_idx}th Layer")

      # NOTE(review): every layer attends over the original `input_tensor`;
      # `layer_input` only enters through the residual below. BERT's
      # `transformer_model` feeds `layer_input` here -- confirm this is
      # intended before changing it, since trained weights depend on it.
      with tf.variable_scope(f"{layer_attn_type}_attn"):
        attention_output = self.build_attn_layer(
            input_tensor=input_tensor,
            attn_mask_concat=attn_mask_concat,
            layer_attn_type=layer_attn_type,
            num_attention_heads=self.num_attention_heads,
            size_per_head=size_per_head,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            initializer_range=initializer_range,
            do_return_2d_tensor=False)

      # Linear projection to `hidden_size`, then residual with `layer_input`.
      # NOTE(review): this scope is placed directly under the layer scope;
      # the source layout does not show its nesting -- confirm.
      with tf.variable_scope("output"):
        attention_output = tf.layers.dense(
            attention_output,
            self.hidden_size,
            kernel_initializer=modeling.create_initializer(
                initializer_range))
        attention_output = modeling.dropout(attention_output,
                                            hidden_dropout_prob)
        attention_output = modeling.layer_norm(attention_output +
                                               layer_input)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=modeling.create_initializer(
                initializer_range))

      # Down-project back to `hidden_size`, then add the residual.
      with tf.variable_scope("output"):
        layer_output = tf.layers.dense(
            intermediate_output,
            self.hidden_size,
            kernel_initializer=modeling.create_initializer(
                initializer_range))
        layer_output = modeling.dropout(layer_output, hidden_dropout_prob)
        layer_output = modeling.layer_norm(layer_output + attention_output)

      prev_output = layer_output
      all_layer_outputs.append(layer_output)

  if not do_return_all_layers:
    return modeling.reshape_from_matrix(prev_output, input_shape)
  return [
      modeling.reshape_from_matrix(layer_output, input_shape)
      for layer_output in all_layer_outputs
  ]
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator.

  Builds start/end/answer-type logits with `create_model`, optionally
  initializes variables from `init_checkpoint`, and returns a
  `TPUEstimatorSpec` for the TRAIN or PREDICT mode.

  Args:
    features: dict of input tensors. Reads "unique_ids", "input_ids",
      "input_mask" and "segment_ids"; in TRAIN mode also "start_positions",
      "end_positions" and "answer_types".
    labels: unused.
    mode: a `tf.estimator.ModeKeys` value.
    params: unused.

  Returns:
    A `TPUEstimatorSpec` carrying the loss and train op (TRAIN) or the
    predictions dict (PREDICT).

  Raises:
    ValueError: if `mode` is neither TRAIN nor PREDICT.
  """
  logging.info("*** Features ***")
  for feature_name in sorted(features.keys()):
    logging.info(" name = %s, shape = %s", feature_name,
                 features[feature_name].shape)

  unique_ids = features["unique_ids"]
  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  segment_ids = features["segment_ids"]

  training = mode == tf.estimator.ModeKeys.TRAIN
  start_logits, end_logits, answer_type_logits = create_model(
      bert_config=bert_config,
      is_training=training,
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  trainable_vars = tf.trainable_variables()
  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    assignment_map, initialized_variable_names = (
        bert_modeling.get_assignment_map_from_checkpoint(
            trainable_vars, init_checkpoint))
    if not use_tpu:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    else:
      # On TPU the checkpoint restore is deferred to the scaffold function.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold

  logging.info("**** Trainable Variables ****")
  for variable in trainable_vars:
    ckpt_marker = (", *INIT_FROM_CKPT*"
                   if variable.name in initialized_variable_names else "")
    logging.info(" name = %s, shape = %s%s", variable.name, variable.shape,
                 ckpt_marker)

  if mode == tf.estimator.ModeKeys.TRAIN:
    seq_length = bert_modeling.get_shape_list(input_ids)[1]

    def mean_cross_entropy(logits, targets, depth):
      """Mean over the batch of the negative log-probability of `targets`."""
      one_hot_targets = tf.one_hot(targets, depth=depth, dtype=tf.float32)
      log_probs = tf.nn.log_softmax(logits, axis=-1)
      per_example = tf.reduce_sum(one_hot_targets * log_probs, axis=-1)
      return -tf.reduce_mean(per_example)

    start_positions = features["start_positions"]
    end_positions = features["end_positions"]
    answer_types = features["answer_types"]

    # Position losses are over sequence positions; the label loss is over the
    # members of `data.AnswerType`.
    start_loss = mean_cross_entropy(start_logits, start_positions, seq_length)
    end_loss = mean_cross_entropy(end_logits, end_positions, seq_length)
    answer_type_loss = mean_cross_entropy(answer_type_logits, answer_types,
                                          len(data.AnswerType))

    total_loss = (start_loss + end_loss + answer_type_loss) / 3.0

    train_op = bert_optimization.create_optimizer(
        total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

    return tf_contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn)

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        "unique_ids": unique_ids,
        "start_logits": start_logits,
        "end_logits": end_logits,
        "answer_type_logits": answer_type_logits,
    }
    return tf_contrib.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)

  raise ValueError("Only TRAIN and PREDICT modes are supported: %s" % mode)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator.

  Builds word logits with `create_model` under the "bert" variable scope. In
  TRAIN mode the loss is the sparse softmax cross entropy between
  `input_ids[:, 2:query_length]` and those logits. When not training, columns
  2 .. query_length - 1 of the ids are filled in one at a time with the argmax
  prediction, re-running `create_model` with reused variables for each step
  inside a `tf.while_loop`.

  Args:
    features: dict of input tensors. Reads "unique_ids", "input_ids" and
      "segment_ids"; in PREDICT mode also "start_positions", "end_positions"
      and "answer_types".
    labels: unused.
    mode: a `tf.estimator.ModeKeys` value.
    params: dict; only `params["batch_size"]` is read.

  Returns:
    A `TPUEstimatorSpec` carrying the loss and train op (TRAIN) or the
    predictions dict (PREDICT).

  Raises:
    ValueError: if `mode` is neither TRAIN nor PREDICT.
  """
  tf.logging.info("*** Features ***")
  for name in sorted(features.keys()):
    # Lazy %-args, consistent with the variable logging further down.
    tf.logging.info(" name = %s, shape = %s", name, features[name].shape)

  unique_ids = features["unique_ids"]
  input_ids = features["input_ids"]
  segment_ids = features["segment_ids"]

  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  seq_length = modeling.get_shape_list(input_ids)[1]
  query_length = FLAGS.max_query_length
  batch_size = params["batch_size"]

  _, attention_mask = make_attention_mask(batch_size, query_length, seq_length)

  def write_predicted_word(ids, logits, position):
    """Adds the argmax word ids from `logits` into column `position` of `ids`.

    Prediction j is shifted to column j + 2 before being masked down to the
    single column `position`, i.e. this implements
    ids[:, position] += argmax(logits)[:, position - 2].
    """
    word_id = tf.argmax(logits, axis=2, output_type=tf.int32)
    # Left-pad by two and right-pad back out to `seq_length` columns.
    word_id = tf.pad(word_id, [[0, 0], [2, seq_length - query_length]])
    # Addition stands in for assignment here.
    # NOTE(review): presumably the target column of `ids` holds 0 — confirm.
    return ids + word_id * tf.one_hot(position, seq_length, dtype=tf.int32)

  with tf.variable_scope("bert") as scope:
    word_logits = create_model(
        bert_config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=attention_mask,
        segment_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
        scope=scope)

  if not is_training:
    with tf.variable_scope("bert", reuse=True) as scope:
      # This operation implements: output_ids[:, 2] = word_id[:, 0]
      output_ids = write_predicted_word(input_ids, word_logits, 2)

      def body(i, ids):
        """A decoding step."""
        step_logits = create_model(
            bert_config=bert_config,
            is_training=is_training,
            input_ids=ids,
            input_mask=attention_mask,
            segment_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
            scope=scope)
        # This operation implements: output_ids[:, 1 + i] = word_id[:, i - 1]
        return [i + 1, write_predicted_word(ids, step_logits, i + 1)]

      def keep_decoding(i, _):
        """Continues until column query_length - 1 has been written."""
        return i < query_length - 1

      i0 = tf.constant(2)
      _, output_ids = tf.while_loop(
          keep_decoding, body, loop_vars=[i0, output_ids])

  tvars = tf.trainable_variables()

  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    (assignment_map, initialized_variable_names
    ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    if use_tpu:
      # On TPU the checkpoint restore is deferred to the scaffold function.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  tf.logging.info("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                    init_string)

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    # Computes the loss for word prediction.
    loss = tf.losses.sparse_softmax_cross_entropy(
        input_ids[:, 2:query_length],
        word_logits,
        reduction=tf.losses.Reduction.MEAN)

    train_op = optimization.create_optimizer(loss, learning_rate,
                                             num_train_steps,
                                             num_warmup_steps, use_tpu)

    output_spec = tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode, loss=loss, train_op=train_op, scaffold_fn=scaffold_fn)
  elif mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        "unique_ids": tf.identity(unique_ids),
        "input_ids": output_ids,
        "segment_ids": tf.minimum(segment_ids, 1),
        # Mask is 1 wherever the decoded ids are non-zero.
        "input_mask": tf.cast(tf.not_equal(output_ids, 0), tf.int32),
        "start_positions": tf.identity(features["start_positions"]),
        "end_positions": tf.identity(features["end_positions"]),
        "answer_types": tf.identity(features["answer_types"])
    }
    output_spec = tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
  else:
    raise ValueError("Only TRAIN and PREDICT modes are supported: %s" % (mode))

  return output_spec