def _update_decoded_example(decoded_example, options):
  """Converts tokens to ids and regroups the fields of a decoded example.

  The detection fields, every answer / rationale choice and the question are
  popped from `decoded_example`, wrapped into `Detections` / `MixedSequence`
  objects and written back as the dictionaries returned by `to_dict()`. The
  image height and width are added too.

  Args:
    decoded_example: A tensor dictionary keyed by name (modified in place).
    options: An instance of reader_pb2.Reader.

  Returns:
    decoded_example: The same instance with content modified.
  """
  word_to_id = token_to_id.TokenToIdLayer(options.vocab_file,
                                          options.out_of_vocabulary_token_id)
  # Detection class names use a separate vocabulary file; 0 is passed as the
  # second (out-of-vocabulary id) argument.
  class_to_id = token_to_id.TokenToIdLayer(options.detection_vocab_file, 0)

  # Image size and bounding boxes.
  img_shape = tf.shape(decoded_example['img_data'])
  boxes = decoded_example.pop('detection_boxes')
  class_ids = class_to_id(decoded_example.pop('detection_classes'))
  scores = decoded_example.pop('detection_scores')
  detections = Detections(boxes, class_ids, scores)
  decoded_example.update({
      'img_height': img_shape[0],
      'img_width': img_shape[1],
      'detections': detections.to_dict(),
  })

  # Answer and rationale choices (0-indexed feature names).
  for choice_index in range(NUM_CHOICES):
    answer_key = 'answer_choice_%i' % choice_index
    rationale_key = 'rationale_choice_%i' % choice_index

    answer_ids = word_to_id(decoded_example.pop(answer_key))
    answer_tags = decoded_example.pop('answer_choice_tag_%i' % choice_index)
    answer = MixedSequence(answer_ids, answer_tags)

    rationale_ids = word_to_id(decoded_example.pop(rationale_key))
    rationale_tags = decoded_example.pop('rationale_choice_tag_%i' %
                                         choice_index)
    rationale = MixedSequence(rationale_ids, rationale_tags)

    # The popped token features are replaced by the mixed-sequence dicts
    # stored under the same names.
    decoded_example.update({
        answer_key: answer.to_dict(),
        rationale_key: rationale.to_dict(),
    })

  # Question.
  question_ids = word_to_id(decoded_example.pop('question'))
  question_tags = decoded_example.pop('question_tag')
  question = MixedSequence(question_ids, question_tags)
  decoded_example.update({'question': question.to_dict()})
  return decoded_example
def initialize(options, dt):
  """Populates `dt` with vocabulary, embedding and bias members.

  Args:
    options: A model_pb2.Cap2SGPreprocess proto.
    dt: A DataTuple instance; its attributes are set in place.

  Returns:
    The same `dt` instance.

  Raises:
    ValueError: If `options` or `dt` has an unexpected type.
  """
  if not isinstance(options, model_pb2.Cap2SGPreprocess):
    raise ValueError('Options has to be a Cap2SGPreprocess proto.')
  if not isinstance(dt, DataTuple):
    raise ValueError('Invalid DataTuple object.')

  # GloVe data, then the vocabulary mappings derived from it.
  glove_dict = _load_glove_data(options.glove_vocabulary_file,
                                options.glove_embedding_file)
  vocab_mappings = _read_vocabulary(options.vocabulary_file, glove_dict,
                                    options.minimum_frequency)
  token2id, id2token = vocab_mappings

  dt.vocab_size = len(token2id)
  dt.token2id_func = token_to_id.TokenToIdLayer(token2id, oov_id=0)
  dt.id2token_func = id_to_token.IdToTokenLayer(id2token, oov='OOV')

  # Word embeddings: a variable when trainable, a constant otherwise. Both
  # are initialized from the same GloVe-derived value.
  dt.dims = options.embedding_dims
  initial_embeddings = _initialize_from_glove(glove_dict, token2id, dt.dims)
  if options.embedding_trainable:
    dt.embeddings = tf.get_variable('embeddings',
                                    initializer=initial_embeddings,
                                    trainable=options.embedding_trainable)
  else:
    dt.embeddings = tf.constant(initial_embeddings, name='embeddings')

  def _lookup(x):
    # Reads `dt.embeddings` at call time, as the original closure did.
    return tf.nn.embedding_lookup(dt.embeddings, x)

  dt.embedding_func = _lookup

  # Class biases.
  biases = _initialize_biases(dt.embeddings, options.bias_mode)
  dt.bias_entity, dt.bias_attribute, dt.bias_relation = biases
  return dt
def __init__(self, model_proto, is_training):
  """Builds the token-to-id layer, BERT config and input-field selection.

  Args:
    model_proto: A model_pb2.FinetuneCC proto.
    is_training: Forwarded to the base class and to the hyperparams builder.

  Raises:
    ValueError: If `model_proto` is not a model_pb2.FinetuneCC instance.
  """
  super(FinetuneCC, self).__init__(model_proto, is_training)

  if not isinstance(model_proto, model_pb2.FinetuneCC):
    raise ValueError('Options has to be an FinetuneCC proto.')

  self._token_to_id_func = token_to_id.TokenToIdLayer(
      model_proto.bert_vocab_file, model_proto.bert_unk_token_id)
  self._bert_config = BertConfig.from_json_file(model_proto.bert_config_file)

  fc_scope_fn = hyperparams.build_hyperparams(model_proto.fc_hyperparams,
                                              is_training)
  self._slim_fc_scope = fc_scope_fn()

  # Pick the rationale or the answer variant of every input field.
  if model_proto.rationale_model:
    (self._field_label, self._field_choices, self._field_choices_tag,
     self._field_choices_len) = (
         InputFields.rationale_label,
         InputFields.rationale_choices_with_question,
         InputFields.rationale_choices_with_question_tag,
         InputFields.rationale_choices_with_question_len)
  else:
    (self._field_label, self._field_choices, self._field_choices_tag,
     self._field_choices_len) = (
         InputFields.answer_label,
         InputFields.answer_choices_with_question,
         InputFields.answer_choices_with_question_tag,
         InputFields.answer_choices_with_question_len)
def _get_class_embedding_vectors(label,
                                 vocab_file,
                                 vocab_size,
                                 embedding_dims=300,
                                 scope='object_embedding',
                                 max_norm=None):
  """Looks up trainable embedding vectors for class labels.

  Args:
    label: A string tensor.
    vocab_file: Path to the vocabulary file.
    vocab_size: Size of the vocabulary; also used as the unknown-token id, so
      the embedding table has `vocab_size + 1` rows.
    embedding_dims: Dimensions of the embedding vectors.
    scope: Variable scope under which the `weights` variable is created.
    max_norm: Forwarded to `tf.nn.embedding_lookup`.

  Returns:
    label_embedding: Embedding of the label.
  """
  to_id_layer = token_to_id.TokenToIdLayer(vocab_file,
                                           unk_token_id=vocab_size)
  ids = to_id_layer(label)

  table_shape = [vocab_size + 1, embedding_dims]
  with tf.variable_scope(scope):
    table = tf.get_variable('weights', shape=table_shape, trainable=True)

  return tf.nn.embedding_lookup(table, ids, max_norm=max_norm)
def _get_class_embedding_vectors(label,
                                 vocab_file,
                                 embeddings_index,
                                 init_width=0.03):
  """Looks up label embeddings from a table built by `_create_embedding_matrix`.

  One extra, uniformly initialized row is appended to the matrix and used for
  out-of-vocabulary labels.

  Args:
    label: A string tensor.
    vocab_file: Path to the vocabulary file.
    embeddings_index: Forwarded to `_create_embedding_matrix`.
    init_width: Half-width of the uniform range used for the extra row; also
      forwarded to `_create_embedding_matrix`.

  Returns:
    label_embedding: Embedding of the label.
  """
  base_matrix = _create_embedding_matrix(embeddings_index, vocab_file,
                                         init_width)
  # The row count of the base matrix is the index of the appended row, which
  # is why it doubles as the unknown-token id.
  unk_token_id, embedding_dims = base_matrix.shape

  unk_row = np.random.uniform(-init_width, init_width, (1, embedding_dims))
  unk_row = unk_row.astype(np.float32)
  full_matrix = np.concatenate([base_matrix, unk_row])

  table = tf.get_variable('object/embedding',
                          initializer=full_matrix,
                          trainable=True)
  to_id_layer = token_to_id.TokenToIdLayer(vocab_file,
                                           unk_token_id=unk_token_id)
  ids = to_id_layer(label)
  return tf.nn.embedding_lookup(table, ids, max_norm=_MAX_NORM)
def predict(self, inputs, **kwargs):
  """Predicts the resulting tensors.

  Builds the answer logits through `self._predict_logits` and then registers
  the BERT checkpoint for variable initialization.

  Args:
    inputs: A dictionary of input tensors keyed by names.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training

  # Model components.
  to_id_layer = token_to_id.TokenToIdLayer(options.bert_vocab_file,
                                           options.bert_unk_token_id)
  config = BertConfig.from_json_file(options.bert_config_file)
  fc_scope_fn = hyperparams.build_hyperparams(options.fc_hyperparams,
                                              is_training)
  fc_scope = fc_scope_fn()

  # Prediction.
  choices = inputs[self._field_answer_choices]
  choices_len = inputs[self._field_answer_choices_len]
  logits = self._predict_logits(choices, choices_len, to_id_layer, config,
                                fc_scope, options.dropout_keep_prob,
                                is_training)

  # Restore from checkpoint. This runs after the model is built so that
  # `tf.global_variables()` already contains the model variables.
  checkpoint_file = options.bert_checkpoint_file
  assignment_map, _ = get_assignment_map_from_checkpoint(
      tf.global_variables(), checkpoint_file)
  tf.compat.v1.train.init_from_checkpoint(options.bert_checkpoint_file,
                                          assignment_map)

  predictions = {FIELD_ANSWER_PREDICTION: logits}
  return predictions
def test_token_to_id(self):
  """Checks dict-backed lookup; unknown tokens map to the id 97."""
  layer = token_to_id.TokenToIdLayer({'hello': 5, 'world': 11}, 97)

  # (input tokens, expected ids) pairs.
  cases = [
      (['hello', ',', 'world', '!'], [5, 97, 11, 97]),
      (['hell', ',', 'world', '!!'], [97, 97, 11, 97]),
  ]
  for tokens, expected_ids in cases:
    actual_ids = layer(tf.convert_to_tensor(tokens))
    self.assertAllEqual(actual_ids, expected_ids)
def predict(self, inputs, **kwargs):
  """Predicts the resulting tensors.

  Scores every answer choice with a single dense unit on top of the pooled
  BERT output.

  Args:
    inputs: A dictionary of input tensors keyed by names.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training

  choices = inputs[InputFields.answer_choices_with_question]
  choices_len = inputs[InputFields.answer_choices_with_question_len]
  # Not used below; the lookup is kept so a missing label still fails here.
  answer_label = inputs[InputFields.answer_label]

  # Create model layers.
  to_id_layer = token_to_id.TokenToIdLayer(options.bert_vocab_file,
                                           options.bert_unk_token_id)

  # Convert tokens into token ids, folding the choice axis into the batch.
  batch_size = choices.shape[0]
  flat_shape = [batch_size * NUM_CHOICES, -1]
  token_ids = to_id_layer(choices)
  flat_token_ids = tf.reshape(token_ids, flat_shape)

  max_len = tf.shape(choices)[-1]
  mask = tf.sequence_mask(choices_len, maxlen=max_len)
  flat_mask = tf.reshape(mask, flat_shape)

  # Bert prediction.
  config = BertConfig.from_json_file(options.bert_config_file)
  bert = BertModel(config,
                   is_training,
                   input_ids=flat_token_ids,
                   input_mask=flat_mask)
  flat_cls_feature = bert.get_pooled_output()
  cls_feature = tf.reshape(flat_cls_feature, [batch_size, NUM_CHOICES, -1])

  # Register the checkpoint once the BERT variables exist.
  assignment_map, _ = get_assignment_map_from_checkpoint(
      tf.global_variables(), options.bert_checkpoint_file)
  tf.compat.v1.train.init_from_checkpoint(options.bert_checkpoint_file,
                                          assignment_map)

  # Classification layer: one logit per choice.
  logits = tf.compat.v1.layers.dense(cls_feature, units=1, activation=None)
  logits = tf.squeeze(logits, axis=-1)
  return {FIELD_ANSWER_PREDICTION: logits}
def test_token_to_id_bert(self):
  """Checks lookup against the BERT vocab file; unknown tokens map to 100."""
  layer = token_to_id.TokenToIdLayer(
      'data/bert/keras/cased_L-12_H-768_A-12/vocab.txt', unk_token_id=100)

  # (input tokens, expected ids) pairs.
  cases = [
      (['hello', ',', 'world', '!'], [19082, 117, 1362, 106]),
      (['hello', ',', 'world', '!!'], [19082, 117, 1362, 100]),
  ]
  for tokens, expected_ids in cases:
    actual_ids = layer(tf.convert_to_tensor(tokens))
    self.assertAllEqual(actual_ids, expected_ids)
def test_token_to_id_2d(self):
  """Checks file-backed lookup on a 2-D string tensor.

  The vocabulary file comes from `self._create_temp_vocab_file()`. It is
  removed in a `finally` clause so a failing assertion does not leave the
  temporary file behind.
  """
  vocab_file = self._create_temp_vocab_file()
  try:
    test_layer = token_to_id.TokenToIdLayer(vocab_file, unk_token_id=4)
    output = test_layer(
        tf.convert_to_tensor([['hello', ',', 'world', '!'],
                              ['hell', ',', 'world', '!!']]))
    # Tokens absent from the vocabulary map to the unknown id 4.
    self.assertAllEqual(output, [[2, 6, 3, 7], [4, 6, 3, 4]])
  finally:
    os.unlink(vocab_file)
  self.assertFalse(os.path.exists(vocab_file))
def predict(self, inputs, **kwargs):
  """Predicts the resulting tensors.

  Embeds the answer choices with a GloVe-initialized table, encodes them with
  `rnn.RNN` and scores every choice with a single dense unit.

  Args:
    inputs: A dictionary of input tensors keyed by names.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training

  choices = inputs[InputFields.answer_choices_with_question]
  choices_len = inputs[InputFields.answer_choices_with_question_len]
  # Not used below; the lookup is kept so a missing label still fails here.
  answer_label = inputs[InputFields.answer_label]

  batch_size = choices.shape[0]
  num_sequences = batch_size * NUM_CHOICES

  # Convert tokens to ids.
  to_id_layer = token_to_id.TokenToIdLayer(options.vocab_file,
                                           options.unk_token_id)
  token_ids = to_id_layer(choices)
  flat_token_ids = tf.reshape(token_ids, [num_sequences, -1])

  # Convert word ids to embedding vectors.
  glove_matrix = create_embedding_matrix(options.glove_file,
                                         options.vocab_file)
  embedding_table = tf.get_variable('word/embedding',
                                    initializer=glove_matrix,
                                    trainable=True)
  flat_embs = tf.nn.embedding_lookup(embedding_table,
                                     flat_token_ids,
                                     max_norm=None)

  # Encode the sequence using BiLSTM model.
  with tf.variable_scope('answer_choice_encoder'):
    flat_lengths = tf.reshape(choices_len, [num_sequences])
    _, flat_feature = rnn.RNN(flat_embs,
                              flat_lengths,
                              options.rnn_config,
                              is_training=is_training)
  feature = tf.reshape(flat_feature, [batch_size, NUM_CHOICES, -1])

  # Classification layer: one logit per choice.
  logits = tf.compat.v1.layers.dense(feature, units=1, activation=None)
  logits = tf.squeeze(logits, axis=-1)
  return {FIELD_ANSWER_PREDICTION: logits}
def test_token_to_id_2d(self):
  """Checks dict-backed lookup on 2-D inputs; unknown tokens map to 4."""
  vocab = {'one': 2, 'world': 3, 'dream': 5}
  layer = token_to_id.TokenToIdLayer(vocab, 4)

  # (input tokens, expected ids) pairs.
  cases = [
      ([['hello', ',', 'world', '!'], ['hell', ',', 'world', '!!']],
       [[4, 4, 3, 4], [4, 4, 3, 4]]),
      ([['one', 'world', 'one', 'dream'], ['one', 'word', 'one', 'dream']],
       [[2, 3, 2, 5], [2, 4, 2, 5]]),
  ]
  for tokens, expected_ids in cases:
    actual_ids = layer(tf.convert_to_tensor(tokens))
    self.assertAllEqual(actual_ids, expected_ids)
def test_masked_lm(self):
  """Smoke test: builds a BERT pretrainer and restores the encoder weights.

  The test makes no assertions; it only checks that the model can be built,
  that the checkpoint can be restored in a session and that the trainable
  variables can be fetched.
  """
  example_sentence = [
      'alice', 'became', '[MASK]', 'after', 'felt', 'left', 'out', 'by', 'her',
      'friends.'
  ]
  num_token_predictions = 1
  bert_unk_token_id = 100
  bert_dir = 'data/bert/keras/cased_L-12_H-768_A-12'
  bert_vocab_file = "{}/vocab.txt".format(bert_dir)
  bert_config_file = "{}/bert_config.json".format(bert_dir)
  bert_checkpoint_file = "{}/bert_model.ckpt".format(bert_dir)
  num_classes = 2
  sequence_length = len(example_sentence)

  # `vocab`, `token_to_id_layer` and `pretrainer_model` are not read again,
  # but creating them loads files / builds graph objects, so the calls are
  # kept as part of the smoke test.
  vocab = load_vocab(bert_vocab_file)
  token_to_id_layer = token_to_id.TokenToIdLayer(bert_vocab_file,
                                                 bert_unk_token_id)
  bert_config = BertConfig.from_json_file(bert_config_file)
  transformer_encoder = get_transformer_encoder(bert_config, sequence_length)
  pretrainer_model = bert_pretrainer.BertPretrainer(
      network=transformer_encoder,
      num_classes=num_classes,
      num_token_predictions=num_token_predictions,
      output='predictions')

  checkpoint = tf.train.Checkpoint(model=transformer_encoder)
  status = checkpoint.restore(bert_checkpoint_file)
  with tf.compat.v1.Session() as sess:
    status.initialize_or_restore(sess)
    values = sess.run(transformer_encoder.trainable_variables)
    print(values[-1])
def _predict(self, inputs, **kwargs):
  """Predicts the resulting tensors.

  Embeds the question and answer tokens, grounds their tags on projected
  object features, runs `self._recognition_to_cognition` and scores every
  answer choice with a two-layer classifier.

  Args:
    inputs: A dictionary of input tensors keyed by names.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training

  to_id_layer = token_to_id.TokenToIdLayer(options.embedding_vocab_file,
                                           options.embedding_unk_token_id)
  predictions = {}

  # Extract text annotations.
  question = inputs[InputFields.question]
  question_len = inputs[InputFields.question_len]
  answer = inputs[InputFields.answer_choices]
  answer_len = inputs[InputFields.answer_choices_len]
  batch_size = question.shape[0]

  # Convert word to embedding vectors.
  question_ids = to_id_layer(question)
  answer_ids = to_id_layer(answer)
  glove_matrix = _create_embedding_matrix(options.embedding_glove_file,
                                          options.embedding_vocab_file)
  embedding_table = tf.get_variable('word/embedding',
                                    initializer=glove_matrix,
                                    trainable=True)
  question_embs = tf.nn.embedding_lookup(embedding_table,
                                         question_ids,
                                         max_norm=None)
  answer_embs = tf.nn.embedding_lookup(embedding_table,
                                       answer_ids,
                                       max_norm=None)

  # Trim lengths of the object arrays to `max_num_objects`.
  trimmed = _trim_to_max_num_objects(
      inputs[InputFields.num_objects],
      inputs[InputFields.object_bboxes],
      inputs[InputFields.object_labels],
      inputs[InputFields.object_scores],
      inputs[InputFields.object_features],
      max_num_objects=options.max_num_objects)
  # The trimmed boxes and scores are not used further in this method.
  (num_objects, object_bboxes, object_labels, object_scores, object_features,
   max_num_objects) = trimmed

  question_tags = _assign_invalid_tags(inputs[InputFields.question_tag],
                                       max_num_objects)
  answer_tags = _assign_invalid_tags(inputs[InputFields.answer_choices_tag],
                                     max_num_objects)

  # Merge class label embeddings to the Fast-RCNN features.
  label_embs = _get_class_embedding_vectors(object_labels, options.label_file,
                                            options.label_vocab_size,
                                            options.label_embedding_dims)
  object_features = _project_object_features(
      object_features,
      label_embs,
      output_dims=options.visual_feature_dims,
      dropout_keep_prob=options.dropout_keep_prob,
      is_training=is_training)

  # Reshape answer-related tensors
  # to the shape of [batch_size * NUM_CHOICES, max_seq_len, ...].
  reshaped = _reshape_answer_related_tensors(question_embs, question_tags,
                                             question_len, answer_embs,
                                             answer_tags, answer_len)
  (question_embs, question_tags, question_len, answer_embs, answer_tags,
   answer_len) = reshaped

  # Ground both the question and the answer choices.
  question_grounded = _ground_tag_using_object_features(
      object_features, question_tags)
  answer_grounded = _ground_tag_using_object_features(object_features,
                                                      answer_tags)
  question_rnn_inputs = tf.concat([question_embs, question_grounded], -1)
  answer_rnn_inputs = tf.concat([answer_embs, answer_grounded], -1)

  # Build the recognition to cognition model. `predictions` is handed over
  # as well; the second return value is not used here.
  final_features, answer_seq_features = self._recognition_to_cognition(
      question_rnn_inputs, question_len, answer_rnn_inputs, answer_len,
      object_features, num_objects, predictions)

  # Compute the joint representation.
  with tf.variable_scope('classification'):
    with tf.variable_scope('hidden'):
      hidden = tf.contrib.layers.fully_connected(final_features,
                                                 num_outputs=1024,
                                                 activation_fn=tf.nn.relu)
      hidden = tf.contrib.layers.dropout(
          hidden,
          keep_prob=options.dropout_keep_prob,
          is_training=is_training)
    with tf.variable_scope('output'):
      logits = tf.contrib.layers.fully_connected(hidden,
                                                 num_outputs=1,
                                                 activation_fn=None)
      logits = tf.reshape(logits, [batch_size, NUM_CHOICES])

  predictions.update({
      FIELD_ANSWER_PREDICTION: logits,
      'image_id': inputs[InputFields.img_id]
  })
  return predictions
def predict(self, inputs, **kwargs):
  """Predicts the resulting tensors.

  Concatenates the answer-choice tokens with the object class labels, pairs
  them with tag-grounded / raw object features and feeds the result to
  `self._bert_model`. A single fully connected unit scores each choice.

  Args:
    inputs: A dictionary of input tensors keyed by names.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training
  fc_scope_fn = hyperparams.build_hyperparams(options.fc_hyperparams,
                                              is_training)

  choices = inputs[InputFields.answer_choices_with_question]
  choices_tag = inputs[InputFields.answer_choices_with_question_tag]
  choices_len = inputs[InputFields.answer_choices_with_question_len]
  # Not used below; the lookup is kept so a missing label still fails here.
  answer_label = inputs[InputFields.answer_label]

  batch_size = choices.shape[0]
  flat_shape = [batch_size * NUM_CHOICES, -1]

  def _repeat_per_choice(tensor):
    # Inserts a choice axis at position 1 and repeats the single entry
    # NUM_CHOICES times by gathering index 0 repeatedly.
    return tf.gather(tf.expand_dims(tensor, 1), [0] * NUM_CHOICES, axis=1)

  # Trim lengths of the object arrays to `max_num_objects`.
  trimmed = _trim_to_max_num_objects(
      inputs[InputFields.num_objects],
      inputs[InputFields.object_bboxes],
      inputs[InputFields.object_labels],
      inputs[InputFields.object_scores],
      inputs[InputFields.object_features],
      max_num_objects=options.max_num_objects)
  # The trimmed boxes and scores are not used further in this method.
  (num_objects, object_bboxes, object_labels, object_scores, object_features,
   max_num_objects) = trimmed
  choices_tag = _assign_invalid_tags(choices_tag, max_num_objects)

  # Project the Fast-RCNN features.
  object_features = _project_object_features(
      object_features,
      output_dims=options.visual_feature_dims,
      dropout_keep_prob=options.dropout_keep_prob,
      is_training=is_training)
  object_feature_dims = object_features.shape[-1]

  # Convert tokens into token ids.
  to_id_layer = token_to_id.TokenToIdLayer(options.bert_vocab_file,
                                           options.bert_unk_token_id)
  choice_ids = to_id_layer(choices)
  choice_ids = tf.reshape(choice_ids, flat_shape)
  choice_mask = tf.sequence_mask(choices_len, maxlen=tf.shape(choices)[-1])
  choice_mask = tf.reshape(choice_mask, flat_shape)

  # Create tag features sequence.
  choices_tag = tf.reshape(choices_tag, flat_shape)
  choice_tag_features = _ground_tag_using_object_features(
      object_features, choices_tag)

  # Convert class labels into token ids, tile object features.
  object_mask = tf.sequence_mask(num_objects,
                                 maxlen=tf.shape(object_labels)[-1])
  object_mask = _repeat_per_choice(object_mask)
  object_mask = tf.reshape(object_mask, flat_shape)

  object_label_ids = to_id_layer(object_labels)
  object_label_ids = _repeat_per_choice(object_label_ids)
  object_label_ids = tf.reshape(object_label_ids, flat_shape)

  object_features = _repeat_per_choice(object_features)
  object_features = tf.reshape(
      object_features, [batch_size * NUM_CHOICES, -1, object_feature_dims])

  # Create Bert model. Ids and masks are joined on the last axis, the
  # feature sequences on axis 1.
  input_ids = tf.concat([choice_ids, object_label_ids], -1)
  input_tag_features = tf.concat([choice_tag_features, object_features], 1)
  input_mask = tf.concat([choice_mask, object_mask], -1)
  final_features = self._bert_model(input_ids, input_tag_features,
                                    input_mask)

  # Classification layer.
  with slim.arg_scope(fc_scope_fn()):
    logits = tf.contrib.layers.fully_connected(final_features,
                                               num_outputs=1,
                                               activation_fn=None)
  logits = tf.reshape(logits, [batch_size, NUM_CHOICES])
  return {FIELD_ANSWER_PREDICTION: logits}
def predict(self, inputs, **kwargs):
  """Predicts the resulting tensors.

  Encodes the question and the answer choices with Keras bidirectional LSTMs
  and scores every choice with a single dense unit. When
  `options.text_feature` equals `model_pb2.QUESTION_AND_ANSWER`, the question
  encoder states initialize the answer encoder.

  Args:
    inputs: A dictionary of input tensors keyed by names.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training

  question = inputs[InputFields.question]
  question_len = inputs[InputFields.question_len]
  choices = inputs[InputFields.answer_choices]
  choices_len = inputs[InputFields.answer_choices_len]
  # Not used below; the lookup is kept so a missing label still fails here.
  answer_label = inputs[InputFields.answer_label]

  # Create model layers.
  to_id_layer = token_to_id.TokenToIdLayer(options.vocab_file,
                                           options.unk_token_id)
  if options.glove_file:
    embeddings_initializer = tf.keras.initializers.Constant(
        create_embedding_matrix(options.glove_file, options.vocab_file,
                                options.embedding_dims))
  else:
    embeddings_initializer = 'uniform'
  embedding_layer = tf.keras.layers.Embedding(
      options.vocab_size,
      options.embedding_dims,
      embeddings_initializer=embeddings_initializer)

  # The question encoder also returns its states (`return_state=True`).
  question_encoder = tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(options.lstm_units,
                           dropout=options.lstm_dropout,
                           recurrent_dropout=options.lstm_recurrent_dropout,
                           return_state=True),
      name='question_bidirectional')
  answer_encoder = tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(options.lstm_units,
                           dropout=options.lstm_dropout,
                           recurrent_dropout=options.lstm_recurrent_dropout),
      name='answer_bidirectional')

  # Convert tokens into embeddings.
  batch_size = choices.shape[0]
  num_sequences = batch_size * NUM_CHOICES
  question_ids = to_id_layer(question)
  choice_ids = to_id_layer(choices)
  question_embs = embedding_layer(question_ids)
  choice_embs = embedding_layer(choice_ids)

  # Question LSTM encoder.
  question_mask = tf.sequence_mask(question_len,
                                   maxlen=tf.shape(question)[-1])
  question_outputs = question_encoder(question_embs,
                                      mask=question_mask,
                                      training=is_training)
  question_feature = question_outputs[0]
  question_states = question_outputs[1:]

  # Repeat every state once per choice and fold the choice axis into the
  # batch axis.
  tiled_states = []
  for state in question_states:
    repeated = tf.gather(tf.expand_dims(state, axis=1),
                         indices=[0] * NUM_CHOICES,
                         axis=1)
    tiled_states.append(tf.reshape(repeated, [-1, repeated.shape[-1]]))

  # Answer LSTM encoder.
  choice_mask = tf.sequence_mask(choices_len, maxlen=tf.shape(choices)[-1])
  flat_choice_mask = tf.reshape(choice_mask, [num_sequences, -1])
  flat_choice_embs = tf.reshape(choice_embs,
                                [num_sequences, -1, options.embedding_dims])

  if options.text_feature == model_pb2.QUESTION_AND_ANSWER:
    initial_state = tiled_states
  else:
    initial_state = None
  flat_choice_feature = answer_encoder(flat_choice_embs,
                                       mask=flat_choice_mask,
                                       training=is_training,
                                       initial_state=initial_state)
  choice_feature = tf.reshape(flat_choice_feature,
                              [batch_size, NUM_CHOICES, -1])

  # Classification layer: one logit per choice.
  scoring_layer = tf.keras.layers.Dense(1, activation=None)
  logits = scoring_layer(choice_feature)
  logits = tf.squeeze(logits, axis=-1)
  return {FIELD_ANSWER_PREDICTION: logits}
def main(_):
  """Extracts BERT features for this shard's annotations into .npy files.

  One output directory per partition is created under
  `FLAGS.output_bert_feature_dir`. A BERT graph is built and restored from
  `FLAGS.bert_checkpoint_file`. Every annotation whose numeric id falls into
  this shard (`annot_id % FLAGS.num_shards == FLAGS.shard_id`) is then
  processed, unless its output file already exists.

  Args:
    _: Unused positional argument.
  """
  logging.set_verbosity(logging.INFO)

  for i in range(_NUM_PARTITIONS):
    tf.io.gfile.makedirs(
        os.path.join(FLAGS.output_bert_feature_dir, '%02d' % i))

  # Create Bert model.
  bert_tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.bert_vocab_file, do_lower_case=FLAGS.do_lower_case)

  # Bert prediction. The placeholder holds one token sequence; a leading
  # axis of size 1 is added before the lookup and stripped from the outputs.
  input_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
  token_to_id_layer = token_to_id.TokenToIdLayer(FLAGS.bert_vocab_file,
                                                 unk_token_id=UNK)
  bert_config = BertConfig.from_json_file(FLAGS.bert_config_file)
  bert_model = BertModel(bert_config,
                         is_training=False,
                         input_ids=token_to_id_layer(
                             tf.expand_dims(input_placeholder, 0)))
  sequence_output = bert_model.get_sequence_output()[0]
  pooled_output = bert_model.get_pooled_output()[0]
  saver = tf.compat.v1.train.Saver()

  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True
  sess = tf.compat.v1.Session(config=config)
  try:
    sess.run(tf.compat.v1.tables_initializer())
    saver.restore(sess, FLAGS.bert_checkpoint_file)
    for name in sess.run(tf.compat.v1.report_uninitialized_variables()):
      logging.warning('%s is uninitialized!', name)

    def _bert_fn(sequence):
      return sess.run([sequence_output, pooled_output],
                      feed_dict={input_placeholder: sequence})

    # Load annotations.
    annots = _load_annotations(FLAGS.annotations_jsonl_file)
    logging.info('Loaded %i annotations.', len(annots))

    shard_id, num_shards = FLAGS.shard_id, FLAGS.num_shards
    assert 0 <= shard_id < num_shards

    for idx, annot in enumerate(annots):
      if (idx + 1) % 1000 == 0:
        logging.info('On example %i/%i.', idx + 1, len(annots))

      # Only handle annotations that belong to this shard. The original
      # repeated this check after the file-existence test; that second copy
      # could never trigger and has been removed.
      annot_id = int(annot['annot_id'].split('-')[-1])
      if annot_id % num_shards != shard_id:
        continue

      # Check npy file.
      part_id = get_partition_id(annot['annot_id'])
      output_file = os.path.join(FLAGS.output_bert_feature_dir,
                                 '%02d' % part_id, annot['annot_id'] + '.npy')
      if os.path.isfile(output_file):
        logging.info('%s is there.', output_file)
        continue

      # Create the BERT embeddings and store them.
      bert_outputs = _create_bert_embeddings(annot, bert_tokenizer,
                                             FLAGS.do_lower_case, _bert_fn)
      with open(output_file, 'wb') as f:
        np.save(f, bert_outputs)
  finally:
    # Release the session's resources even if extraction fails midway.
    sess.close()

  logging.info('Done')
def _update_decoded_example(decoded_example, options):
  """Pads the choices, builds the mixed sequences and converts tokens to ids.

  The per-choice answer / rationale features are popped from
  `decoded_example` and padded with `_pad_sequences`. They are also joined
  with the question (and, for rationales, the labeled answer) using '[SEP]'
  tokens and -1 tags as separators. All results are written back under
  `InputFields` keys.

  Args:
    decoded_example: A tensor dictionary keyed by name (modified in place).
    options: An instance of reader_pb2.Reader.

  Returns:
    decoded_example: The same instance with content modified.
  """
  token_to_id_func = token_to_id.TokenToIdLayer(
      options.vocab_file, options.out_of_vocabulary_token_id)

  def _pop_choices(field_prefix):
    # Choice features are stored 1-indexed as `<field_prefix>_<i>`.
    return [
        decoded_example.pop(field_prefix + '_%i' % i)
        for i in range(1, 1 + NUM_CHOICES)
    ]

  # Number of objects.
  detection_boxes = decoded_example[InputFields.detection_boxes]
  detection_classes = decoded_example[InputFields.detection_classes]
  num_detections = tf.shape(detection_boxes)[0]

  # Question length.
  question = decoded_example[InputFields.question]
  question_tag = decoded_example[InputFields.question_tag]
  question_len = tf.shape(question)[0]

  # Answer and rationale choices; tags are padded with -1.
  answer_list = _pop_choices(TFExampleFields.answer_choice)
  answer_tag_list = _pop_choices(TFExampleFields.answer_choice_tag)
  (answer_choices, answer_choices_len) = _pad_sequences(answer_list)
  (answer_choices_tag, _) = _pad_sequences(answer_tag_list, -1)

  rationale_list = _pop_choices(TFExampleFields.rationale_choice)
  rationale_tag_list = _pop_choices(TFExampleFields.rationale_choice_tag)
  (rationale_choices, rationale_choices_len) = _pad_sequences(rationale_list)
  (rationale_choices_tag, _) = _pad_sequences(rationale_tag_list, -1)

  # Mixed question -> answer, question-answer -> rationale.
  # The labeled answer is cut back to its unpadded length.
  label = decoded_example[InputFields.answer_label]
  answer_len = answer_choices_len[label]
  answer = answer_choices[label][:answer_len]
  answer_tag = answer_choices_tag[label][:answer_len]

  mixed_answer_list = [
      tf.concat([question, ['[SEP]'], choice], 0) for choice in answer_list
  ]
  mixed_answer_tag_list = [
      tf.concat([question_tag, [-1], choice_tag], 0)
      for choice_tag in answer_tag_list
  ]
  (mixed_answer_choices,
   mixed_answer_choices_len) = _pad_sequences(mixed_answer_list)
  (mixed_answer_choices_tag, _) = _pad_sequences(mixed_answer_tag_list,
                                                 pad=-1)

  mixed_rationale_list = [
      tf.concat([question, ['[SEP]'], answer, ['[SEP]'], choice], 0)
      for choice in rationale_list
  ]
  mixed_rationale_tag_list = [
      tf.concat([question_tag, [-1], answer_tag, [-1], choice_tag], 0)
      for choice_tag in rationale_tag_list
  ]
  (mixed_rationale_choices,
   mixed_rationale_choices_len) = _pad_sequences(mixed_rationale_list)
  (mixed_rationale_choices_tag, _) = _pad_sequences(mixed_rationale_tag_list,
                                                    pad=-1)

  # Image shape.
  image = decoded_example[InputFields.img_data]
  image_shape = tf.shape(image)

  # Question-level fields are repeated NUM_CHOICES times so they line up
  # with the per-choice fields.
  decoded_example.update({
      InputFields.img_data:
          image,
      InputFields.img_height:
          image_shape[0],
      InputFields.img_width:
          image_shape[1],
      InputFields.num_detections:
          num_detections,
      InputFields.detection_classes:
          token_to_id_func(detection_classes),
      InputFields.question:
          tf.tile(tf.expand_dims(token_to_id_func(question), 0),
                  [NUM_CHOICES, 1]),
      InputFields.question_tag:
          tf.tile(tf.expand_dims(question_tag, 0), [NUM_CHOICES, 1]),
      InputFields.question_len:
          tf.tile(tf.expand_dims(question_len, 0), [NUM_CHOICES]),
      InputFields.answer_len:
          tf.tile(tf.expand_dims(answer_len, 0), [NUM_CHOICES]),
      InputFields.answer_choices:
          token_to_id_func(answer_choices),
      InputFields.answer_choices_tag:
          answer_choices_tag,
      InputFields.answer_choices_len:
          answer_choices_len,
      InputFields.rationale_choices:
          token_to_id_func(rationale_choices),
      InputFields.rationale_choices_tag:
          rationale_choices_tag,
      InputFields.rationale_choices_len:
          rationale_choices_len,
      InputFields.mixed_answer_choices:
          token_to_id_func(mixed_answer_choices),
      InputFields.mixed_answer_choices_tag:
          mixed_answer_choices_tag,
      InputFields.mixed_answer_choices_len:
          mixed_answer_choices_len,
      InputFields.mixed_rationale_choices:
          token_to_id_func(mixed_rationale_choices),
      InputFields.mixed_rationale_choices_tag:
          mixed_rationale_choices_tag,
      InputFields.mixed_rationale_choices_len:
          mixed_rationale_choices_len,
  })
  return decoded_example
def predict(self, inputs, **kwargs):
  """Predicts the resulting tensors.

  Runs the answer choices through a transformer encoder restored from
  `options.bert_checkpoint_file`, encodes its sequence output with a
  bidirectional LSTM and scores every choice with a single dense unit. The
  encoder and its restore status are stored on `self`.

  Args:
    inputs: A dictionary of input tensors keyed by names.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training

  choices = inputs[InputFields.answer_choices_with_question]
  choices_len = inputs[InputFields.answer_choices_with_question_len]
  # Not used below; the lookup is kept so a missing label still fails here.
  answer_label = inputs[InputFields.answer_label]

  # Create model layers.
  to_id_layer = token_to_id.TokenToIdLayer(options.bert_vocab_file,
                                           options.bert_unk_token_id)
  bert_config = BertConfig.from_json_file(options.bert_config_file)
  self.transformer_encoder = get_transformer_encoder(bert_config, None)
  checkpoint = tf.train.Checkpoint(model=self.transformer_encoder)
  self.transformer_encoder_load_status = checkpoint.restore(
      options.bert_checkpoint_file)

  lstm_encoder = tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(options.lstm_units,
                           dropout=options.lstm_dropout,
                           recurrent_dropout=options.lstm_recurrent_dropout),
      name='answer_bidirectional')

  # Convert tokens into ids, folding the choice axis into the batch axis.
  batch_size = choices.shape[0]
  flat_shape = [batch_size * NUM_CHOICES, -1]
  token_ids = to_id_layer(choices)
  flat_token_ids = tf.reshape(token_ids, flat_shape)

  # Answer BiLSTM encoder.
  mask = tf.sequence_mask(choices_len, maxlen=tf.shape(choices)[-1])
  flat_mask = tf.reshape(mask, flat_shape)

  # The third encoder input is all zeros with the shape of the ids.
  encoder_inputs = [
      flat_token_ids, flat_mask,
      tf.zeros_like(flat_token_ids, dtype=tf.int32)
  ]
  flat_embs, _ = self.transformer_encoder(encoder_inputs,
                                          training=is_training)
  flat_feature = lstm_encoder(flat_embs,
                              mask=flat_mask,
                              training=is_training)
  feature = tf.reshape(flat_feature, [batch_size, NUM_CHOICES, -1])

  # Classification layer: one logit per choice.
  scoring_layer = tf.keras.layers.Dense(1, activation=None)
  logits = scoring_layer(feature)
  logits = tf.squeeze(logits, axis=-1)
  return {FIELD_ANSWER_PREDICTION: logits}
def predict(self, inputs, **kwargs):
  """Scores each answer choice from a BiLSTM text feature and an image feature.

  The image is summarized by averaging the features of its valid objects. Each
  question-prefixed answer choice is embedded with a GloVe-initialized,
  trainable table and encoded by `rnn.RNN`. The two features are concatenated
  and passed through a 512-unit hidden layer followed by a one-unit output
  layer.

  Args:
    inputs: A dictionary of input tensors keyed by names. Reads
      `InputFields.num_objects`, `InputFields.object_bboxes`,
      `InputFields.object_features`,
      `InputFields.answer_choices_with_question` and
      `InputFields.answer_choices_with_question_len`.
    **kwargs: Not used by this method.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name. It holds
      the per-choice logits, shaped `[batch_size, NUM_CHOICES]`, under
      `FIELD_ANSWER_PREDICTION`.
  """
  is_training = self._is_training
  options = self._model_proto

  num_objects = inputs[InputFields.num_objects]
  object_bboxes = inputs[InputFields.object_bboxes]
  object_features = inputs[InputFields.object_features]
  answer_choices = inputs[InputFields.answer_choices_with_question]
  answer_choices_len = inputs[InputFields.answer_choices_with_question_len]

  batch_size = answer_choices.shape[0]

  # Image feature: masked average over the objects, where the mask marks the
  # first `num_objects` entries of the (padded) object axis as valid.
  object_masks = tf.sequence_mask(num_objects,
                                  tf.shape(object_bboxes)[1],
                                  dtype=tf.float32)
  image_feature = masked_ops.masked_avg_nd(object_features,
                                           object_masks,
                                           dim=1)

  # Convert tokens to ids, one row per (example, choice) pair.
  token_to_id_layer = token_to_id.TokenToIdLayer(options.vocab_file,
                                                 options.unk_token_id)
  answer_choices_token_ids = token_to_id_layer(answer_choices)
  answer_choices_token_ids_reshaped = tf.reshape(
      answer_choices_token_ids, [batch_size * NUM_CHOICES, -1])

  # Convert word ids to embedding vectors.
  glove_embedding_array = create_embedding_matrix(options.glove_file,
                                                  options.vocab_file)
  embedding = tf.get_variable('word/embedding',
                              initializer=glove_embedding_array,
                              trainable=True)
  answer_choices_embs_reshaped = tf.nn.embedding_lookup(
      embedding, answer_choices_token_ids_reshaped, max_norm=None)

  # Encode the sequence using the configured RNN; only the second returned
  # value is used as the per-choice feature.
  with tf.variable_scope('answer_choice_encoder'):
    _, answer_choices_feature_reshaped = rnn.RNN(
        answer_choices_embs_reshaped,
        tf.reshape(answer_choices_len, [batch_size * NUM_CHOICES]),
        options.rnn_config,
        is_training=is_training)
  answer_choices_feature = tf.reshape(answer_choices_feature_reshaped,
                                      [batch_size, NUM_CHOICES, -1])

  # Fuse text and image features. The tile multiples imply `image_feature` is
  # rank 3 (assumes `masked_avg_nd` keeps the averaged axis with size 1 --
  # TODO confirm).
  fused_features = tf.concat([
      answer_choices_feature,
      tf.tile(image_feature, [1, NUM_CHOICES, 1])
  ], -1)

  # Two-layer classifier. The output layer must consume the hidden
  # activations; feeding it the fused features again (as the code previously
  # did) silently discarded the 512-unit layer.
  # NOTE: this changes the output kernel's input size to 512, so checkpoints
  # trained with the old wiring will not restore that layer.
  hidden = tf.compat.v1.layers.dense(fused_features,
                                     units=512,
                                     activation=tf.nn.relu6)
  output = tf.compat.v1.layers.dense(hidden, units=1, activation=None)
  output = tf.squeeze(output, axis=-1)

  return {FIELD_ANSWER_PREDICTION: output}
def predict(self, inputs, **kwargs):
  """Scores each answer choice from separately encoded question and answer.

  Question and answer-choice tokens share one GloVe-initialized, trainable
  embedding table. The question is repeated once per choice, both sequences
  are encoded by `rnn.RNN` under their own variable scopes, and the
  concatenation `[answer, answer * question]` is classified by a two-layer
  fully connected network.

  Args:
    inputs: A dictionary of input tensors keyed by names. Reads
      `InputFields.question`, `InputFields.question_len`,
      `InputFields.answer_choices` and `InputFields.answer_choices_len`.
    **kwargs: Not used by this method.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name. It holds
      the logits, reshaped to `[batch_size, NUM_CHOICES]`, under
      `FIELD_ANSWER_PREDICTION`.
  """
  is_training = self._is_training
  options = self._model_proto

  vocab_lookup = token_to_id.TokenToIdLayer(options.vocab_file,
                                            options.unk_token_id)
  fc_scope_fn = hyperparams.build_hyperparams(options.fc_hyperparams,
                                              is_training)

  # Pull out the text inputs.
  question = inputs[InputFields.question]
  question_len = inputs[InputFields.question_len]
  answer_choices = inputs[InputFields.answer_choices]
  answer_choices_len = inputs[InputFields.answer_choices_len]

  batch_size = answer_choices.shape[0]
  num_rows = batch_size * NUM_CHOICES  # One row per (example, choice).

  # Tokens -> ids; answer choices are flattened to one row per choice.
  question_ids = vocab_lookup(question)
  answer_ids = tf.reshape(vocab_lookup(answer_choices), [num_rows, -1])
  answer_len = tf.reshape(answer_choices_len, [num_rows])

  # Ids -> embedding vectors, using a table initialized from GloVe.
  embedding = tf.get_variable(
      'word/embedding',
      initializer=create_embedding_matrix(options.glove_file,
                                          options.vocab_file),
      trainable=True)

  def embed(token_ids):
    return tf.nn.embedding_lookup(embedding, token_ids, max_norm=None)

  question_embs = embed(question_ids)
  answer_embs = embed(answer_ids)

  # Repeat the question for every choice by gathering index 0 of a freshly
  # inserted axis NUM_CHOICES times, then fold that axis into the rows.
  repeat_index = [0] * NUM_CHOICES
  question_embs = tf.gather(tf.expand_dims(question_embs, 1),
                            repeat_index,
                            axis=1)
  question_embs = tf.reshape(question_embs,
                             [num_rows, -1, question_embs.shape[-1]])
  question_len = tf.gather(tf.expand_dims(question_len, 1),
                           repeat_index,
                           axis=1)
  question_len = tf.reshape(question_len, [num_rows])

  # Encode both sequences; only the second value returned by `rnn.RNN` is
  # used as the sequence feature.
  with tf.variable_scope('question_encoder'):
    _, question_features = rnn.RNN(question_embs,
                                   question_len,
                                   options.rnn_config,
                                   is_training=is_training)
  with tf.variable_scope('answer_choice_encoder'):
    _, answer_features = rnn.RNN(answer_embs,
                                 answer_len,
                                 options.rnn_config,
                                 is_training=is_training)

  fused = tf.concat([answer_features, answer_features * question_features],
                    axis=-1)

  # MLP classifier.
  with slim.arg_scope(fc_scope_fn()), tf.variable_scope('classification'):
    with tf.variable_scope('hidden'):
      hidden = tf.contrib.layers.fully_connected(fused,
                                                 num_outputs=1024,
                                                 activation_fn=tf.nn.relu)
      hidden = tf.contrib.layers.dropout(
          hidden,
          keep_prob=options.dropout_keep_prob,
          is_training=is_training)
    with tf.variable_scope('output'):
      logits = tf.contrib.layers.fully_connected(hidden,
                                                 num_outputs=1,
                                                 activation_fn=None)
      logits = tf.reshape(logits, [batch_size, NUM_CHOICES])

  return {
      FIELD_ANSWER_PREDICTION: logits,
  }
def _predict(self, inputs, **kwargs):
  """Builds the original and the two adversarially masked predictions.

  Three forward passes of `self._predict_answer` are created over shared
  variables (the second and third run under `reuse=True`):
    0. the unmodified inputs;
    1. answer embeddings multiplied by a gradient-stopped shortcut mask, so
       only the answer model receives gradients;
    2. gradient-stopped inputs multiplied by the live shortcut mask, so only
       the mask branch receives gradients.

  Args:
    inputs: A dictionary of input tensors keyed by names. Reads the question
      and answer-choice tokens, lengths and tags, the object fields
      (`num_objects`, `object_bboxes`, `object_labels`, `object_scores`,
      `object_features`) and `InputFields.img_id`.
    **kwargs: Not used by this method.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name. Besides
      whatever `self._adversarial_masking` stores in it, it contains
      'temperature', 'image_id', 'question', 'answer_choices',
      'shortcut_mask' and the three logit tensors (each reshaped to
      `[batch_size, NUM_CHOICES]`) under `FIELD_ANSWER_PREDICTION_ORI`,
      `FIELD_ANSWER_PREDICTION_ADV_R2C` and
      `FIELD_ANSWER_PREDICTION_ADV_MASK`.
  """
  options = self._model_proto
  is_training = self._is_training

  vocab_lookup = token_to_id.TokenToIdLayer(options.embedding_vocab_file,
                                            options.embedding_unk_token_id)

  predictions = {}

  # Text annotations: lengths plus token ids.
  question_len = inputs[InputFields.question_len]
  question_token_ids = vocab_lookup(inputs[InputFields.question])
  answer_len = inputs[InputFields.answer_choices_len]
  answer_token_ids = vocab_lookup(inputs[InputFields.answer_choices])

  batch_size = question_token_ids.shape[0]

  # GloVe vectors keyed by word.
  glove_index = _load_embeddings(options.embedding_glove_file)
  # Not used below; retained so the lookup behaves exactly as before.
  embedding_dims = glove_index['the'].shape[-1]

  # Trainable embedding table initialized from GloVe.
  embedding = tf.get_variable(
      'word/embedding',
      initializer=_create_embedding_matrix(glove_index,
                                           options.embedding_vocab_file),
      trainable=True)

  def embed(token_ids):
    return tf.nn.embedding_lookup(embedding, token_ids, max_norm=_MAX_NORM)

  question_embs = embed(question_token_ids)
  answer_embs = embed(answer_token_ids)

  # Trim the object arrays to `max_num_objects`; boxes and scores are
  # returned but not needed afterwards.
  (num_objects, _, object_labels, _, object_features,
   max_num_objects) = _trim_to_max_num_objects(
       inputs[InputFields.num_objects],
       inputs[InputFields.object_bboxes],
       inputs[InputFields.object_labels],
       inputs[InputFields.object_scores],
       inputs[InputFields.object_features],
       max_num_objects=options.max_num_objects)

  question_tags = _assign_invalid_tags(inputs[InputFields.question_tag],
                                       max_num_objects)
  answer_tags = _assign_invalid_tags(inputs[InputFields.answer_choices_tag],
                                     max_num_objects)

  # Merge class label embeddings into the Fast-RCNN features.
  class_embeddings = _get_class_embedding_vectors(object_labels,
                                                  options.label_file,
                                                  glove_index)
  object_features = _project_object_features(
      object_features,
      object_embeddings=class_embeddings,
      output_dims=options.visual_feature_dims,
      dropout_keep_prob=options.dropout_keep_prob,
      is_training=is_training)

  # Reshape answer-related tensors to
  # [batch_size * NUM_CHOICES, max_seq_len, ...].
  (question_embs, question_tags, question_len, answer_embs, answer_tags,
   answer_len) = _reshape_answer_related_tensors(question_embs,
                                                 question_tags,
                                                 question_len, answer_embs,
                                                 answer_tags, answer_len)

  freeze = tf.stop_gradient

  # Adversarial masking. The masking branch only sees gradient-stopped text
  # inputs. `predictions` is handed over as well (presumably so the helper
  # can record extra tensors in it -- TODO confirm).
  # NOTE: a disabled earlier variant fed this branch from a dedicated
  # 'word/adv_embedding' table instead of the shared embeddings.
  with tf.variable_scope('adversarial_masking'):
    shortcut_mask, temperature = self._adversarial_masking(
        freeze(question_embs), freeze(question_len), freeze(answer_embs),
        freeze(answer_len), is_training, predictions)

  # Ground both the question and the answer choices in the object features.
  question_object_features = _ground_tag_using_object_features(
      object_features, question_tags)
  answer_object_features = _ground_tag_using_object_features(
      object_features, answer_tags)

  # 0. Original prediction.
  logits_ori = self._predict_answer(question_embs, question_object_features,
                                    question_len, answer_embs,
                                    answer_object_features, answer_len,
                                    object_features, num_objects)

  with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    # 1. Optimize the R2C model under attack: the mask is a constant here.
    logits_adv_r2c = self._predict_answer(
        question_embs, question_object_features, question_len,
        tf.multiply(answer_embs, freeze(shortcut_mask)),
        answer_object_features, answer_len, object_features, num_objects)

    # 2. Optimize the mask: everything except the mask is a constant here.
    logits_adv_mask = self._predict_answer(
        freeze(question_embs), freeze(question_object_features),
        freeze(question_len),
        tf.multiply(freeze(answer_embs), shortcut_mask),
        freeze(answer_object_features), freeze(answer_len),
        freeze(object_features), freeze(num_objects))

  per_choice_shape = [batch_size, NUM_CHOICES]
  predictions.update({
      'temperature':
          1.0 * temperature,
      'image_id':
          inputs[InputFields.img_id],
      'question':
          inputs[InputFields.question],
      'answer_choices':
          inputs[InputFields.answer_choices],
      'shortcut_mask':
          tf.reshape(shortcut_mask, [batch_size, NUM_CHOICES, -1]),
      FIELD_ANSWER_PREDICTION_ORI:
          tf.reshape(logits_ori, per_choice_shape),
      FIELD_ANSWER_PREDICTION_ADV_R2C:
          tf.reshape(logits_adv_r2c, per_choice_shape),
      FIELD_ANSWER_PREDICTION_ADV_MASK:
          tf.reshape(logits_adv_mask, per_choice_shape),
  })
  return predictions
def _update_decoded_example(decoded_example, options):
  """Rewrites a decoded example into the padded, id-encoded model inputs.

  The per-choice answer/rationale fields (suffixes `_1` .. `_NUM_CHOICES`)
  are popped and padded into single tensors, "mixed" sequences are built by
  joining question, '[SEP]' and each choice (and, for rationales, the labeled
  answer as well), tokens are converted to ids, and the question fields are
  repeated once per choice.

  Args:
    decoded_example: A tensor dictionary keyed by name.
    options: An instance of reader_pb2.Reader.

  Returns:
    decoded_example: The same instance with content modified.
  """
  vocab_lookup = token_to_id.TokenToIdLayer(
      options.vocab_file, options.out_of_vocabulary_token_id)

  def pop_choices(field_prefix):
    # Removes `<prefix>_1` .. `<prefix>_NUM_CHOICES` and returns them in order.
    return [
        decoded_example.pop(field_prefix + '_%i' % i)
        for i in range(1, 1 + NUM_CHOICES)
    ]

  def repeat_per_choice(tensor, multiples):
    # Adds a leading axis and tiles it according to `multiples`.
    return tf.tile(tf.expand_dims(tensor, 0), multiples)

  # Number of objects.
  boxes = decoded_example[InputFields.detection_boxes]
  classes = decoded_example[InputFields.detection_classes]
  num_detections = tf.shape(boxes)[0]

  # Object Fast-RCNN features, one row of `frcnn_feature_dims` per object.
  frcnn_features = tf.reshape(
      decoded_example.pop(TFExampleFields.detection_features),
      [-1, options.frcnn_feature_dims])

  # Question and its length.
  question = decoded_example[InputFields.question]
  question_tag = decoded_example[InputFields.question_tag]
  question_len = tf.shape(question)[0]

  # Answer choices; tag sequences are padded with -1.
  answer_list = pop_choices(TFExampleFields.answer_choice)
  answer_tag_list = pop_choices(TFExampleFields.answer_choice_tag)
  answer_choices, answer_choices_len = _pad_sequences(answer_list)
  answer_choices_tag, _ = _pad_sequences(answer_tag_list, -1)

  # Rationale choices; tag sequences are padded with -1.
  rationale_list = pop_choices(TFExampleFields.rationale_choice)
  rationale_tag_list = pop_choices(TFExampleFields.rationale_choice_tag)
  rationale_choices, rationale_choices_len = _pad_sequences(rationale_list)
  rationale_choices_tag, _ = _pad_sequences(rationale_tag_list, -1)

  # The labeled answer, with its padding stripped again.
  label = decoded_example[InputFields.answer_label]
  answer_len = answer_choices_len[label]
  answer = answer_choices[label][:answer_len]
  answer_tag = answer_choices_tag[label][:answer_len]

  # Mixed question -> answer sequences: question [SEP] choice.
  mixed_answer_list = [
      tf.concat([question, ['[SEP]'], choice], 0) for choice in answer_list
  ]
  mixed_answer_tag_list = [
      tf.concat([question_tag, [-1], tag], 0) for tag in answer_tag_list
  ]
  mixed_answer_choices, mixed_answer_choices_len = _pad_sequences(
      mixed_answer_list)
  mixed_answer_choices_tag, _ = _pad_sequences(mixed_answer_tag_list, pad=-1)

  # Mixed question-answer -> rationale sequences:
  # question [SEP] answer [SEP] choice.
  mixed_rationale_list = [
      tf.concat([question, ['[SEP]'], answer, ['[SEP]'], choice], 0)
      for choice in rationale_list
  ]
  mixed_rationale_tag_list = [
      tf.concat([question_tag, [-1], answer_tag, [-1], tag], 0)
      for tag in rationale_tag_list
  ]
  mixed_rationale_choices, mixed_rationale_choices_len = _pad_sequences(
      mixed_rationale_list)
  mixed_rationale_choices_tag, _ = _pad_sequences(mixed_rationale_tag_list,
                                                  pad=-1)

  decoded_example.update({
      InputFields.num_detections:
          num_detections,
      InputFields.detection_classes:
          vocab_lookup(classes),
      InputFields.detection_features:
          frcnn_features,
      InputFields.question:
          repeat_per_choice(vocab_lookup(question), [NUM_CHOICES, 1]),
      InputFields.question_tag:
          repeat_per_choice(question_tag, [NUM_CHOICES, 1]),
      InputFields.question_len:
          repeat_per_choice(question_len, [NUM_CHOICES]),
      InputFields.answer_choices:
          vocab_lookup(answer_choices),
      InputFields.answer_choices_tag:
          answer_choices_tag,
      InputFields.answer_choices_len:
          answer_choices_len,
      InputFields.rationale_choices:
          vocab_lookup(rationale_choices),
      InputFields.rationale_choices_tag:
          rationale_choices_tag,
      InputFields.rationale_choices_len:
          rationale_choices_len,
      InputFields.mixed_answer_choices:
          vocab_lookup(mixed_answer_choices),
      InputFields.mixed_answer_choices_tag:
          mixed_answer_choices_tag,
      InputFields.mixed_answer_choices_len:
          mixed_answer_choices_len,
      InputFields.mixed_rationale_choices:
          vocab_lookup(mixed_rationale_choices),
      InputFields.mixed_rationale_choices_tag:
          mixed_rationale_choices_tag,
      InputFields.mixed_rationale_choices_len:
          mixed_rationale_choices_len,
  })
  return decoded_example
def predict(self, inputs, **kwargs):
  """Scores each answer choice with a BERT model over text and objects.

  The token sequence of every choice is concatenated with the detected
  objects (their label ids, features and validity masks), and each token
  additionally carries the embedding of the object it is tagged with. The
  result of `self._bert_model` is projected to one logit per choice.

  Args:
    inputs: A dictionary of input tensors keyed by names. Reads the
      detection fields (`num_detections`, `detection_boxes`,
      `detection_classes`, `detection_scores`, `detection_features`) and the
      text fields named by `self._field_answer_choices`,
      `self._field_answer_choices_tag` and `self._field_answer_choices_len`.
    **kwargs: Not used by this method.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name. It holds
      the logits, reshaped to `[batch_size, NUM_CHOICES]`, under
      `FIELD_ANSWER_PREDICTION`.
  """
  options = self._model_proto
  is_training = self._is_training

  vocab_lookup = token_to_id.TokenToIdLayer(options.bert_vocab_file,
                                            options.bert_unk_token_id)
  bert_config = BertConfig.from_json_file(options.bert_config_file)
  fc_scope = hyperparams.build_hyperparams(options.fc_hyperparams,
                                           is_training)()

  # Trim the detections to `max_num_objects`; boxes and scores are returned
  # but not needed afterwards.
  (num_objects, _, object_labels, _, object_features,
   max_num_objects) = _trim_to_max_num_objects(
       inputs[InputFields.num_detections],
       inputs[InputFields.detection_boxes],
       inputs[InputFields.detection_classes],
       inputs[InputFields.detection_scores],
       inputs[InputFields.detection_features],
       max_num_objects=options.max_num_objects)

  # Predict object embedding vectors sized to BERT's hidden size.
  object_features = _predict_object_embeddings(
      object_features,
      bert_config.hidden_size,
      fc_scope,
      keep_prob=options.dropout_keep_prob,
      is_training=is_training)

  # Gather text inputs.
  choices = inputs[self._field_answer_choices]
  choices_tag = inputs[self._field_answer_choices_tag]
  choices_len = inputs[self._field_answer_choices_len]

  batch_size = choices.shape[0]
  flat_shape = [batch_size * NUM_CHOICES, -1]  # One row per choice.

  choices_tag = _assign_invalid_tags(choices_tag, max_num_objects)

  # Tokens -> ids, plus the mask of valid token positions.
  token_ids = tf.reshape(vocab_lookup(choices), flat_shape)
  token_mask = tf.reshape(
      tf.sequence_mask(choices_len, maxlen=tf.shape(choices)[-1]),
      flat_shape)

  # Per-token object embedding selected through the tag sequence.
  choices_tag = tf.reshape(choices_tag, flat_shape)
  tag_embeddings = _ground_tag_using_object_features(object_features,
                                                     choices_tag)

  # Object masks, label ids and features as returned by `_tile_objects`.
  (tiled_object_masks, tiled_object_ids,
   tiled_object_features) = _tile_objects(num_objects,
                                          vocab_lookup(object_labels),
                                          object_features)

  # BERT inputs: text tokens followed by the objects.
  input_ids = tf.concat([token_ids, tiled_object_ids], -1)
  input_tag_embeddings = tf.concat([tag_embeddings, tiled_object_features],
                                   1)
  input_mask = tf.concat([token_mask, tiled_object_masks], -1)

  bert_output = self._bert_model(
      input_ids,
      input_tag_embeddings,
      input_mask,
      bert_config,
      bert_checkpoint_file=options.bert_checkpoint_file,
      is_training=is_training)

  # Classification layer.
  with slim.arg_scope(fc_scope):
    logits = slim.fully_connected(bert_output,
                                  num_outputs=1,
                                  activation_fn=None,
                                  scope='logits')
  logits = tf.reshape(logits, [batch_size, NUM_CHOICES])

  return {FIELD_ANSWER_PREDICTION: logits}
def predict(self, inputs, **kwargs):
  """Scores each answer choice from BERT's pooled output and an image feature.

  Object boxes are drawn on the image and written as the 'vcr/detection'
  image summary. Fast-RCNN features of the objects are averaged over the
  valid objects, tiled per choice and concatenated with the pooled BERT
  output of every question-prefixed answer choice before a one-unit dense
  layer produces the logits.

  Args:
    inputs: A dictionary of input tensors keyed by names. Reads the image
      fields (`img_data`, `img_height`, `img_width`), the object fields
      (`num_objects`, `object_bboxes`, `object_labels`, `object_scores`),
      `InputFields.answer_choices_with_question`,
      `InputFields.answer_choices_with_question_len` and
      `InputFields.answer_label`.
    **kwargs: Not used by this method.

  Returns:
    predictions: A dictionary of prediction tensors keyed by name. It holds
      the per-choice logits, shaped `[batch_size, NUM_CHOICES]`, under
      `FIELD_ANSWER_PREDICTION`.
  """
  options = self._model_proto
  is_training = self._is_training

  image = inputs[InputFields.img_data]
  height = inputs[InputFields.img_height]
  width = inputs[InputFields.img_width]
  num_objects = inputs[InputFields.num_objects]
  object_bboxes = inputs[InputFields.object_bboxes]
  object_labels = inputs[InputFields.object_labels]
  object_scores = inputs[InputFields.object_scores]
  choices = inputs[InputFields.answer_choices_with_question]
  choices_len = inputs[InputFields.answer_choices_with_question_len]
  # Looked up like in the original tuple unpacking; not consumed below.
  answer_label = inputs[InputFields.answer_label]

  # Visualize image and object bboxes, after converting the boxes with
  # `_to_batch_coordinates` using the batched image's dims 1 and 2.
  batch_size = image.shape[0]
  image_batch_shape = tf.shape(image)
  object_bboxes = _to_batch_coordinates(object_bboxes, height, width,
                                        image_batch_shape[1],
                                        image_batch_shape[2])
  image_with_boxes = visualization.draw_bounding_boxes_on_image_tensors(
      image, num_objects, object_bboxes, object_labels, object_scores)
  tf.summary.image('vcr/detection', image_with_boxes, max_outputs=10)

  # Fast-RCNN object features, averaged over the valid objects.
  frcnn_features = fast_rcnn.FastRCNN(tf.cast(image, tf.float32),
                                      object_bboxes,
                                      options=options.fast_rcnn_config,
                                      is_training=is_training)
  object_masks = tf.sequence_mask(num_objects,
                                  tf.shape(object_bboxes)[1],
                                  dtype=tf.float32)
  image_feature = masked_ops.masked_avg_nd(frcnn_features,
                                           object_masks,
                                           dim=1)

  # Tokens -> ids and validity mask, one row per (example, choice).
  flat_shape = [batch_size * NUM_CHOICES, -1]
  vocab_lookup = token_to_id.TokenToIdLayer(options.bert_vocab_file,
                                            options.bert_unk_token_id)
  flat_token_ids = tf.reshape(vocab_lookup(choices), flat_shape)
  flat_mask = tf.reshape(
      tf.sequence_mask(choices_len, maxlen=tf.shape(choices)[-1]),
      flat_shape)

  # BERT prediction: pooled output per choice.
  bert_config = BertConfig.from_json_file(options.bert_config_file)
  bert_model = BertModel(bert_config,
                         is_training,
                         input_ids=flat_token_ids,
                         input_mask=flat_mask)
  cls_feature = tf.reshape(bert_model.get_pooled_output(),
                           [batch_size, NUM_CHOICES, -1])

  # NOTE(review): the assignment map is computed but never consumed in this
  # method -- confirm the BERT checkpoint is actually loaded elsewhere.
  assignment_map, _ = get_assignment_map_from_checkpoint(
      tf.global_variables(), options.bert_checkpoint_file)

  # Fuse the image feature. The tile multiples imply `image_feature` is
  # rank 3 (assumes `masked_avg_nd` keeps the averaged axis with size 1 --
  # TODO confirm).
  image_feature_tiled = tf.tile(image_feature, [1, NUM_CHOICES, 1])
  fused_feature = tf.concat([cls_feature, image_feature_tiled], -1)

  # Classification layer: one logit per choice.
  logits = tf.compat.v1.layers.dense(fused_feature, units=1, activation=None)
  return {FIELD_ANSWER_PREDICTION: tf.squeeze(logits, axis=-1)}