def body(self, features):
  """Assemble the image/question graph and return the expanded MLP output.

  Reads `features["inputs"]` and `features["question"]`. When
  `hparams.image_input_type` is "image" the inputs are passed through
  `vqa_layers.image_embedding`; otherwise they are used as given. The
  last-axis norms of the intermediate tensors are collected under "norms"
  and handed to `vqa_layers.summarize_tensors`.

  Args:
    features: mapping providing the "inputs" and "question" entries.

  Returns:
    The `mlp` output with singleton axes inserted at positions 1 and 2.
  """
  hp = self.hparams

  if hp.image_input_type == "image":
    # NOTE(review): eval() executes the string stored in hparams; confirm
    # image_model_fn only ever comes from trusted configuration.
    visual = vqa_layers.image_embedding(
        features["inputs"],
        model_fn=eval(hp.image_model_fn),  # pylint: disable=eval-used
        trainable=hp.train_resnet,
        is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
  else:
    visual = features["inputs"]

  def log_norm(label, tensor):
    # Record the last-axis norm of `tensor` in the "norms" collection.
    utils.collect_named_outputs("norms", label, tf.norm(tensor, axis=-1))

  # Flatten, apply dropout, then run the image encoder and l2_norm.
  visual = common_layers.flatten4d3d(visual)
  visual = tf.nn.dropout(visual, keep_prob=1.-hp.dropout)
  visual = image_encoder(visual, hp)
  log_norm("image_feat_encoded", visual)
  visual = common_layers.l2_norm(visual)
  log_norm("image_feat_encoded_l2", visual)

  question_vec = question_encoder(features["question"], hp)
  log_norm("query", question_vec)

  # The question vector is passed to attn() together with the image features.
  attended = attn(visual, question_vec, hp)
  log_norm("image_ave", attended)

  # Join the attn() result and the question vector along axis 1.
  fused = tf.concat([attended, question_vec], axis=1)
  log_norm("image_question", fused)
  fused = tf.nn.dropout(fused, 1. - hp.dropout)

  logits = mlp(fused, hp)
  log_norm("output", logits)

  collected = utils.convert_collection_to_dict("norms")
  vqa_layers.summarize_tensors(collected, tag="norms/")

  # Insert singleton axes at positions 1 and 2.
  expanded = tf.expand_dims(logits, axis=1)
  return tf.expand_dims(expanded, axis=2)
def body(self, features):
  """Encode image and question jointly, then decode with a learned query.

  Reads `features["inputs"]` and `features["question"]`. Image inputs are
  embedded with `vqa_layers.image_embedding` when
  `hparams.image_input_type` is "image", otherwise used as given, and are
  projected to `hparams.hidden_size`. The last-axis norms of the
  intermediate tensors are collected under "norms" and summarized.

  Args:
    features: mapping providing the "inputs" and "question" entries.

  Returns:
    The `decoder` output with a singleton axis inserted at position 1.
  """
  hp = self.hparams

  if hp.image_input_type == "image":
    # NOTE(review): eval() executes the string stored in hparams; confirm
    # image_model_fn only ever comes from trusted configuration.
    visual = vqa_layers.image_embedding(
        features["inputs"],
        model_fn=eval(hp.image_model_fn),  # pylint: disable=eval-used
        trainable=hp.train_resnet,
        is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
  else:
    visual = features["inputs"]

  def log_norm(label, tensor):
    # Record the last-axis norm of `tensor` in the "norms" collection.
    utils.collect_named_outputs("norms", label, tf.norm(tensor, axis=-1))

  visual = common_layers.flatten4d3d(visual)
  visual = common_layers.dense(visual, hp.hidden_size)
  log_norm("image_feat_after_proj", visual)

  question = common_layers.flatten4d3d(features["question"])
  log_norm("question_embedding", question)

  prepared = prepare_image_question_encoder(visual, question, hp)
  encoder_input, self_attention_bias, cross_attention_bias = prepared
  encoder_input = tf.nn.dropout(
      encoder_input, keep_prob=1. - hp.layer_prepostprocess_dropout)

  encoder_output = image_question_encoder(
      encoder_input, self_attention_bias, hp)
  log_norm("encoder_output", encoder_output)

  # A single "query" variable, scaled by sqrt(hidden_size), is given two
  # leading singleton axes and tiled once per batch element.
  query_var = tf.get_variable("query", [hp.hidden_size])
  query = query_var * hp.hidden_size**0.5
  query = tf.expand_dims(query, axis=0)
  query = tf.expand_dims(query, axis=0)
  batch_size = common_layers.shape_list(encoder_input)[0]
  query = tf.tile(query, [batch_size, 1, 1])
  query = tf.nn.dropout(
      query, keep_prob=1. - hp.layer_prepostprocess_dropout)

  decoder_output = decoder(
      query, encoder_output, None, cross_attention_bias, hp)
  log_norm("decoder_output", decoder_output)

  collected = utils.convert_collection_to_dict("norms")
  vqa_layers.summarize_tensors(collected, tag="norms/")

  # Only one axis is added here (at position 1).
  return tf.expand_dims(decoder_output, axis=1)
def body(self, features):
  """Run the recurrent encoder/decoder pair over image and question.

  Reads `features["inputs"]` and `features["question"]`. Image inputs are
  embedded with `vqa_layers.image_embedding` when
  `hparams.image_input_type` is "image", otherwise used as given, and are
  projected to `hparams.hidden_size`. Both the encoding and the decoding
  step call `recurrent_transformer_decoder`, under the names "encoder" and
  "decoder"; only the first element of each returned pair is used.

  Args:
    features: mapping providing the "inputs" and "question" entries.

  Returns:
    The decoder output with a singleton axis inserted at position 1.
  """
  hp = self.hparams

  if hp.image_input_type == "image":
    # NOTE(review): eval() executes the string stored in hparams; confirm
    # image_model_fn only ever comes from trusted configuration.
    visual = vqa_layers.image_embedding(
        features["inputs"],
        model_fn=eval(hp.image_model_fn),  # pylint: disable=eval-used
        trainable=hp.train_resnet,
        is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
  else:
    visual = features["inputs"]

  def log_norm(label, tensor):
    # Record the last-axis norm of `tensor` in the "norms" collection.
    utils.collect_named_outputs("norms", label, tf.norm(tensor, axis=-1))

  visual = common_layers.flatten4d3d(visual)
  visual = common_layers.dense(visual, hp.hidden_size)
  log_norm("image_feat_after_proj", visual)

  question = common_layers.flatten4d3d(features["question"])
  log_norm("question_embedding", question)

  prepared = prepare_image_question_encoder(visual, question, hp)
  encoder_input, self_attention_bias, cross_attention_bias = prepared
  encoder_input = tf.nn.dropout(
      encoder_input, keep_prob=1.-hp.layer_prepostprocess_dropout)

  encoded = recurrent_transformer_decoder(
      encoder_input, None, self_attention_bias, None, hp, name="encoder")
  encoder_output = encoded[0]
  log_norm("encoder_output", encoder_output)

  # A single "query" variable, scaled by sqrt(hidden_size), is given two
  # leading singleton axes and tiled once per batch element.
  query_var = tf.get_variable("query", [hp.hidden_size])
  query = query_var * hp.hidden_size**0.5
  query = tf.expand_dims(query, axis=0)
  query = tf.expand_dims(query, axis=0)
  batch_size = common_layers.shape_list(encoder_input)[0]
  query = tf.tile(query, [batch_size, 1, 1])
  query = tf.nn.dropout(
      query, keep_prob=1.-hp.layer_prepostprocess_dropout)

  decoded = recurrent_transformer_decoder(
      query, encoder_output, None, cross_attention_bias, hp,
      name="decoder")
  decoder_output = decoded[0]
  log_norm("decoder_output", decoder_output)

  collected = utils.convert_collection_to_dict("norms")
  vqa_layers.summarize_tensors(collected, tag="norms/")

  # Only one axis is added here (at position 1).
  return tf.expand_dims(decoder_output, axis=1)
def body(self, features):
  """Build the attention graph, defaulting the image model to resnet_v1_152.

  Reads `features["inputs"]` and `features["question"]`. The image model
  function is `resnet_v1_152` unless `hparams.image_model_fn` names
  something else, in which case that string is evaluated. Image inputs are
  embedded only when `hparams.image_input_type` is "image". The last-axis
  norms of the intermediate tensors are collected under "norms" and
  summarized.

  Args:
    features: mapping providing the "inputs" and "question" entries.

  Returns:
    The `mlp` output with singleton axes inserted at positions 1 and 2.
  """
  hp = self.hparams

  model_fn = resnet_v1_152
  if hp.image_model_fn != "resnet_v1_152":
    # NOTE(review): eval() executes the string stored in hparams; confirm
    # image_model_fn only ever comes from trusted configuration.
    model_fn = eval(hp.image_model_fn)  # pylint: disable=eval-used

  if hp.image_input_type == "image":
    visual = vqa_layers.image_embedding(
        features["inputs"],
        model_fn=model_fn,
        trainable=hp.train_resnet,
        is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
  else:
    visual = features["inputs"]

  def log_norm(label, tensor):
    # Record the last-axis norm of `tensor` in the "norms" collection.
    utils.collect_named_outputs("norms", label, tf.norm(tensor, axis=-1))

  # Optional dense projection, enabled by a truthy image_feat_size.
  if hp.image_feat_size:
    visual = common_layers.dense(visual, hp.image_feat_size)

  # Normalize with common_layers.l2_norm, then apply dropout.
  log_norm("image_feat_before_l2", visual)
  visual = common_layers.l2_norm(visual)
  log_norm("image_feat_after_l2", visual)
  visual = tf.nn.dropout(visual, keep_prob=1.-hp.dropout)

  question_vec = question_encoder(features["question"], hp)
  log_norm("query", question_vec)

  attended = attn(visual, question_vec, hp)
  log_norm("image_ave", attended)

  # Join the attn() result and the question vector along axis 1.
  fused = tf.concat([attended, question_vec], axis=1)
  log_norm("image_question", fused)
  fused = tf.nn.dropout(fused, 1. - hp.dropout)

  logits = mlp(fused, hp)
  log_norm("output", logits)

  collected = utils.convert_collection_to_dict("norms")
  vqa_layers.summarize_tensors(collected, tag="norms/")

  # Insert singleton axes at positions 1 and 2.
  expanded = tf.expand_dims(logits, axis=1)
  return tf.expand_dims(expanded, axis=2)
def body(self, features):
  """Build the attention graph over l2-normalized image features.

  Reads `features["inputs"]` and `features["question"]`. Image inputs are
  embedded with `vqa_layers.image_embedding` when
  `hparams.image_input_type` is "image", otherwise used as given. The
  last-axis norms of the intermediate tensors are collected under "norms"
  and summarized.

  Args:
    features: mapping providing the "inputs" and "question" entries.

  Returns:
    The `mlp` output with singleton axes inserted at positions 1 and 2.
  """
  hp = self.hparams

  if hp.image_input_type == "image":
    # NOTE(review): eval() executes the string stored in hparams; confirm
    # image_model_fn only ever comes from trusted configuration.
    visual = vqa_layers.image_embedding(
        features["inputs"],
        model_fn=eval(hp.image_model_fn),  # pylint: disable=eval-used
        trainable=hp.train_resnet,
        is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
  else:
    visual = features["inputs"]

  def log_norm(label, tensor):
    # Record the last-axis norm of `tensor` in the "norms" collection.
    utils.collect_named_outputs("norms", label, tf.norm(tensor, axis=-1))

  # Optional dense projection, enabled by a truthy image_feat_size.
  if hp.image_feat_size:
    visual = common_layers.dense(visual, hp.image_feat_size)

  # Normalize with common_layers.l2_norm, then apply dropout.
  log_norm("image_feat_before_l2", visual)
  visual = common_layers.l2_norm(visual)
  log_norm("image_feat_after_l2", visual)
  visual = tf.nn.dropout(visual, keep_prob=1.-hp.dropout)

  question_vec = question_encoder(features["question"], hp)
  log_norm("query", question_vec)

  attended = attn(visual, question_vec, hp)
  log_norm("image_ave", attended)

  # Join the attn() result and the question vector along axis 1.
  fused = tf.concat([attended, question_vec], axis=1)
  log_norm("image_question", fused)
  fused = tf.nn.dropout(fused, 1. - hp.dropout)

  logits = mlp(fused, hp)
  log_norm("output", logits)

  collected = utils.convert_collection_to_dict("norms")
  vqa_layers.summarize_tensors(collected, tag="norms/")

  # Insert singleton axes at positions 1 and 2.
  expanded = tf.expand_dims(logits, axis=1)
  return tf.expand_dims(expanded, axis=2)
def body(self, features):
  """Build the graph: image features, encoded question, attention, MLP.

  Reads `features["inputs"]` and `features["question"]`. Image inputs are
  embedded with `vqa_layers.image_embedding` when
  `hparams.image_input_type` is "image", otherwise used as given. The
  question is encoded, max-pooled over axis 1 and passed to `attn`
  together with the image features. The two results are merged according
  to `hparams.multimodal_combine`. The last-axis norms of the intermediate
  tensors are collected under "norms" and summarized.

  Args:
    features: mapping providing the "inputs" and "question" entries.

  Returns:
    The `mlp` output with singleton axes inserted at positions 1 and 2.

  Raises:
    ValueError: if `hparams.multimodal_combine` is not one of "concat",
      "sum" or "product".
  """
  hp = self.hparams

  if hp.image_input_type == "image":
    # NOTE(review): eval() executes the string stored in hparams; confirm
    # image_model_fn only ever comes from trusted configuration.
    image_feat = vqa_layers.image_embedding(
        features["inputs"],
        model_fn=eval(hp.image_model_fn),  # pylint: disable=eval-used
        trainable=hp.train_resnet,
        is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
  else:
    image_feat = features["inputs"]

  def log_norm(label, tensor):
    # Record the last-axis norm of `tensor` in the "norms" collection.
    utils.collect_named_outputs("norms", label, tf.norm(tensor, axis=-1))

  image_feat = common_layers.flatten4d3d(image_feat)
  image_hidden_size = hp.image_hidden_size or hp.hidden_size
  if hp.image_feat_preprocess_proj:
    image_feat = common_layers.dense(image_feat, image_hidden_size)
    log_norm("image_feat_after_proj", image_feat)
  else:
    # Without the projection the features keep their incoming depth;
    # presumably 2048 is that depth -- TODO confirm against the input data.
    assert image_hidden_size == 2048

  image_feat = tf.nn.dropout(
      image_feat, keep_prob=1.-hp.layer_prepostprocess_dropout)

  if hp.image_feat_encode:
    image_feat = image_encoder(image_feat, hp)
    log_norm("image_feat_encoded", image_feat)
  else:
    image_feat = common_layers.layer_norm(image_feat)
    log_norm("image_feat_after_layer", image_feat)

  question = common_layers.flatten4d3d(features["question"])
  log_norm("question_embedding", question)
  question, question_self_attention_bias = prepare_question_encoder(
      question, hp)
  question = tf.nn.dropout(
      question, keep_prob=1.-hp.layer_prepostprocess_dropout)
  query = question_encoder(question, question_self_attention_bias, hp)
  log_norm("query_encode", query)

  # Add the attention bias (axes 1 and 2 squeezed out, then re-expanded as a
  # trailing axis) before max-pooling over axis 1. Presumably the bias is
  # strongly negative at padded positions so they never win the max --
  # TODO confirm against prepare_question_encoder.
  position_bias = tf.squeeze(question_self_attention_bias, [1, 2])
  query = query + tf.expand_dims(position_bias, axis=2)
  query = tf.reduce_max(query, axis=1)
  log_norm("query_maxpool", query)

  image_ave = attn(image_feat, query, hp)
  log_norm("image_ave", image_ave)

  if hp.multimodal_combine == "concat":
    image_question = tf.concat([image_ave, query], axis=1)
  elif hp.multimodal_combine == "sum":
    image_question = image_ave + query
  elif hp.multimodal_combine == "product":
    image_question = image_ave * query
  else:
    # Previously an unknown value left image_question unbound and failed
    # with UnboundLocalError on the next statement.
    raise ValueError(
        "Unsupported multimodal_combine: %r (expected 'concat', 'sum' or "
        "'product')" % (hp.multimodal_combine,))
  log_norm("image_question", image_question)

  image_question = tf.nn.dropout(image_question, 1. - hp.dropout)

  output = mlp(image_question, hp)
  log_norm("output", output)

  norm_tensors = utils.convert_collection_to_dict("norms")
  vqa_layers.summarize_tensors(norm_tensors, tag="norms/")

  # Insert singleton axes at positions 1 and 2.
  return tf.expand_dims(tf.expand_dims(output, axis=1), axis=2)
def body(self, features):
  """Build the graph: image features, encoded question, attention, MLP.

  Reads `features["inputs"]` and `features["question"]`. Image inputs are
  embedded with `vqa_layers.image_embedding` when
  `hparams.image_input_type` is "image", otherwise used as given. The
  question is encoded, max-pooled over axis 1 and passed to `attn`
  together with the image features. The two results are merged according
  to `hparams.multimodal_combine`. The last-axis norms of the intermediate
  tensors are collected under "norms" and summarized.

  Args:
    features: mapping providing the "inputs" and "question" entries.

  Returns:
    The `mlp` output with singleton axes inserted at positions 1 and 2.

  Raises:
    ValueError: if `hparams.multimodal_combine` is not one of "concat",
      "sum" or "product".
  """
  hp = self.hparams

  if hp.image_input_type == "image":
    # NOTE(review): eval() executes the string stored in hparams; confirm
    # image_model_fn only ever comes from trusted configuration.
    image_feat = vqa_layers.image_embedding(
        features["inputs"],
        model_fn=eval(hp.image_model_fn),  # pylint: disable=eval-used
        trainable=hp.train_resnet,
        is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
  else:
    image_feat = features["inputs"]

  def log_norm(label, tensor):
    # Record the last-axis norm of `tensor` in the "norms" collection.
    utils.collect_named_outputs("norms", label, tf.norm(tensor, axis=-1))

  image_feat = common_layers.flatten4d3d(image_feat)
  image_hidden_size = hp.image_hidden_size or hp.hidden_size
  if hp.image_feat_preprocess_proj:
    image_feat = common_layers.dense(image_feat, image_hidden_size)
    log_norm("image_feat_after_proj", image_feat)
  else:
    # Without the projection the features keep their incoming depth;
    # presumably 2048 is that depth -- TODO confirm against the input data.
    assert image_hidden_size == 2048

  image_feat = tf.nn.dropout(
      image_feat, keep_prob=1. - hp.layer_prepostprocess_dropout)

  if hp.image_feat_encode:
    image_feat = image_encoder(image_feat, hp)
    log_norm("image_feat_encoded", image_feat)
  else:
    image_feat = common_layers.layer_norm(image_feat)
    log_norm("image_feat_after_layer", image_feat)

  question = common_layers.flatten4d3d(features["question"])
  log_norm("question_embedding", question)
  question, question_self_attention_bias = prepare_question_encoder(
      question, hp)
  question = tf.nn.dropout(
      question, keep_prob=1. - hp.layer_prepostprocess_dropout)
  query = question_encoder(question, question_self_attention_bias, hp)
  log_norm("query_encode", query)

  # Add the attention bias (axes 1 and 2 squeezed out, then re-expanded as a
  # trailing axis) before max-pooling over axis 1. Presumably the bias is
  # strongly negative at padded positions so they never win the max --
  # TODO confirm against prepare_question_encoder.
  position_bias = tf.squeeze(question_self_attention_bias, [1, 2])
  query = query + tf.expand_dims(position_bias, axis=2)
  query = tf.reduce_max(query, axis=1)
  log_norm("query_maxpool", query)

  image_ave = attn(image_feat, query, hp)
  log_norm("image_ave", image_ave)

  if hp.multimodal_combine == "concat":
    image_question = tf.concat([image_ave, query], axis=1)
  elif hp.multimodal_combine == "sum":
    image_question = image_ave + query
  elif hp.multimodal_combine == "product":
    image_question = image_ave * query
  else:
    # Previously an unknown value left image_question unbound and failed
    # with UnboundLocalError on the next statement.
    raise ValueError(
        "Unsupported multimodal_combine: %r (expected 'concat', 'sum' or "
        "'product')" % (hp.multimodal_combine,))
  log_norm("image_question", image_question)

  image_question = tf.nn.dropout(image_question, 1. - hp.dropout)

  output = mlp(image_question, hp)
  log_norm("output", output)

  norm_tensors = utils.convert_collection_to_dict("norms")
  vqa_layers.summarize_tensors(norm_tensors, tag="norms/")

  # Insert singleton axes at positions 1 and 2.
  return tf.expand_dims(tf.expand_dims(output, axis=1), axis=2)