def build(self):
    """ build network architecture and loss """

    """ Visual features """
    with tf.device('/cpu:0'):
        def load_feature(image_idx):
            selected_features = np.take(self.features, image_idx, axis=0)
            return selected_features
        V_ft = tf.py_func(
            load_feature, inp=[self.batch['image_idx']], Tout=tf.float32,
            name='sample_features')

        V_ft.set_shape([None, self.max_box_num, self.vfeat_dim])
        num_V_ft = tf.gather(self.num_boxes, self.batch['image_idx'],
                             name='gather_num_V_ft', axis=0)
        self.mid_result['num_V_ft'] = num_V_ft
        normal_boxes = tf.gather(self.normal_boxes, self.batch['image_idx'],
                                 name='gather_normal_boxes', axis=0)
        self.mid_result['normal_boxes'] = normal_boxes

    log.warning('v_linear_v')
    v_linear_v = modules.fc_layer(
        V_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='v_linear_v')

    """ Encode question """
    q_embed = tf.nn.embedding_lookup(self.glove_map, self.batch['q_intseq'])
    # [bs, L_DIM]
    q_L_ft = modules.encode_L(q_embed, self.batch['q_intseq_len'], L_DIM,
                              cell_type='GRU')
    self.heavy_output['condition'] = q_L_ft

    # [bs, V_DIM]
    log.warning('q_linear_v')
    q_linear_v = modules.fc_layer(
        q_L_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_v')
    self.mid_result['q_linear_v'] = q_linear_v

    """ Perform attention """
    att_score = modules.hadamard_attention(
        v_linear_v, num_V_ft, q_linear_v,
        use_ln=False, is_train=self.is_train)
    self.output['att_score'] = att_score
    self.mid_result['att_score'] = att_score
    pooled_V_ft = modules.attention_pooling(V_ft, att_score)
    self.mid_result['pooled_V_ft'] = pooled_V_ft

    log.warning('pooled_linear_l')
    pooled_linear_l = modules.fc_layer(
        pooled_V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')
    self.mid_result['pooled_linear_l'] = pooled_linear_l

    ##############################
    # 1. Fixed vlmap classifier
    ##############################
    """ Answer classification """
    log.warning('q_linear_l')
    l_linear_l = modules.fc_layer(
        q_L_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')
    self.mid_result['l_linear_l'] = l_linear_l

    joint = modules.fc_layer(
        pooled_linear_l * l_linear_l, L_DIM * 2,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='joint_fc')
    joint = tf.nn.dropout(joint, 0.5)
    self.mid_result['joint'] = joint

    logit = modules.WordWeightAnswer(
        joint, self.answer_dict, self.word_weight_dir,
        use_bias=True, is_training=self.is_train,
        default_bias=-100.0, scope='WordWeightAnswer')
    # Answers without pretrained word weights receive the per-example
    # minimum logit, so they are never preferred over existing answers.
    min_logit = tf.tile(tf.reduce_min(logit, axis=1, keepdims=True),
                        [1, self.num_answer])
    logit = logit * self.answer_exist_mask + \
        min_logit * self.answer_non_exist_mask

    ##########################
    # 2. Fine tuned vlmap
    ##########################
    log.warning('tuned_q_linear_l')
    tuned_l_linear_l = modules.fc_layer(
        q_L_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='tuned_q_linear_l')
    self.mid_result['tuned_l_linear_l'] = tuned_l_linear_l

    tuned_joint = modules.fc_layer(
        pooled_linear_l * tuned_l_linear_l, L_DIM * 2,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='tuned_joint_fc')
    tuned_joint = tf.nn.dropout(tuned_joint, 0.5)
    self.mid_result['tuned_joint'] = tuned_joint

    # Classify from the fine-tuned joint feature.
    tuned_logit = modules.fc_layer(
        tuned_joint, self.num_answer,
        use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train,
        scope='TunedWordWeightAnswer')

    ##########################
    # 3. Combine logits
    ##########################
    self.output['logit'] = logit + tuned_logit
    self.mid_result['logit'] = logit + tuned_logit

    """ Compute loss and accuracy """
    with tf.name_scope('loss'):
        answer_target = self.batch['answer_target']
        untuned_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=answer_target, logits=logit)
        tuned_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=answer_target, logits=logit + tuned_logit)
        loss = untuned_loss + tuned_loss
        train_loss = tf.reduce_mean(
            tf.reduce_sum(loss * self.train_answer_mask, axis=-1))
        report_loss = tf.reduce_mean(tf.reduce_sum(loss, axis=-1))

        pred = tf.cast(tf.argmax(logit + tuned_logit, axis=-1),
                       dtype=tf.int32)
        one_hot_pred = tf.one_hot(pred, depth=self.num_answer,
                                  dtype=tf.float32)
        self.output['pred'] = pred

        all_score = tf.reduce_sum(one_hot_pred * answer_target, axis=-1)
        max_train_score = tf.reduce_max(
            answer_target * self.train_answer_mask, axis=-1)
        test_obj_score = tf.reduce_sum(
            one_hot_pred * answer_target * self.test_answer_mask *
            self.obj_answer_mask, axis=-1)
        test_obj_max_score = tf.reduce_max(
            answer_target * self.test_answer_mask * self.obj_answer_mask,
            axis=-1)
        test_attr_score = tf.reduce_sum(
            one_hot_pred * answer_target * self.test_answer_mask *
            self.attr_answer_mask, axis=-1)
        test_attr_max_score = tf.reduce_max(
            answer_target * self.test_answer_mask * self.attr_answer_mask,
            axis=-1)
        self.output['test_obj_score'] = test_obj_score
        self.output['test_obj_max_score'] = test_obj_max_score
        self.output['test_attr_score'] = test_attr_score
        self.output['test_attr_max_score'] = test_attr_max_score
        self.output['all_score'] = all_score
        self.output['max_train_score'] = max_train_score

        acc = tf.reduce_mean(all_score)
        exist_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.answer_exist_mask, axis=-1))
        test_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.test_answer_mask, axis=-1))
        test_obj_acc = tf.reduce_mean(test_obj_score)
        test_attr_acc = tf.reduce_mean(test_attr_score)
        train_exist_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.answer_exist_mask * self.train_answer_mask,
                          axis=-1))
        max_exist_answer_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.answer_exist_mask, axis=-1))
        max_train_exist_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.answer_exist_mask *
                          self.train_answer_mask, axis=-1))
        test_obj_max_acc = tf.reduce_mean(test_obj_max_score)
        test_attr_max_acc = tf.reduce_mean(test_attr_max_score)
        test_max_answer_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.test_answer_mask, axis=-1))
        test_max_exist_answer_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.answer_exist_mask *
                          self.test_answer_mask, axis=-1))

        # Normalize each accuracy by its achievable maximum; tf.where guards
        # against division by zero when the maximum is 0.
        normal_test_obj_acc = tf.where(
            tf.equal(test_obj_max_acc, 0),
            test_obj_max_acc, test_obj_acc / test_obj_max_acc)
        normal_test_attr_acc = tf.where(
            tf.equal(test_attr_max_acc, 0),
            test_attr_max_acc, test_attr_acc / test_attr_max_acc)
        normal_train_exist_acc = tf.where(
            tf.equal(max_train_exist_acc, 0),
            max_train_exist_acc, train_exist_acc / max_train_exist_acc)
        normal_exist_acc = tf.where(
            tf.equal(max_exist_answer_acc, 0),
            max_exist_answer_acc, exist_acc / max_exist_answer_acc)
        normal_test_acc = tf.where(
            tf.equal(test_max_answer_acc, 0),
            test_max_answer_acc, test_acc / test_max_answer_acc)

        self.mid_result['pred'] = pred

        self.losses['answer'] = train_loss
        self.report['answer_train_loss'] = train_loss
        self.report['answer_report_loss'] = report_loss
        self.report['answer_acc'] = acc
        self.report['exist_acc'] = exist_acc
        self.report['test_acc'] = test_acc
        self.report['normal_test_acc'] = normal_test_acc
        self.report['normal_test_object_acc'] = normal_test_obj_acc
        self.report['normal_test_attribute_acc'] = normal_test_attr_acc
        self.report['normal_exist_acc'] = normal_exist_acc
        self.report['normal_train_exist_acc'] = normal_train_exist_acc
        self.report['max_exist_acc'] = max_exist_answer_acc
        self.report['test_max_acc'] = test_max_answer_acc
        self.report['test_max_exist_acc'] = test_max_exist_answer_acc

    """ Prepare image summary """
    """
    with tf.name_scope('prepare_summary'):
        self.vis_image['image_attention_qa'] = self.visualize_vqa_result(
            self.batch['image_id'],
            self.mid_result['normal_boxes'], self.mid_result['num_V_ft'],
            self.mid_result['att_score'],
            self.batch['q_intseq'], self.batch['q_intseq_len'],
            self.batch['answer_target'], self.mid_result['pred'],
            max_batch_num=20, line_width=2)
    """

    self.loss = 0
    for key, loss in self.losses.items():
        self.loss = self.loss + loss

    # scalar summary
    for key, val in self.report.items():
        tf.summary.scalar('train/{}'.format(key), val,
                          collections=['heavy_train', 'train'])
        tf.summary.scalar('val/{}'.format(key), val,
                          collections=['heavy_val', 'val'])
        tf.summary.scalar('testval/{}'.format(key), val,
                          collections=['heavy_testval', 'testval'])

    # image summary
    for key, val in self.vis_image.items():
        tf.summary.image('train-{}'.format(key), val, max_outputs=10,
                         collections=['heavy_train'])
        tf.summary.image('val-{}'.format(key), val, max_outputs=10,
                         collections=['heavy_val'])
        tf.summary.image('testval-{}'.format(key), val, max_outputs=10,
                         collections=['heavy_testval'])

    return self.loss
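# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model): build() relies on
# `modules.hadamard_attention` and `modules.attention_pooling`, whose
# internals are not shown here. A minimal NumPy rendition of the assumed
# semantics -- score each box via a Hadamard (elementwise) interaction with
# the question feature, mask padded boxes, softmax, then pool -- is below.
# The masked-softmax detail and exact shapes are assumptions.
# ---------------------------------------------------------------------------
import numpy as np

def sketch_hadamard_attention(v_linear_v, num_v_ft, q_linear_v):
    # v_linear_v: [bs, num_box, dim], num_v_ft: [bs], q_linear_v: [bs, dim]
    bs, num_box, _ = v_linear_v.shape
    # Hadamard interaction, reduced to one scalar score per box.
    score = np.sum(v_linear_v * q_linear_v[:, None, :], axis=-1)  # [bs, num_box]
    # Mask out padded boxes before the softmax.
    valid = np.arange(num_box)[None, :] < num_v_ft[:, None]       # [bs, num_box]
    score = np.where(valid, score, -1e9)
    score = np.exp(score - score.max(axis=1, keepdims=True))
    return score / score.sum(axis=1, keepdims=True)               # attention weights

def sketch_attention_pooling(V_ft, att_score):
    # Attention-weighted sum of box features: [bs, num_box, feat] -> [bs, feat]
    return np.sum(V_ft * att_score[:, :, None], axis=1)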
def build_attribute_attention(self):
    """ attribute_attention """
    num_V_ft = self.batch['num_boxes']
    v_linear_v = self.mid_result['v_linear_v']

    w_embed = tf.nn.embedding_lookup(self.v_word_map,
                                     self.batch['attr_att/word_tokens'])
    w_L_ft = modules.fc_layer(
        w_embed, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='v_word_fc')
    w_len = self.batch['attr_att/word_tokens_len']
    mask = tf.sequence_mask(  # [bs, #proposal, len]
        w_len, maxlen=tf.shape(w_L_ft)[-2], dtype=tf.float32)
    # Masked mean-pooling of word features over the token dimension.
    pooled_w_L_ft = tf.reduce_sum(w_L_ft * tf.expand_dims(mask, axis=-1),
                                  axis=-2)
    pooled_w_L_ft = pooled_w_L_ft / tf.expand_dims(
        tf.to_float(w_len), axis=-1)

    l_linear_v = modules.fc_layer(
        pooled_w_L_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_v')

    # Tile visual features over attribute entries and flatten, so attention
    # runs for every (example, attribute-entry) pair in one batched call.
    tile_v_linear_v = tf.tile(tf.expand_dims(v_linear_v, axis=1),
                              [1, self.data_cfg.n_attr_att, 1, 1])
    flat_tile_v_linear_v = tf.reshape(
        tile_v_linear_v, [-1, self.data_cfg.max_box_num, V_DIM])
    tile_num_V_ft = tf.tile(tf.expand_dims(num_V_ft, axis=1),
                            [1, self.data_cfg.n_attr_att])
    flat_tile_num_V_ft = tf.reshape(tile_num_V_ft, [-1])

    flat_l_linear_v = tf.reshape(l_linear_v, [-1, V_DIM])

    # flat_att_logit: [bs * #attr, num_proposal]
    flat_att_logit = modules.hadamard_attention(
        flat_tile_v_linear_v, flat_tile_num_V_ft, flat_l_linear_v,
        use_ln=False, is_train=self.is_train, normalizer=None)

    n_entry = self.data_cfg.n_attr_att
    n_proposal = self.data_cfg.max_box_num
    logit = tf.reshape(flat_att_logit, [-1, n_entry, n_proposal])

    with tf.name_scope('loss/attr_attend'):
        multilabel_gt = tf.to_float(
            tf.greater(self.batch['attr_att/att_scores'], 0.5))
        num_valid_entry = self.batch['attr_att/num']
        valid_mask = tf.sequence_mask(
            num_valid_entry, maxlen=self.data_cfg.n_attr_att,
            dtype=tf.float32)
        loss, acc, recall, precision, top_1_prec, top_k_recall = \
            self.binary_classification_loss(logit, multilabel_gt, valid_mask,
                                            depth=self.data_cfg.max_box_num)
        self.losses['attr_att'] = loss
        self.report['attr_att_loss'] = loss
        self.report['attr_att_acc'] = acc
        self.report['attr_att_recall'] = recall
        self.report['attr_att_precision'] = precision
        self.report['attr_att_top_1_prec'] = top_1_prec
        self.report['attr_att_top_{}_recall'.format(TOP_K)] = top_k_recall
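# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model): the tile-and-reshape in
# build_attribute_attention is valid only because the flattened visual and
# language tensors enumerate (example, entry) pairs in the same example-major
# order. This tiny NumPy check illustrates that; the dimensions are made up.
# ---------------------------------------------------------------------------
import numpy as np

bs, n_entry, num_box, dim = 2, 3, 4, 5
v = np.random.randn(bs, num_box, dim)             # per-example visual features
l = np.random.randn(bs, n_entry, dim)             # per-entry language features

tile_v = np.tile(v[:, None], (1, n_entry, 1, 1))  # [bs, n_entry, num_box, dim]
flat_v = tile_v.reshape(-1, num_box, dim)         # [bs * n_entry, num_box, dim]
flat_l = l.reshape(-1, dim)                       # [bs * n_entry, dim]

# Row k of both flattened tensors refers to example k // n_entry, entry k % n_entry.
assert np.allclose(flat_v[1 * n_entry + 2], v[1])
assert np.allclose(flat_l[1 * n_entry + 2], l[1, 2])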
def build_object_V_ft(self):
    V_ft = self.batch['image_ft']  # [bs, #proposal, #feat_dim]
    V_ft = tf.expand_dims(V_ft, axis=1)  # [bs, 1, #proposal, #feat_dim]
    V_ft = tf.tile(V_ft, [1, self.data_cfg.n_obj_bf, 1, 1
                          ])  # [bs, #obj, #proposal, #feat_dim]
    V_ft = tf.reshape(
        V_ft, [-1, self.data_cfg.max_box_num, self.data_cfg.vfeat_dim
               ])  # [bs * #obj, #proposal, #feat_dim]

    spat_ft = self.batch['spatial_ft']
    spat_ft = tf.expand_dims(spat_ft, axis=1)
    spat_ft = tf.tile(spat_ft, [1, self.data_cfg.n_obj_bf, 1, 1])
    spat_ft = tf.reshape(spat_ft, [-1, self.data_cfg.max_box_num, 6])

    num_V_ft = self.batch['num_boxes']  # [bs]
    num_V_ft = tf.expand_dims(num_V_ft, axis=1)  # [bs, 1]
    num_V_ft = tf.tile(num_V_ft, [1, self.data_cfg.n_obj_bf])  # [bs, #obj]
    num_V_ft = tf.reshape(num_V_ft, [-1])  # [bs * #obj]

    key_spat_ft = self.batch['obj_blank_fill/normal_boxes']
    key_spat_ft = tf.concat([
        key_spat_ft,
        tf.expand_dims(key_spat_ft[:, :, 2] - key_spat_ft[:, :, 0], axis=-1),
        tf.expand_dims(key_spat_ft[:, :, 3] - key_spat_ft[:, :, 1], axis=-1)
    ], axis=-1)

    v_linear_v = modules.fc_layer(  # [bs * #obj, #proposal, V_DIM]
        spat_ft, V_DIM,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='spat_v_linear_v')

    q_linear_v = modules.fc_layer(  # [bs, #obj, V_DIM]
        key_spat_ft, V_DIM,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='spat_q_linear_v')
    flat_q_linear_v = tf.reshape(q_linear_v, [-1, V_DIM])  # [bs * #obj, V_DIM]

    att_score = modules.hadamard_attention(  # [bs * #obj, len]
        v_linear_v, num_V_ft, flat_q_linear_v,
        use_ln=False, is_train=self.is_train, scope='spat_att')
    flat_pooled_V_ft = modules.attention_pooling(
        V_ft, att_score)  # [bs * #obj, vfeat_dim]
    pooled_V_ft = tf.reshape(
        flat_pooled_V_ft,
        [-1, self.data_cfg.n_obj_bf, self.data_cfg.vfeat_dim])

    self.mid_result['object_pooled_V_ft'] = pooled_V_ft
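# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model): the tf.concat in
# build_object_V_ft extends [x1, y1, x2, y2] boxes with width and height so
# the query boxes match the 6-d `spatial_ft` layout. That 6-d layout is an
# assumption read off the reshape to [..., 6] above.
# ---------------------------------------------------------------------------
import numpy as np

def sketch_box_to_spatial(boxes):
    # boxes: [..., 4] as [x1, y1, x2, y2] in normalized coordinates
    w = boxes[..., 2:3] - boxes[..., 0:1]
    h = boxes[..., 3:4] - boxes[..., 1:2]
    return np.concatenate([boxes, w, h], axis=-1)  # [..., 6]

print(sketch_box_to_spatial(np.array([[0.1, 0.2, 0.5, 0.8]])))
# -> [[0.1 0.2 0.5 0.8 0.4 0.6]]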
def build(self):
    """ build network architecture and loss """

    """ Visual features """
    with tf.device('/cpu:0'):
        def load_feature(image_idx):
            selected_features = np.take(self.features, image_idx, axis=0)
            return selected_features
        V_ft = tf.py_func(
            load_feature, inp=[self.batch['image_idx']], Tout=tf.float32,
            name='sample_features')

        V_ft.set_shape([None, self.max_box_num, self.vfeat_dim])
        num_V_ft = tf.gather(self.num_boxes, self.batch['image_idx'],
                             name='gather_num_V_ft', axis=0)
        self.mid_result['num_V_ft'] = num_V_ft
        normal_boxes = tf.gather(self.normal_boxes, self.batch['image_idx'],
                                 name='gather_normal_boxes', axis=0)
        self.mid_result['normal_boxes'] = normal_boxes

    log.warning('v_linear_v')
    v_linear_v = modules.fc_layer(
        V_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='v_linear_v')

    """ Encode question """
    q_embed = tf.nn.embedding_lookup(self.glove_map, self.batch['q_intseq'])
    # [bs, L_DIM]
    q_L_ft = modules.encode_L(q_embed, self.batch['q_intseq_len'], L_DIM,
                              cell_type='GRU')
    self.heavy_output['condition'] = q_L_ft

    # [bs, V_DIM]
    log.warning('q_linear_v')
    q_linear_v = modules.fc_layer(
        q_L_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_v')

    """ Perform attention """
    att_score = modules.hadamard_attention(
        v_linear_v, num_V_ft, q_linear_v,
        use_ln=False, is_train=self.is_train)
    self.output['att_score'] = att_score
    self.mid_result['att_score'] = att_score
    pooled_V_ft = modules.attention_pooling(V_ft, att_score)

    """ Answer classification """
    # perform two layer feature encoding and predict output
    with tf.variable_scope('reasoning') as scope:
        log.warning(scope.name)
        # [bs, L_DIM]
        log.warning('pooled_linear_l')
        pooled_linear_l = modules.fc_layer(
            pooled_V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.relu, is_training=self.is_train,
            scope='pooled_linear_l')

        log.warning('q_linear_l')
        q_linear_l = modules.fc_layer(
            q_L_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.relu, is_training=self.is_train,
            scope='q_linear_l')

        joint = modules.fc_layer(
            pooled_linear_l * q_linear_l, 2048,
            use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.relu, is_training=self.is_train,
            scope='joint_fc')
        joint = tf.nn.dropout(joint, 0.5)

        joint2 = modules.fc_layer(
            joint, 300, use_bias=True, use_bn=False, use_ln=False,
            activation_fn=None, is_training=self.is_train,
            scope='classifier')

        # Score answers against fixed GloVe answer embeddings.
        output_glove = modules.LearnGloVe(self.answer_dict, learnable=False,
                                          oov_mean_initialize=True)
        logit = tf.matmul(joint2, output_glove)
        self.output['logit'] = logit

    """ Compute loss and accuracy """
    with tf.name_scope('loss'):
        answer_target = self.batch['answer_target']
        loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=answer_target, logits=logit)

        train_loss = tf.reduce_mean(
            tf.reduce_sum(loss * self.train_answer_mask, axis=-1))
        report_loss = tf.reduce_mean(tf.reduce_sum(loss, axis=-1))

        pred = tf.cast(tf.argmax(logit, axis=-1), dtype=tf.int32)
        one_hot_pred = tf.one_hot(pred, depth=self.num_answer,
                                  dtype=tf.float32)
        self.output['pred'] = pred

        all_score = tf.reduce_sum(one_hot_pred * answer_target, axis=-1)
        max_train_score = tf.reduce_max(
            answer_target * self.train_answer_mask, axis=-1)
        test_obj_score = tf.reduce_sum(
            one_hot_pred * answer_target * self.test_answer_mask *
            self.obj_answer_mask, axis=-1)
        test_obj_max_score = tf.reduce_max(
            answer_target * self.test_answer_mask * self.obj_answer_mask,
            axis=-1)
        test_attr_score = tf.reduce_sum(
            one_hot_pred * answer_target * self.test_answer_mask *
            self.attr_answer_mask, axis=-1)
        test_attr_max_score = tf.reduce_max(
            answer_target * self.test_answer_mask * self.attr_answer_mask,
            axis=-1)
        self.output['test_obj_score'] = test_obj_score
        self.output['test_obj_max_score'] = test_obj_max_score
        self.output['test_attr_score'] = test_attr_score
        self.output['test_attr_max_score'] = test_attr_max_score
        self.output['all_score'] = all_score
        self.output['max_train_score'] = max_train_score

        acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target, axis=-1))
        exist_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.answer_exist_mask, axis=-1))
        test_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.test_answer_mask, axis=-1))
        test_obj_acc = tf.reduce_mean(test_obj_score)
        test_attr_acc = tf.reduce_mean(test_attr_score)
        train_exist_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.answer_exist_mask * self.train_answer_mask,
                          axis=-1))
        max_exist_answer_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.answer_exist_mask, axis=-1))
        max_train_exist_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.answer_exist_mask *
                          self.train_answer_mask, axis=-1))
        test_obj_max_acc = tf.reduce_mean(test_obj_max_score)
        test_attr_max_acc = tf.reduce_mean(test_attr_max_score)
        test_max_answer_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.test_answer_mask, axis=-1))
        test_max_exist_answer_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.answer_exist_mask *
                          self.test_answer_mask, axis=-1))

        # Normalize each accuracy by its achievable maximum; tf.where guards
        # against division by zero when the maximum is 0.
        normal_test_obj_acc = tf.where(
            tf.equal(test_obj_max_acc, 0),
            test_obj_max_acc, test_obj_acc / test_obj_max_acc)
        normal_test_attr_acc = tf.where(
            tf.equal(test_attr_max_acc, 0),
            test_attr_max_acc, test_attr_acc / test_attr_max_acc)
        normal_train_exist_acc = tf.where(
            tf.equal(max_train_exist_acc, 0),
            max_train_exist_acc, train_exist_acc / max_train_exist_acc)
        normal_exist_acc = tf.where(
            tf.equal(max_exist_answer_acc, 0),
            max_exist_answer_acc, exist_acc / max_exist_answer_acc)
        normal_test_acc = tf.where(
            tf.equal(test_max_answer_acc, 0),
            test_max_answer_acc, test_acc / test_max_answer_acc)

        self.mid_result['pred'] = pred

        self.losses['answer'] = train_loss
        self.report['answer_train_loss'] = train_loss
        self.report['answer_report_loss'] = report_loss
        self.report['answer_acc'] = acc
        self.report['exist_acc'] = exist_acc
        self.report['test_acc'] = test_acc
        self.report['normal_test_acc'] = normal_test_acc
        self.report['normal_test_object_acc'] = normal_test_obj_acc
        self.report['normal_test_attribute_acc'] = normal_test_attr_acc
        self.report['normal_exist_acc'] = normal_exist_acc
        self.report['normal_train_exist_acc'] = normal_train_exist_acc
        self.report['max_exist_acc'] = max_exist_answer_acc
        self.report['test_max_acc'] = test_max_answer_acc
        self.report['test_max_exist_acc'] = test_max_exist_answer_acc

    """ Prepare image summary """
    self.loss = self.losses['answer']

    # scalar summary
    for key, val in self.report.items():
        tf.summary.scalar('train/{}'.format(key), val,
                          collections=['heavy_train', 'train'])
        tf.summary.scalar('val/{}'.format(key), val,
                          collections=['heavy_val', 'val'])
        tf.summary.scalar('testval/{}'.format(key), val,
                          collections=['heavy_testval', 'testval'])

    # image summary
    for key, val in self.vis_image.items():
        tf.summary.image('train-{}'.format(key), val, max_outputs=10,
                         collections=['heavy_train'])
        tf.summary.image('val-{}'.format(key), val, max_outputs=10,
                         collections=['heavy_val'])
        tf.summary.image('testval-{}'.format(key), val, max_outputs=10,
                         collections=['heavy_testval'])

    return self.loss
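# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the model): the second build() scores
# answers by projecting the joint feature to the word-embedding dimension
# (300) and taking inner products with per-answer GloVe vectors
# (tf.matmul(joint2, output_glove)). `modules.LearnGloVe` internals are not
# shown; the fixed-embedding matmul below is an assumed minimal equivalent.
# ---------------------------------------------------------------------------
import numpy as np

embed_dim, num_answer, bs = 300, 7, 2
answer_glove = np.random.randn(embed_dim, num_answer)  # fixed (learnable=False)
joint2 = np.random.randn(bs, embed_dim)                # output of 'classifier' fc

logit = joint2 @ answer_glove                          # [bs, num_answer]
pred = logit.argmax(axis=-1)                           # same rule as tf.argmax above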