Example #1
0
    def __init__(self, batch, config, is_train=True):
        """Store the configuration, load dictionaries and build the model.

        Args:
            batch: Input batch handed to the model; stored as ``self.batch``.
            config: Configuration object. ``data_cfg``, ``data_dir`` and
                ``expand_depth`` are read from it here.
            is_train: Flag stored as ``self.is_train``.
        """
        self.batch = batch
        self.config = config
        self.data_cfg = config.data_cfg
        self.data_dir = config.data_dir
        self.is_train = is_train

        # Containers that build() is expected to fill in.
        self.losses = {}
        self.report = {}
        self.mid_result = {}
        self.vis_image = {}

        # NOTE: pickle files are executed on load; only point data_dir at
        # trusted data. Files are opened with `with` so that the handles are
        # closed deterministically instead of being leaked.
        vocab_path = os.path.join(self.data_dir, 'vocab.pkl')
        with open(vocab_path, 'rb') as f:
            self.vocab = cPickle.load(f)

        answer_dict_path = os.path.join(self.data_dir, 'answer_dict.pkl')
        with open(answer_dict_path, 'rb') as f:
            self.answer_dict = cPickle.load(f)
        self.num_answer = len(self.answer_dict['vocab'])

        # The word-set dictionary file name depends on the expansion depth.
        ws_dict_path = os.path.join(
            self.data_dir,
            'wordset_dict5_depth{}.pkl'.format(int(config.expand_depth)))
        with open(ws_dict_path, 'rb') as f:
            self.ws_dict = cPickle.load(f)
        self.num_ws = len(self.ws_dict['vocab'])

        # Embedding maps created by the project's `modules` helpers.
        self.wordset_map = modules.learn_embedding_map(self.ws_dict,
                                                       scope='wordset_map')
        self.v_word_map = modules.LearnGloVe(self.vocab, scope='V_GloVe')
        self.l_word_map = modules.LearnGloVe(self.vocab, scope='L_GloVe')
        self.l_answer_word_map = modules.LearnAnswerGloVe(self.answer_dict)

        self.build()
Example #2
0
    def __init__(self, batch, config, is_train=True, image_features=None):
        """Load vocabularies, answer masks and image features, then build.

        Args:
            batch: Input batch tensors; stored as ``self.batch``.
            config: Configuration object. ``image_dir``, ``vocab_path``,
                ``tf_record_dir``, ``debug`` and (when features are read from
                disk) ``vfeat_path`` are read here;
                ``pretrain_word_weight_dir`` is optional.
            is_train: Flag stored as ``self.is_train``.
            image_features: Optional dict of preloaded features with the keys
                ``features``, ``spatials``, ``normal_boxes``, ``num_boxes``,
                ``max_box_num`` and ``vfeat_dim``. When ``None`` (and
                ``config.debug`` is false) the features are read from the
                HDF5 file at ``config.vfeat_path``.
        """
        self.batch = batch
        self.config = config
        self.image_dir = config.image_dir
        self.is_train = is_train

        self.word_weight_dir = getattr(config, 'pretrain_word_weight_dir',
                                       None)
        if self.word_weight_dir is None:
            log.warn('word_weight_dir is None')

        # Containers filled in by build().
        self.losses = {}
        self.report = {}
        self.mid_result = {}
        self.output = {}
        self.heavy_output = {}
        self.vis_image = {}

        # NOTE: pickle files are executed on load; only use trusted data.
        # `with` guarantees the file handles are closed (the previous
        # `cPickle.load(open(...))` form leaked them).
        with open(config.vocab_path, 'rb') as f:
            self.vocab = cPickle.load(f)
        answer_dict_path = os.path.join(config.tf_record_dir,
                                        'answer_dict.pkl')
        with open(answer_dict_path, 'rb') as f:
            self.answer_dict = cPickle.load(f)
        self.num_answer = len(self.answer_dict['vocab'])
        self.num_train_answer = self.answer_dict['num_train_answer']

        # [1, num_answer] mask: 1.0 for the first `num_train_answer` answer
        # indices, 0.0 for the rest. The test mask is its complement.
        self.train_answer_mask = tf.expand_dims(
            tf.sequence_mask(self.num_train_answer,
                             maxlen=self.num_answer,
                             dtype=tf.float32),
            axis=0)
        self.test_answer_mask = 1.0 - self.train_answer_mask
        # [1, N] masks built from the per-answer flags in the answer dict.
        self.obj_answer_mask = tf.expand_dims(
            tf.constant(self.answer_dict['is_object'], dtype=tf.float32),
            axis=0)
        self.attr_answer_mask = tf.expand_dims(
            tf.constant(self.answer_dict['is_attribute'], dtype=tf.float32),
            axis=0)

        self.glove_map = modules.LearnGloVe(self.vocab)
        self.answer_exist_mask = modules.AnswerExistMask(
            self.answer_dict, self.word_weight_dir)
        self.answer_non_exist_mask = 1.0 - self.answer_exist_mask

        if self.config.debug:
            # Debug mode: skip the feature file and use dummy data.
            self.features, self.spatials, self.normal_boxes, self.num_boxes, \
                self.max_box_num, self.vfeat_dim = get_dummy_data()
        elif image_features is None:
            log.infov('loading image features...')
            with h5py.File(config.vfeat_path, 'r') as f:
                # np.array(...) copies each dataset into memory so the data
                # stays usable after the file is closed.
                self.features = np.array(f.get('image_features'))
                log.infov('feature done')
                self.spatials = np.array(f.get('spatial_features'))
                log.infov('spatials done')
                self.normal_boxes = np.array(f.get('normal_boxes'))
                log.infov('normal_boxes done')
                self.num_boxes = np.array(f.get('num_boxes'))
                log.infov('num_boxes done')
                # `Dataset.value` was deprecated and then removed in
                # h5py 3.0; `dataset[()]` reads the same data on both old
                # and new h5py versions.
                data_info = f['data_info']
                self.max_box_num = int(data_info['max_box_num'][()])
                self.vfeat_dim = int(data_info['vfeat_dim'][()])
            log.infov('done')
        else:
            # Reuse features that the caller has already loaded.
            self.features = image_features['features']
            self.spatials = image_features['spatials']
            self.normal_boxes = image_features['normal_boxes']
            self.num_boxes = image_features['num_boxes']
            self.max_box_num = image_features['max_box_num']
            self.vfeat_dim = image_features['vfeat_dim']

        self.build()
# Derive the output directory from the checkpoint name and refuse to reuse an
# existing one, so previously exported word weights are never overwritten.
ckpt_name = os.path.basename(config.checkpoint)
config.save_dir = os.path.join(ckpt_dir, 'word_weights_{}'.format(ckpt_name))
if os.path.exists(config.save_dir):
    raise ValueError('Do not overwrite: {}'.format(config.save_dir))
log.warn('create directory: {}'.format(config.save_dir))
os.makedirs(config.save_dir)

# NOTE: pickle files are executed on load; only use trusted data. The files
# are opened with `with` so the handles are closed instead of leaked.
vocab_path = os.path.join(config.data_dir, 'vocab.pkl')
with open(vocab_path, 'rb') as f:
    vocab = cPickle.load(f)

answer_dict_path = os.path.join(config.data_dir, 'answer_dict.pkl')
with open(answer_dict_path, 'rb') as f:
    answer_dict = cPickle.load(f)
num_answer = len(answer_dict['vocab'])

# Word-embedding maps created under the given variable scopes.
v_word_map = modules.LearnGloVe(vocab, scope='V_GloVe')
l_word_map = modules.LearnGloVe(vocab, scope='L_GloVe')
l_answer_word_map = modules.LearnAnswerGloVe(answer_dict)

# Classifier parameters; AUTO_REUSE lets this work whether or not the
# variables already exist in the graph.
with tf.variable_scope('classifier/fc', reuse=tf.AUTO_REUSE):
    # (float32_ref 2048x4000) [8192000, bytes: 32768000]
    class_weights = tf.get_variable('weights',
                                    shape=[config.class_feat_dim, num_answer])
    # (float32_ref 4000) [4000, bytes: 16000]
    class_biases = tf.get_variable('biases', shape=[num_answer])

# Single-GPU session that allocates GPU memory on demand.
session_config = tf.ConfigProto(allow_soft_placement=True,
                                gpu_options=tf.GPUOptions(allow_growth=True),
                                device_count={'GPU': 1})
sess = tf.Session(config=session_config)
    def build(self):
        """Build the network architecture, loss, metrics and summaries.

        Results are stored on the instance: ``self.output``,
        ``self.heavy_output`` and ``self.mid_result`` receive intermediate
        tensors, ``self.losses['answer']`` receives the training loss and
        ``self.report`` receives the scalar metrics that are also exported
        as summaries.

        Returns:
            The training loss tensor (``self.losses['answer']``).
        """
        # ------------------------------------------------------------------
        # Visual features
        # ------------------------------------------------------------------
        with tf.device('/cpu:0'):

            def load_feature(image_idx):
                # Runs in Python (via py_func): pick the rows of the
                # in-memory feature array for the images in this batch.
                selected_features = np.take(self.features, image_idx, axis=0)
                return selected_features

            V_ft = tf.py_func(load_feature,
                              inp=[self.batch['image_idx']],
                              Tout=tf.float32,
                              name='sample_features')
            # py_func loses static shape information; restore it.
            V_ft.set_shape([None, self.max_box_num, self.vfeat_dim])
            num_V_ft = tf.gather(self.num_boxes,
                                 self.batch['image_idx'],
                                 name='gather_num_V_ft',
                                 axis=0)
            self.mid_result['num_V_ft'] = num_V_ft
            normal_boxes = tf.gather(self.normal_boxes,
                                     self.batch['image_idx'],
                                     name='gather_normal_boxes',
                                     axis=0)
            self.mid_result['normal_boxes'] = normal_boxes

        log.warning('v_linear_v')
        v_linear_v = modules.fc_layer(V_ft,
                                      V_DIM,
                                      use_bias=True,
                                      use_bn=False,
                                      use_ln=True,
                                      activation_fn=tf.nn.relu,
                                      is_training=self.is_train,
                                      scope='v_linear_v')

        # ------------------------------------------------------------------
        # Encode question
        # ------------------------------------------------------------------
        q_embed = tf.nn.embedding_lookup(self.glove_map,
                                         self.batch['q_intseq'])
        # [bs, L_DIM]
        q_L_ft = modules.encode_L(q_embed,
                                  self.batch['q_intseq_len'],
                                  L_DIM,
                                  cell_type='GRU')
        self.heavy_output['condition'] = q_L_ft

        # [bs, V_DIM]
        log.warning('q_linear_v')
        q_linear_v = modules.fc_layer(q_L_ft,
                                      V_DIM,
                                      use_bias=True,
                                      use_bn=False,
                                      use_ln=True,
                                      activation_fn=tf.nn.relu,
                                      is_training=self.is_train,
                                      scope='q_linear_v')

        # ------------------------------------------------------------------
        # Perform attention
        # ------------------------------------------------------------------
        att_score = modules.hadamard_attention(v_linear_v,
                                               num_V_ft,
                                               q_linear_v,
                                               use_ln=False,
                                               is_train=self.is_train)
        self.output['att_score'] = att_score
        self.mid_result['att_score'] = att_score
        # Pool the raw visual features (not v_linear_v) with the attention.
        pooled_V_ft = modules.attention_pooling(V_ft, att_score)

        # ------------------------------------------------------------------
        # Answer classification
        # ------------------------------------------------------------------
        # perform two layer feature encoding and predict output
        with tf.variable_scope('reasoning') as scope:
            log.warning(scope.name)
            # [bs, L_DIM]
            log.warning('pooled_linear_l')
            pooled_linear_l = modules.fc_layer(pooled_V_ft,
                                               L_DIM,
                                               use_bias=True,
                                               use_bn=False,
                                               use_ln=True,
                                               activation_fn=tf.nn.relu,
                                               is_training=self.is_train,
                                               scope='pooled_linear_l')

            log.warning('q_linear_l')
            q_linear_l = modules.fc_layer(q_L_ft,
                                          L_DIM,
                                          use_bias=True,
                                          use_bn=False,
                                          use_ln=True,
                                          activation_fn=tf.nn.relu,
                                          is_training=self.is_train,
                                          scope='q_linear_l')

            # Fuse the two modalities by element-wise product.
            joint = modules.fc_layer(pooled_linear_l * q_linear_l,
                                     2048,
                                     use_bias=True,
                                     use_bn=False,
                                     use_ln=True,
                                     activation_fn=tf.nn.relu,
                                     is_training=self.is_train,
                                     scope='joint_fc')
            # Apply dropout only while training. The previous
            # `tf.nn.dropout(joint, 0.5)` ran unconditionally, so it also
            # randomly zeroed activations when is_train was False.
            # rate=0.5 is equivalent to the former keep_prob=0.5.
            joint = tf.layers.dropout(joint,
                                      rate=0.5,
                                      training=self.is_train)

            # Linear projection to 300 dimensions (no activation).
            joint2 = modules.fc_layer(joint,
                                      300,
                                      use_bias=True,
                                      use_bn=False,
                                      use_ln=False,
                                      activation_fn=None,
                                      is_training=self.is_train,
                                      scope='classifier')

            # Fixed (non-learnable) answer embeddings used as the output
            # layer; presumably shaped [300, num_answer] so that the matmul
            # below yields one logit per answer -- TODO confirm in modules.
            output_glove = modules.LearnGloVe(self.answer_dict,
                                              learnable=False,
                                              oov_mean_initialize=True)
            logit = tf.matmul(joint2, output_glove)

        self.output['logit'] = logit

        # ------------------------------------------------------------------
        # Compute loss and accuracy
        # ------------------------------------------------------------------
        with tf.name_scope('loss'):
            answer_target = self.batch['answer_target']
            loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=answer_target, logits=logit)
            # The optimised loss only counts answers selected by the train
            # mask; the reported loss counts every answer.
            train_loss = tf.reduce_mean(
                tf.reduce_sum(loss * self.train_answer_mask, axis=-1))
            report_loss = tf.reduce_mean(tf.reduce_sum(loss, axis=-1))

            pred = tf.cast(tf.argmax(logit, axis=-1), dtype=tf.int32)
            one_hot_pred = tf.one_hot(pred,
                                      depth=self.num_answer,
                                      dtype=tf.float32)
            self.output['pred'] = pred

            # Per-example scores: target value at the predicted answer, and
            # the best achievable target value, under various answer masks.
            all_score = tf.reduce_sum(one_hot_pred * answer_target, axis=-1)
            max_train_score = tf.reduce_max(answer_target *
                                            self.train_answer_mask,
                                            axis=-1)
            test_obj_score = tf.reduce_sum(one_hot_pred * answer_target *
                                           self.test_answer_mask *
                                           self.obj_answer_mask,
                                           axis=-1)
            test_obj_max_score = tf.reduce_max(
                answer_target * self.test_answer_mask * self.obj_answer_mask,
                axis=-1)
            test_attr_score = tf.reduce_sum(one_hot_pred * answer_target *
                                            self.test_answer_mask *
                                            self.attr_answer_mask,
                                            axis=-1)
            test_attr_max_score = tf.reduce_max(
                answer_target * self.test_answer_mask * self.attr_answer_mask,
                axis=-1)
            self.output['test_obj_score'] = test_obj_score
            self.output['test_obj_max_score'] = test_obj_max_score
            self.output['test_attr_score'] = test_attr_score
            self.output['test_attr_max_score'] = test_attr_max_score
            self.output['all_score'] = all_score
            self.output['max_train_score'] = max_train_score

            # Batch-averaged accuracies.
            acc = tf.reduce_mean(
                tf.reduce_sum(one_hot_pred * answer_target, axis=-1))
            exist_acc = tf.reduce_mean(
                tf.reduce_sum(one_hot_pred * answer_target *
                              self.answer_exist_mask,
                              axis=-1))
            test_acc = tf.reduce_mean(
                tf.reduce_sum(one_hot_pred * answer_target *
                              self.test_answer_mask,
                              axis=-1))
            test_obj_acc = tf.reduce_mean(test_obj_score)
            test_attr_acc = tf.reduce_mean(test_attr_score)
            train_exist_acc = tf.reduce_mean(
                tf.reduce_sum(one_hot_pred * answer_target *
                              self.answer_exist_mask * self.train_answer_mask,
                              axis=-1))

            # Batch-averaged upper bounds (best achievable score).
            max_exist_answer_acc = tf.reduce_mean(
                tf.reduce_max(answer_target * self.answer_exist_mask, axis=-1))
            max_train_exist_acc = tf.reduce_mean(
                tf.reduce_max(answer_target * self.answer_exist_mask *
                              self.train_answer_mask,
                              axis=-1))
            test_obj_max_acc = tf.reduce_mean(test_obj_max_score)
            test_attr_max_acc = tf.reduce_mean(test_attr_max_score)
            test_max_answer_acc = tf.reduce_mean(
                tf.reduce_max(answer_target * self.test_answer_mask, axis=-1))
            test_max_exist_answer_acc = tf.reduce_mean(
                tf.reduce_max(answer_target * self.answer_exist_mask *
                              self.test_answer_mask,
                              axis=-1))

            # Accuracies normalised by their upper bound. When the bound is
            # zero the bound itself (0) is reported instead of dividing.
            normal_test_obj_acc = tf.where(tf.equal(test_obj_max_acc, 0),
                                           test_obj_max_acc,
                                           test_obj_acc / test_obj_max_acc)
            normal_test_attr_acc = tf.where(tf.equal(test_attr_max_acc, 0),
                                            test_attr_max_acc,
                                            test_attr_acc / test_attr_max_acc)
            normal_train_exist_acc = tf.where(
                tf.equal(max_train_exist_acc, 0), max_train_exist_acc,
                train_exist_acc / max_train_exist_acc)
            normal_exist_acc = tf.where(tf.equal(max_exist_answer_acc, 0),
                                        max_exist_answer_acc,
                                        exist_acc / max_exist_answer_acc)
            normal_test_acc = tf.where(tf.equal(test_max_answer_acc, 0),
                                       test_max_answer_acc,
                                       test_acc / test_max_answer_acc)

            self.mid_result['pred'] = pred

            self.losses['answer'] = train_loss
            self.report['answer_train_loss'] = train_loss
            self.report['answer_report_loss'] = report_loss
            self.report['answer_acc'] = acc
            self.report['exist_acc'] = exist_acc
            self.report['test_acc'] = test_acc
            self.report['normal_test_acc'] = normal_test_acc
            self.report['normal_test_object_acc'] = normal_test_obj_acc
            self.report['normal_test_attribute_acc'] = normal_test_attr_acc
            self.report['normal_exist_acc'] = normal_exist_acc
            self.report['normal_train_exist_acc'] = normal_train_exist_acc
            self.report['max_exist_acc'] = max_exist_answer_acc
            self.report['test_max_acc'] = test_max_answer_acc
            self.report['test_max_exist_acc'] = test_max_exist_answer_acc

        # ------------------------------------------------------------------
        # Total loss and summaries
        # ------------------------------------------------------------------
        self.loss = self.losses['answer']

        # scalar summary: every reported metric is registered once per split
        # so each split can be fetched through its own collection.
        for key, val in self.report.items():
            tf.summary.scalar('train/{}'.format(key),
                              val,
                              collections=['heavy_train', 'train'])
            tf.summary.scalar('val/{}'.format(key),
                              val,
                              collections=['heavy_val', 'val'])
            tf.summary.scalar('testval/{}'.format(key),
                              val,
                              collections=['heavy_testval', 'testval'])

        # image summary (only in the "heavy" collections)
        for key, val in self.vis_image.items():
            tf.summary.image('train-{}'.format(key),
                             val,
                             max_outputs=10,
                             collections=['heavy_train'])
            tf.summary.image('val-{}'.format(key),
                             val,
                             max_outputs=10,
                             collections=['heavy_val'])
            tf.summary.image('testval-{}'.format(key),
                             val,
                             max_outputs=10,
                             collections=['heavy_testval'])

        return self.loss