Ejemplo n.º 1
0
    def __init__(self, imdb, data_params):
        """Set up vocabularies, label flags and optional spatial features.

        Args:
            imdb: Indexable collection of examples. Only the first entry is
                inspected here: the presence of its 'answer' and 'bbox' keys
                decides which labels are loaded, and its 'imageId' is used to
                peek the spatial feature shape.
            data_params: Mapping of data options. Required keys:
                'vocab_question_file', 'vocab_answer_file', 'T_encoder' and
                'load_spatial_feature'; 'spatial_feature_dir' when spatial
                features are enabled; 'img_H' and 'img_W' when the imdb
                carries bounding boxes. Optional keys: 'add_pos_enc'
                (default False), 'pos_enc_dim' (default 0) and
                'pos_enc_scale' (default 1.).

        Raises:
            AssertionError: If 'pos_enc_dim' is not divisible by 4.
            ValueError: If the imdb has bounding boxes but the spatial
                feature map size is unknown (spatial features disabled), so
                the bbox strides cannot be computed.
        """
        self.imdb = imdb
        self.data_params = data_params

        self.vocab_dict = text_processing.VocabDict(
            data_params['vocab_question_file'])
        self.T_encoder = data_params['T_encoder']

        # peek one example to see whether answer is in the data
        self.load_answer = ('answer' in self.imdb[0])
        # peek one example to see whether bbox is in the data
        self.load_bbox = ('bbox' in self.imdb[0])
        # the answer dict is always loaded, regardless of self.load_answer
        self.answer_dict = text_processing.VocabDict(
            data_params['vocab_answer_file'])
        if not (self.load_answer or self.load_bbox):
            print('imdb has no answer labels or bbox. Using dummy labels.\n\n'
                  '**The final accuracy will be zero (no labels provided)**\n')

        # positional encoding
        self.add_pos_enc = data_params.get('add_pos_enc', False)
        self.pos_enc_dim = data_params.get('pos_enc_dim', 0)
        assert self.pos_enc_dim % 4 == 0, \
            'positional encoding dim must be a multiply of 4'
        self.pos_enc_scale = data_params.get('pos_enc_scale', 1.)

        self.load_spatial_feature = data_params['load_spatial_feature']
        if self.load_spatial_feature:
            spatial_feature_dir = data_params['spatial_feature_dir']
            self.spatial_loader = SpatialFeatureLoader(spatial_feature_dir)
            # load one feature map to peek its size
            x = self.spatial_loader.load_feature(self.imdb[0]['imageId'])
            self.spatial_D, self.spatial_H, self.spatial_W = x.shape
            # positional encoding
            self.pos_enc = self.pos_enc_scale * get_positional_encoding(
                self.spatial_H, self.spatial_W, self.pos_enc_dim)

        if self.load_bbox:
            self.img_H = data_params['img_H']
            self.img_W = data_params['img_W']
            # The strides below divide the image size by the feature map
            # size, which is only known once a spatial feature has been
            # peeked. Fail with a clear message instead of an obscure
            # AttributeError on self.spatial_H.
            if not (hasattr(self, 'spatial_H') and hasattr(self, 'spatial_W')):
                raise ValueError(
                    'imdb contains bbox annotations, but the spatial feature '
                    'map size is unknown; enable load_spatial_feature so the '
                    'bbox strides can be computed')
            # image pixels covered by one feature map cell along each axis
            self.stride_H = self.img_H * 1. / self.spatial_H
            self.stride_W = self.img_W * 1. / self.spatial_W
Ejemplo n.º 2
0
    def __init__(self, imdb, data_params):
        """Prepare vocabularies, encoder settings and feature loaders.

        The scene graph loader is constructed for every feature type;
        ``load_scene_graph_feature`` ends up True only when
        ``data_params['feature_type']`` is 'scene_graph'.

        Args:
            imdb: Indexable collection of examples. The first entry is used
                to detect whether answers are present and to peek feature
                shapes via its 'imageId'.
            data_params: Mapping of data options. Reads the vocabulary file
                paths, 'T_encoder', 'N_encoder', 'O_encoder', 'feature_type',
                the scene graph files, the feature directory matching the
                chosen feature type, and the optional positional encoding
                and 'objects_max_num' settings.

        Raises:
            AssertionError: If 'pos_enc_dim' is not divisible by 4.
            ValueError: If 'feature_type' is not 'spatial', 'objects' or
                'scene_graph'.
        """
        self.imdb = imdb
        self.data_params = data_params

        question_vocab_path = data_params['vocab_question_file']
        self.vocab_dict = text_processing.VocabDict(question_vocab_path)
        self.T_encoder = data_params['T_encoder']
        self.N_encoder = data_params['N_encoder']
        self.O_encoder = data_params['O_encoder']

        # Answers are optional; the first example tells whether they exist.
        self.load_answer = 'answer' in self.imdb[0]
        # The answer vocabulary is built whether or not answers are present.
        answer_vocab_path = data_params['vocab_answer_file']
        self.answer_dict = text_processing.VocabDict(answer_vocab_path)
        if not self.load_answer:
            print('imdb has no answer labels. Using dummy labels.\n\n'
                  '**The final accuracy will be zero (no labels provided)**\n')

        # NOTE: spaCy model loading is currently disabled:
        #   self.nlp = spacy.load('en_core_web_lg')

        # Positional encoding settings (all optional).
        self.add_pos_enc = data_params.get('add_pos_enc', False)
        self.pos_enc_dim = data_params.get('pos_enc_dim', 0)
        assert self.pos_enc_dim % 4 == 0, (
            'positional encoding dim must be a multiply of 4')
        self.pos_enc_scale = data_params.get('pos_enc_scale', 1.)

        # Exactly one feature flag is True once the type has been validated.
        feature_type = data_params['feature_type']
        if feature_type not in ('spatial', 'objects', 'scene_graph'):
            raise ValueError('Unknown feature type: %s' % feature_type)
        self.load_spatial_feature = feature_type == 'spatial'
        self.load_objects_feature = feature_type == 'objects'
        self.load_scene_graph_feature = feature_type == 'scene_graph'

        if self.load_spatial_feature:
            self.spatial_loader = SpatialFeatureLoader(
                data_params['spatial_feature_dir'])
            # Peek one feature map to learn its dimensions.
            peek_id = self.imdb[0]['imageId']
            spatial_feat = self.spatial_loader.load_feature(peek_id)
            self.spatial_D, self.spatial_H, self.spatial_W = \
                spatial_feat.shape
            # Scaled positional encoding over the feature grid.
            grid_enc = get_positional_encoding(
                self.spatial_H, self.spatial_W, self.pos_enc_dim)
            self.pos_enc = self.pos_enc_scale * grid_enc

        if self.load_objects_feature:
            self.objects_loader = ObjectsFeatureLoader(
                data_params['objects_feature_dir'])
            self.objects_M = data_params.get('objects_max_num', 100)
            # Peek one feature to learn the per-object feature dimension.
            peek_id = self.imdb[0]['imageId']
            objects_feat, _ = self.objects_loader.load_feature(peek_id)
            _, self.objects_D = objects_feat.shape

        # The scene graph loader is built regardless of the feature type.
        sg_file = data_params['scene_graph_file']
        name_vocab_file = data_params['vocab_name_file']
        attr_vocab_file = data_params['vocab_attr_file']
        self.objects_M = data_params.get('objects_max_num', 100)
        self.scene_graph_loader = SceneGraphFeatureLoader(
            sg_file, name_vocab_file, attr_vocab_file,
            max_num=self.objects_M)
        if self.load_scene_graph_feature:
            # Peek one feature to learn the per-object feature dimension.
            peek_id = self.imdb[0]['imageId']
            sg_feat, _, _ = \
                self.scene_graph_loader.load_feature_normalized_bbox(peek_id)
            _, self.objects_D = sg_feat.shape

        # Bookkeeping counters; they are only initialised here.
        self.se_max_len = -1
        self.se_zero_len = 0
        self.se_count = Counter()
        self.stop_words = ['of', 'the', 'to', 'on', 'in', 'at', 'a', 'and']
    def __init__(self, imdb, data_params):
        """Prepare vocabularies and the loader for the chosen feature type.

        Args:
            imdb: Indexable collection of examples. The first entry is used
                to detect whether answers are present and to peek feature
                shapes via its 'imageId'.
            data_params: Mapping of data options. Reads
                'vocab_question_file', 'vocab_answer_file', 'T_encoder' and
                'feature_type', plus the keys specific to the selected
                feature type (feature directory or scene graph files,
                'spatial_pos_enc_dim', 'objects_max_num', 'bbox_tile_num').

        Raises:
            ValueError: If 'feature_type' is not 'spatial', 'objects' or
                'scene_graph'.
        """
        self.imdb = imdb
        self.data_params = data_params

        question_vocab_path = data_params['vocab_question_file']
        self.vocab_dict = text_processing.VocabDict(question_vocab_path)
        self.T_encoder = data_params['T_encoder']

        # Answers are optional; the first example tells whether they exist.
        self.load_answer = 'answer' in self.imdb[0]
        # The answer vocabulary is built whether or not answers are present.
        answer_vocab_path = data_params['vocab_answer_file']
        self.answer_dict = text_processing.VocabDict(answer_vocab_path)
        if not self.load_answer:
            print('imdb does not contain answers')

        # Exactly one feature flag is True once the type has been validated.
        feature_type = data_params['feature_type']
        if feature_type not in ('spatial', 'objects', 'scene_graph'):
            raise ValueError('Unknown feature type: %s' % feature_type)
        self.load_spatial_feature = feature_type == 'spatial'
        self.load_objects_feature = feature_type == 'objects'
        self.load_scene_graph_feature = feature_type == 'scene_graph'

        if self.load_spatial_feature:
            self.spatial_loader = SpatialFeatureLoader(
                data_params['spatial_feature_dir'])
            # Peek one feature map to learn its dimensions.
            peek_id = self.imdb[0]['imageId']
            spatial_feat = self.spatial_loader.load_feature(peek_id)
            self.spatial_D, self.spatial_H, self.spatial_W = \
                spatial_feat.shape
            # Positional encoding over the feature grid.
            self.spatial_pos_enc_dim = data_params['spatial_pos_enc_dim']
            self.pos_enc = get_positional_encoding(
                self.spatial_H, self.spatial_W, self.spatial_pos_enc_dim)

        if self.load_objects_feature:
            feature_dir = data_params['objects_feature_dir']
            self.objects_M = data_params['objects_max_num']
            self.objects_loader = ObjectsFeatureLoader(feature_dir)
            # Peek one feature to learn the per-object feature dimension.
            peek_id = self.imdb[0]['imageId']
            objects_feat, _ = self.objects_loader.load_feature(peek_id)
            _, self.objects_D = objects_feat.shape
            self.bbox_tile_num = data_params['bbox_tile_num']

        if self.load_scene_graph_feature:
            sg_file = data_params['scene_graph_file']
            name_vocab_file = data_params['vocab_name_file']
            attr_vocab_file = data_params['vocab_attr_file']
            self.objects_M = data_params['objects_max_num']
            self.scene_graph_loader = SceneGraphFeatureLoader(
                sg_file, name_vocab_file, attr_vocab_file,
                max_num=self.objects_M)
            # Peek one feature to learn the per-object feature dimension.
            peek_id = self.imdb[0]['imageId']
            sg_feat, _, _ = \
                self.scene_graph_loader.load_feature_normalized_bbox(peek_id)
            _, self.objects_D = sg_feat.shape
            self.bbox_tile_num = data_params['bbox_tile_num']