def __init__(self, imdb, data_params):
    """Build a data reader over `imdb` serving spatial features and bboxes.

    Args:
        imdb: sequence of example dicts; each example has at least
            'imageId', and optionally 'answer' and/or 'bbox' (presence is
            peeked from imdb[0] and assumed uniform across the set).
        data_params: configuration dict — vocab file paths, 'T_encoder',
            positional-encoding options, 'load_spatial_feature' and the
            related feature/bbox settings.
    """
    self.imdb = imdb
    self.data_params = data_params
    self.vocab_dict = text_processing.VocabDict(
        data_params['vocab_question_file'])
    self.T_encoder = data_params['T_encoder']
    # peek one example to see whether answer is in the data
    self.load_answer = ('answer' in self.imdb[0])
    # peek one example to see whether bbox is in the data
    self.load_bbox = ('bbox' in self.imdb[0])
    # the answer dict is always loaded, regardless of self.load_answer
    self.answer_dict = text_processing.VocabDict(
        data_params['vocab_answer_file'])
    if not (self.load_answer or self.load_bbox):
        print('imdb has no answer labels or bbox. Using dummy labels.\n\n'
              '**The final accuracy will be zero (no labels provided)**\n')

    # positional encoding options (optional; disabled by default)
    self.add_pos_enc = data_params.get('add_pos_enc', False)
    self.pos_enc_dim = data_params.get('pos_enc_dim', 0)
    # fixed wording of the failure message ("multiply" -> "multiple")
    assert self.pos_enc_dim % 4 == 0, \
        'positional encoding dim must be a multiple of 4'
    self.pos_enc_scale = data_params.get('pos_enc_scale', 1.)

    self.load_spatial_feature = data_params['load_spatial_feature']
    if self.load_spatial_feature:
        spatial_feature_dir = data_params['spatial_feature_dir']
        self.spatial_loader = SpatialFeatureLoader(spatial_feature_dir)
        # load one feature map to peek its size
        x = self.spatial_loader.load_feature(self.imdb[0]['imageId'])
        self.spatial_D, self.spatial_H, self.spatial_W = x.shape
        # precomputed (scaled) positional encoding for the feature grid
        self.pos_enc = self.pos_enc_scale * get_positional_encoding(
            self.spatial_H, self.spatial_W, self.pos_enc_dim)

    if self.load_bbox:
        # NOTE(review): the strides below read self.spatial_H/W, which are
        # only set when load_spatial_feature is enabled — presumably bbox
        # supervision is never used without spatial features; confirm with
        # callers/config.
        self.img_H = data_params['img_H']
        self.img_W = data_params['img_W']
        # stride of one feature-map cell in image pixels
        self.stride_H = self.img_H * 1. / self.spatial_H
        self.stride_W = self.img_W * 1. / self.spatial_W
def __init__(self, imdb, data_params):
    """Build a data reader serving one of spatial / objects / scene-graph features.

    Args:
        imdb: sequence of example dicts; each has at least 'imageId' and
            optionally 'answer' (presence peeked from imdb[0]).
        data_params: configuration dict — vocab file paths, encoder
            lengths ('T_encoder', 'N_encoder', 'O_encoder'),
            positional-encoding options, 'feature_type' and the
            per-feature-type settings.
    """
    self.imdb = imdb
    self.data_params = data_params
    self.vocab_dict = text_processing.VocabDict(
        data_params['vocab_question_file'])
    self.T_encoder = data_params['T_encoder']
    self.N_encoder = data_params['N_encoder']
    self.O_encoder = data_params['O_encoder']
    # peek one example to see whether answer is in the data
    self.load_answer = ('answer' in self.imdb[0])
    # the answer dict is always loaded, regardless of self.load_answer
    self.answer_dict = text_processing.VocabDict(
        data_params['vocab_answer_file'])
    if not self.load_answer:
        print('imdb has no answer labels. Using dummy labels.\n\n'
              '**The final accuracy will be zero (no labels provided)**\n')
    #self.nlp = spacy.load('en_core_web_lg')
    # positional encoding options (optional; disabled by default)
    self.add_pos_enc = data_params.get('add_pos_enc', False)
    self.pos_enc_dim = data_params.get('pos_enc_dim', 0)
    assert self.pos_enc_dim % 4 == 0, \
        'positional encoding dim must be a multiply of 4'
    self.pos_enc_scale = data_params.get('pos_enc_scale', 1.)
    # NOTE(review): load_scene_graph_feature deliberately starts True (not
    # False like the other two flags) so the SceneGraphFeatureLoader below
    # is constructed for EVERY feature type; the flag is reset to False at
    # the end for non-'scene_graph' types. Presumably the scene graph is
    # needed as auxiliary data even when it is not the main feature —
    # confirm against callers before "fixing" this.
    self.load_spatial_feature = False
    self.load_objects_feature = False
    self.load_scene_graph_feature = True
    feature_type = data_params['feature_type']
    if feature_type == 'spatial':
        self.load_spatial_feature = True
    elif feature_type == 'objects':
        self.load_objects_feature = True
    elif feature_type == 'scene_graph':
        self.load_scene_graph_feature = True
    else:
        raise ValueError('Unknown feature type: %s' % feature_type)
    if self.load_spatial_feature:
        spatial_feature_dir = data_params['spatial_feature_dir']
        self.spatial_loader = SpatialFeatureLoader(spatial_feature_dir)
        # load one feature map to peek its size
        x = self.spatial_loader.load_feature(self.imdb[0]['imageId'])
        self.spatial_D, self.spatial_H, self.spatial_W = x.shape
        # positional encoding
        self.pos_enc = self.pos_enc_scale * get_positional_encoding(
            self.spatial_H, self.spatial_W, self.pos_enc_dim)
    if self.load_objects_feature:
        objects_feature_dir = data_params['objects_feature_dir']
        self.objects_loader = ObjectsFeatureLoader(objects_feature_dir)
        # load one feature map to peek its size
        self.objects_M = data_params.get('objects_max_num', 100)
        x, _ = self.objects_loader.load_feature(self.imdb[0]['imageId'])
        _, self.objects_D = x.shape
    if self.load_scene_graph_feature:
        scene_graph_file = data_params['scene_graph_file']
        vocab_name_file = data_params['vocab_name_file']
        vocab_attr_file = data_params['vocab_attr_file']
        self.objects_M = data_params.get('objects_max_num', 100)
        self.scene_graph_loader = SceneGraphFeatureLoader(
            scene_graph_file, vocab_name_file, vocab_attr_file,
            max_num=self.objects_M)
        if feature_type == 'scene_graph':
            # load one feature map to peek its size
            x, _, _ = self.scene_graph_loader.load_feature_normalized_bbox(
                self.imdb[0]['imageId'])
            _, self.objects_D = x.shape
        else:
            # scene graph was loaded only as auxiliary data; turn the
            # main-feature flag back off (see NOTE above)
            self.load_scene_graph_feature = False
    # running statistics — "se" semantics not evident from this file;
    # presumably lengths/counts of some encoded sequence. TODO confirm.
    self.se_max_len = -1
    self.se_zero_len = 0
    self.se_count = Counter()
    # common function words skipped during text processing
    self.stop_words = ['of', 'the', 'to', 'on', 'in', 'at', 'a', 'and']
def __init__(self, imdb, data_params):
    """Build a data reader for exactly one feature type.

    Args:
        imdb: sequence of example dicts; each has at least 'imageId' and
            optionally 'answer' (presence peeked from imdb[0]).
        data_params: configuration dict — vocab file paths, 'T_encoder',
            'feature_type' ('spatial' | 'objects' | 'scene_graph'), plus
            the settings required by the chosen feature type.
    """
    self.imdb = imdb
    self.data_params = data_params
    self.vocab_dict = text_processing.VocabDict(
        data_params['vocab_question_file'])
    self.T_encoder = data_params['T_encoder']
    # an example carries an 'answer' key iff labels are available
    self.load_answer = ('answer' in self.imdb[0])
    # the answer vocabulary is loaded unconditionally
    self.answer_dict = text_processing.VocabDict(
        data_params['vocab_answer_file'])
    if not self.load_answer:
        print('imdb does not contain answers')

    # exactly one of the three feature flags is set
    feature_type = data_params['feature_type']
    self.load_spatial_feature = (feature_type == 'spatial')
    self.load_objects_feature = (feature_type == 'objects')
    self.load_scene_graph_feature = (feature_type == 'scene_graph')
    if not (self.load_spatial_feature or self.load_objects_feature
            or self.load_scene_graph_feature):
        raise ValueError('Unknown feature type: %s' % feature_type)

    if self.load_spatial_feature:
        self.spatial_loader = SpatialFeatureLoader(
            data_params['spatial_feature_dir'])
        # peek one feature map to learn the grid size
        peek = self.spatial_loader.load_feature(self.imdb[0]['imageId'])
        self.spatial_D, self.spatial_H, self.spatial_W = peek.shape
        # positional encoding over the spatial grid
        self.spatial_pos_enc_dim = data_params['spatial_pos_enc_dim']
        self.pos_enc = get_positional_encoding(
            self.spatial_H, self.spatial_W, self.spatial_pos_enc_dim)

    if self.load_objects_feature:
        self.objects_M = data_params['objects_max_num']
        self.objects_loader = ObjectsFeatureLoader(
            data_params['objects_feature_dir'])
        # peek one feature map to learn the per-object feature dim
        peek, _ = self.objects_loader.load_feature(self.imdb[0]['imageId'])
        _, self.objects_D = peek.shape
        self.bbox_tile_num = data_params['bbox_tile_num']

    if self.load_scene_graph_feature:
        self.objects_M = data_params['objects_max_num']
        self.scene_graph_loader = SceneGraphFeatureLoader(
            data_params['scene_graph_file'],
            data_params['vocab_name_file'],
            data_params['vocab_attr_file'],
            max_num=self.objects_M)
        # peek one feature map to learn the per-object feature dim
        peek, _, _ = self.scene_graph_loader.load_feature_normalized_bbox(
            self.imdb[0]['imageId'])
        _, self.objects_D = peek.shape
        self.bbox_tile_num = data_params['bbox_tile_num']