def __init__(self, opt):
    """Initialise the box sampler from the option container ``opt``.

    Options are read through ``utils.getopt`` with these fallbacks:
    ``low_thresh`` 0.4, ``high_thresh`` 0.75, ``batch_size`` 256 and
    ``debug`` False. An IoU helper is created and the sampling bounds
    start out unset (``None``).
    """
    super(BoxSampler, self).__init__()
    self.opt = {}

    # (option name, default) pairs, read in this exact order.
    option_defaults = (
        ('low_thresh', 0.4),
        ('high_thresh', 0.75),
        ('batch_size', 256),
        ('debug', False),
    )
    for option_name, fallback in option_defaults:
        setattr(self, option_name, utils.getopt(opt, option_name, fallback))

    self.iou = boxIoU.BoxIoU()

    # Bounds stay None until they are assigned elsewhere.
    self.x_min = self.x_max = None
    self.y_min = self.y_max = None
def __init__(self, opt):
    """Build the localization layer's sampler helper and RoI pooling.

    Copies the relevant entries of ``opt`` (via ``utils.getopt``) into an
    ``EasyDict`` stored as ``self.opt``. All entries are read without a
    default except ``train_remove_outbounds_boxes``, which falls back to 1.
    ``opt['output_size']`` is indexed at positions 0 and 1 to configure
    the bilinear RoI pooling.
    """
    super(LocalizationLayer, self).__init__()

    cfg = easydict.EasyDict()
    self.opt = cfg

    # Entries fetched without an explicit default.
    required_keys = (
        'input_dim',
        'output_size',
        'sampler_batch_size',
        'sampler_high_thresh',
        'sampler_low_thresh',
    )
    for key in required_keys:
        setattr(cfg, key, utils.getopt(opt, key))
    cfg.train_remove_outbounds_boxes = utils.getopt(
        opt, 'train_remove_outbounds_boxes', 1)
    cfg.contrastive_loss = utils.getopt(opt, 'contrastive_loss')

    # Options forwarded to the box sampler helper.
    sampler_opt = dict(
        batch_size=cfg.sampler_batch_size,
        low_thresh=cfg.sampler_low_thresh,
        high_thresh=cfg.sampler_high_thresh,
        contrastive_loss=cfg.contrastive_loss,
    )
    self.box_sampler_helper = BoxSamplerHelper(sampler_opt)

    pooled_dim0, pooled_dim1 = cfg.output_size[0], cfg.output_size[1]
    self.roi_pooling = BilinearRoiPooling(pooled_dim0, pooled_dim1)

    # Image size is unknown at construction time.
    self.image_height = self.image_width = None
    self._called_forward_size = False
    self._called_backward_size = False
def __init__(self, opt):
    """Create the helper around a box sampler.

    Args:
        opt: mapping of options. If it contains ``'box_sampler'`` that
            object is used directly (intended for testing); otherwise a
            ``box_sampler.BoxSampler`` is built from ``opt``. The key
            ``'contrastive_loss'`` is mandatory; ``'return_index'`` is
            optional and defaults to False.

    Raises:
        KeyError: if ``opt`` has no ``'contrastive_loss'`` entry
            (assuming ``opt`` is a plain dict).
    """
    super(BoxSamplerHelper, self).__init__()
    # Membership test with `in` works on Python 2 and 3;
    # dict.has_key() no longer exists on Python 3.
    if 'box_sampler' in opt:
        self.box_sampler = opt['box_sampler']  # For testing
    else:
        self.box_sampler = box_sampler.BoxSampler(opt)
    self.contrastive_loss = opt['contrastive_loss']
    self.return_index = utils.getopt(opt, 'return_index', False)
def setTestArgs(self, args=None):
    """Store the test-time settings on the instance.

    Args:
        args: optional mapping of overrides. Missing entries fall back to
            ``clip_boxes=True``, ``nms_thresh=0.7`` and
            ``max_proposals=300``. ``None`` (the default) means
            "no overrides".
    """
    # Use a None sentinel instead of a shared mutable ``{}`` default.
    if args is None:
        args = {}
    self.test_clip_boxes = utils.getopt(args, 'clip_boxes', True)
    self.test_nms_thresh = utils.getopt(args, 'nms_thresh', 0.7)
    self.test_max_proposals = utils.getopt(args, 'max_proposals', 300)
def getBatch(opt):
    """Assemble one batch of questions, answers and image features.

    Args:
        opt: option container read with ``getopt``. ``'split'`` is
            required and is converted to ``str`` for table lookups and
            to ``int`` for branching (0 and 1 use the ``*_train``
            tables, anything else the ``*_test`` tables).
            ``'batch_size'`` defaults to 128.

    Returns:
        dict with the keys ``'questions'``, ``'ques_id'``, ``'ques_len'``
        and ``'answer'`` (lists with one entry per sample) and
        ``'images'`` (``self.img_batch`` reshaped to
        ``(batch_size, 196, -1)``).

    Side effects:
        Advances ``self.iterators[split]`` once per sample and overwrites
        ``self.img_batch``.

    NOTE(review): this function reads ``self`` but does not take it as a
    parameter, so it only works if ``self`` is bound in an enclosing
    scope. It was probably meant to be a method ``getBatch(self, opt)``;
    the signature is left untouched here -- confirm against the callers.
    """
    split = str(getopt(opt, 'split'))
    batch_size = getopt(opt, 'batch_size', 128)

    split_ix_tmp = self.split_ix[split]
    # A parenthesised ``assert (cond, msg)`` asserts a non-empty tuple and
    # can never fail; assert the condition itself.
    assert split_ix_tmp is not None, 'split ' + split + ' not found'

    split_num = int(split)
    is_train = split_num in (0, 1)
    max_index = len(split_ix_tmp) - 1

    ques_idx = torch.LongTensor(batch_size)
    img_idx = torch.LongTensor(batch_size)
    if self.feature_type == 'VGG':
        self.img_batch = torch.Tensor(batch_size, 14, 14, 512)
    elif self.feature_type == 'Residual':
        self.img_batch = torch.Tensor(batch_size, 14, 14, 2048)

    for i in range(batch_size):
        ri = self.iterators[split]
        ri_next = ri + 1
        if ri_next > max_index:
            # NOTE(review): wraps to 1, not 0, so entry 0 of the split is
            # not revisited after the first pass -- looks like a leftover
            # from 1-based indexing; confirm before changing.
            ri_next = 1
        self.iterators[split] = ri_next

        if split_num == 0:
            # Split 0 draws a random entry instead of walking in order.
            ix = split_ix_tmp[torch.randperm(max_index + 1)[0]]
        else:
            ix = split_ix_tmp[ri]
        assert ix is not None, ('Bug: split ' + split +
                                ' was accessed out of bounds with ' + str(ri))
        ques_idx[i] = ix

        if is_train:
            img_idx[i] = self.img_pos_train[ix]
            h5_file = self.h5_img_file_train
            h5_key = '/images_train'
            tag = 'train'
        else:
            img_idx[i] = self.img_pos_test[ix]
            h5_file = self.h5_img_file_test
            h5_key = '/images_test'
            tag = 'test'

        if h5_file is not None:
            # The stored position is used as the exclusive end of a
            # one-row slice (pos - 1 : pos).
            if self.feature_type == 'VGG':
                img = h5_file[h5_key][
                    img_idx[i] - 1:img_idx[i], 0:14, 0:14, 0:512]
                self.img_batch[i] = img
            elif self.feature_type == 'Residual':
                img = h5_file[h5_key][
                    img_idx[i] - 1:img_idx[i], 0:14, 0:14, 0:2048]
                self.img_batch[i] = img
            else:
                print("Error(%s): feature type error" % tag)

    if is_train:
        questions, ques_ids = self.ques_train, self.ques_id_train
        ques_lens, answers = self.ques_len_train, self.ans_train
    else:
        questions, ques_ids = self.ques_test, self.ques_id_test
        ques_lens, answers = self.ques_len_test, self.ans_test

    data = {}
    data['questions'] = []
    data['ques_id'] = []
    data['ques_len'] = []
    data['answer'] = []
    # Flatten the 14x14 spatial grid into 196 positions per sample.
    data['images'] = np.reshape(self.img_batch, (batch_size, 196, -1))
    for i in range(len(ques_idx)):
        q = ques_idx[i]
        data['questions'].append(questions[q])
        data['ques_id'].append(ques_ids[q])
        data['ques_len'].append(ques_lens[q])
        data['answer'].append(answers[q])

    # The assembled batch was previously built and then dropped.
    return data
def __init__(self, opt, split, root='data/'):
    """Load the metadata, vocabulary, embeddings and HDF5 arrays of one split.

    Args:
        opt: option object. It is read both through ``utils.getopt``
            (``dataset`` / ``val_dataset``, ``debug_max_train_images``,
            ``embedding``, ``fold``) and by attribute (``image_size``,
            ``dtp_train``, ``augment``, ``ghosh``, ``reproduce_paper``,
            ``quiet``, ``embedding``, ``embedding_loss``).
        split: 0, 1 or 2, mapped to ``'train'``, ``'val'``, ``'test'``.
            Split 0 uses ``opt``'s ``dataset`` entry, the others
            ``val_dataset``.
        root: directory prefix; ``'<dataset>/'`` is appended to it.

    Raises:
        KeyError: if ``split`` is not 0, 1 or 2.
        AssertionError: if the ``images`` dataset in the HDF5 file is not
            4-dimensional.
    """
    if split == 0:
        self.dataset = utils.getopt(opt, 'dataset')
    else:
        self.dataset = utils.getopt(opt, 'val_dataset')
    root += '%s/' % self.dataset

    self.debug_max_train_images = utils.getopt(opt, 'debug_max_train_images',
                                               -1)
    self.embedding = utils.getopt(opt, 'embedding')
    self.fold = utils.getopt(opt, 'fold')
    self.image_size = opt.image_size
    self.split_num = split
    self.dtp_train = opt.dtp_train
    self.augment = opt.augment
    self.opt = opt

    num2split = {0: 'train', 1: 'val', 2: 'test'}
    self.split = num2split[split]
    self.train = self.split == 'train'
    self.alphabet = dl.default_alphabet
    self.ghosh = opt.ghosh
    if self.dataset == 'konzilsprotokolle':
        self.alphabet = '&' + string.digits + string.ascii_lowercase

    suffix = ''
    if self.augment:
        suffix = '_augmented'

    if self.opt.reproduce_paper:
        self.h5_file = 'data/reproduce/%s_fold%d.h5' % (self.dataset,
                                                        self.fold)
        self.json_file = 'data/reproduce/%s_fold%d.json' % (self.dataset,
                                                            self.fold)
        self.data = self._repr_load_data()
    else:
        self.data = getattr(dl, 'load_%s' % self.dataset)(
            fold=self.fold, alphabet=self.alphabet)
        self.h5_file = root + self.dataset + '%s_fold%d.h5' % (suffix,
                                                               self.fold)
        self.json_file = root + self.dataset + '%s_fold%d.json' % (
            suffix, self.fold)

    self.data_split = [d for d in self.data if d['split'] == self.split]
    if self.dataset != 'iiit_hws':
        self.split_vocab = utils.build_vocab(self.data_split)
    else:
        self.split_vocab = np.unique([d['label'] for d in self.data])

    if self.ghosh:
        # Overrides whichever file names were chosen above.
        self.h5_file = root + 'washington_fold1_ghosh.h5'
        self.json_file = root + 'washington_fold1_ghosh.json'

    show = not opt.quiet

    # load the json file which contains additional information about the
    # dataset
    if show:
        print('DataLoader loading json file: ', self.json_file)
    with open(self.json_file, 'r') as f:
        self.info = json.load(f)
    self.vocab_size = len(self.info['itow'])

    # Convert keys in idx_to_token from string to integer, shifting them
    # down by one. ``.items()`` is used because ``.iteritems()`` only
    # exists on Python 2.
    itow = {}
    for k, v in self.info['itow'].items():
        itow[int(k) - 1] = v
    self.info['itow'] = itow
    self.itow = itow
    self.wtoi = {w: i for i, w in itow.items()}

    # Original note on ``resolution``: boils down to whether or not all
    # embeddings should match their label.
    self.resolution = 3
    self.bins = len(self.alphabet) * 2
    self.ngrams = 2
    self.unigram_levels = range(1, 6)
    self.emb_func = getattr(emb, opt.embedding)

    # Positional arguments passed to the embedding function after the word.
    self.args = (self.resolution, self.alphabet)
    if opt.embedding == 'ngram_dct':
        self.args += (self.ngrams, self.bins)
    elif opt.embedding == 'phoc':
        self.args = (self.alphabet, self.unigram_levels)

    # word embedding table; normalized unless the phocnet loss is used
    if opt.embedding_loss == 'phocnet':
        self.wtoe = {
            w: self.emb_func(w, *self.args)
            for i, w in self.itow.items()
        }
    else:
        self.wtoe = {
            w: self.normalize(self.emb_func(w, *self.args))
            for i, w in self.itow.items()
        }

    self.iam = self.dataset == 'iam'
    if self.iam:
        with open('data/iam/stopwords.txt') as f:
            # First line only, with its final character dropped.
            tmp = f.readline()[:-1]
        self.stopwords = tmp.split(',')

    if not self.train:
        self.init_queries()

    # open the hdf5 file; self.h5_file changes from a path to a File handle
    if show:
        print('DataLoader loading h5 file: ', self.h5_file)
    self.h5_file = h5py.File(self.h5_file, 'r')

    def _read(name):
        # ``dataset[()]`` reads the whole dataset into memory; it replaces
        # the ``Dataset.value`` property that h5py 3.0 removed.
        return self.h5_file.get(name)[()]

    self.boxes = _read('boxes')
    self.image_heights = _read('image_heights')
    self.image_widths = _read('image_widths')
    self.img_to_first_box = _read('img_to_first_box')
    self.img_to_last_box = _read('img_to_last_box')
    self.labels = _read('labels') - 1
    self.word_embedding = _read(self.embedding + '_word_embeddings')
    self.img_to_first_rp = _read('img_to_first_rp')
    self.img_to_last_rp = _read('img_to_last_rp')
    self.original_heights = _read('original_heights')
    self.original_widths = _read('original_widths')
    self.split_inds = _read('split')

    # dimensionality of the embedding
    self.embedding_dim = self.word_embedding.shape[1]

    # extract image size from dataset (shape only, pixels are not loaded)
    images_size = self.h5_file.get('images').shape
    assert len(images_size) == 4, '/images should be a 4D tensor'
    self.num_images = images_size[0]
    self.num_channels = images_size[1]
    self.max_image_height = images_size[2]
    self.max_image_width = images_size[3]

    # extract some attributes from the data
    self.num_regions = self.boxes.shape[0]
    self.image_mean = _read('/image_mean')[0]

    # set up index ranges for the different splits
    image_ids = range(self.num_images)
    self.train_ix = [i for i in image_ids if self.split_inds[i] == 0]
    self.val_ix = [i for i in image_ids if self.split_inds[i] == 1]
    self.test_ix = [i for i in image_ids if self.split_inds[i] == 2]

    if show:
        print('assigned %d/%d/%d images to train/val/test.' %
              (len(self.train_ix), len(self.val_ix), len(self.test_ix)))
        print('initialized DataLoader:')
        print('#images: %d, #regions: %d' % (self.num_images,
                                             self.num_regions))
def __init__(self, opt):
    """Build the localization layer: RPN, box sampler, RoI pooling, losses.

    Options are copied from ``opt`` through ``utils.getopt`` into an
    ``EasyDict`` stored as ``self.opt``. These are read without a default:
    ``input_dim``, ``output_size``, ``field_centers``,
    ``mid_box_reg_weight``, ``mid_objectness_weight``, ``wordness_loss``
    and ``contrastive_loss``. Every other option has the default written
    next to it below.

    When ``dtp_train`` is set, the sampler batch size is halved. When
    ``opt`` provides a ``box_sampler`` entry, it is forwarded to the
    sampler helper.
    """
    super(LocalizationLayer, self).__init__()
    self.opt = easydict.EasyDict()
    self.opt.input_dim = utils.getopt(opt, 'input_dim')
    self.opt.output_size = utils.getopt(opt, 'output_size')

    # list x0, y0, sx, sy
    self.opt.field_centers = utils.getopt(opt, 'field_centers')

    self.opt.mid_box_reg_weight = utils.getopt(opt, 'mid_box_reg_weight')
    self.opt.mid_objectness_weight = utils.getopt(opt,
                                                  'mid_objectness_weight')
    self.opt.rpn_filter_size = utils.getopt(opt, 'rpn_filter_size', 3)
    self.opt.rpn_num_filters = utils.getopt(opt, 'rpn_num_filters', 256)
    self.opt.zero_box_conv = utils.getopt(opt, 'zero_box_conv', True)
    self.opt.std = utils.getopt(opt, 'std', 0.01)
    self.opt.anchor_scale = utils.getopt(opt, 'anchor_scale', 1.0)
    self.opt.anchors = utils.getopt(opt, 'anchors', 'original')
    self.opt.sampler_batch_size = utils.getopt(opt, 'sampler_batch_size',
                                               256)
    self.opt.sampler_high_thresh = utils.getopt(opt, 'sampler_high_thresh',
                                                0.75)
    self.opt.sampler_low_thresh = utils.getopt(opt, 'sampler_low_thresh',
                                               0.4)
    self.opt.train_remove_outbounds_boxes = utils.getopt(
        opt, 'train_remove_outbounds_boxes', 1)
    self.opt.box_reg_decay = utils.getopt(opt, 'box_reg_decay', 5e-5)
    self.opt.tunable_anchors = utils.getopt(opt, 'tunable_anchors', False)
    self.opt.backprop_rpn_anchors = utils.getopt(opt,
                                                 'backprop_rpn_anchors',
                                                 False)
    self.box_loss = utils.getopt(opt, 'wordness_loss')
    self.opt.contrastive_loss = utils.getopt(opt, 'contrastive_loss')

    self.stats = easydict.EasyDict()
    self.stats.losses = easydict.EasyDict()
    self.stats.vars = easydict.EasyDict()

    self.dtp_train = utils.getopt(opt, 'dtp_train', False)
    if self.dtp_train:
        # Floor division keeps the batch size an integer under true
        # division (Python 3 / ``from __future__ import division``);
        # for ints on Python 2 it is identical to the former ``/= 2``.
        self.opt.sampler_batch_size //= 2

    sampler_opt = {
        'batch_size': self.opt.sampler_batch_size,
        'low_thresh': self.opt.sampler_low_thresh,
        'high_thresh': self.opt.sampler_high_thresh,
        'contrastive_loss': self.opt.contrastive_loss
    }

    # Optional externally supplied sampler; False means "not provided".
    debug_sampler = utils.getopt(opt, 'box_sampler', False)
    if debug_sampler != False:
        sampler_opt['box_sampler'] = debug_sampler

    self.rpn = RPN(self.opt)
    self.box_sampler_helper = BoxSamplerHelper(sampler_opt)
    self.roi_pooling = BilinearRoiPooling(self.opt.output_size[0],
                                          self.opt.output_size[1])
    self.invert_box_transform = InvertBoxTransform()

    # Construct criterions
    if self.opt.backprop_rpn_anchors:
        self.box_reg_loss = BoxRegressionCriterion(
            self.opt.mid_box_reg_weight)
    else:
        self.box_reg_loss = nn.SmoothL1Loss()  # for RPN box regression
    self.box_scoring_loss = nn.CrossEntropyLoss()

    self.image_height = None
    self.image_width = None
    self._called_forward_size = False
    self._called_backward_size = False
def setBounds(self, bounds):
    """Copy the four bound values out of ``bounds`` onto the instance.

    Each of ``x_min``, ``x_max``, ``y_min`` and ``y_max`` is looked up
    with ``utils.getopt`` and falls back to ``None`` when absent.
    """
    for bound_name in ('x_min', 'x_max', 'y_min', 'y_max'):
        setattr(self, bound_name, utils.getopt(bounds, bound_name, None))