def __getitem__(self, idx):
    """Build the inputs and targets for one referring-expression sample.

    Args:
        idx: Position of the sample inside ``self.data``.

    Returns:
        A 10-tuple, in this order:

        * ``sent_id`` -- ``ent['sentence']['sent_id']``.
        * ``ans`` -- ``ent['category_id']``.
        * ``box_feats`` -- region features exactly as returned by
          ``self._load_image_coco``.
        * ``box_coordsorig`` -- region boxes as returned by
          ``self._load_image_coco`` (a NumPy array; x1 y1 x2 y2 according
          to the original author's note).
        * ``box_coords_6d`` -- float tensor built from the loader's
          processed box coordinates.
        * ``gtboxorig`` -- ground-truth box converted from xywh to
          x1 y1 x2 y2, as a float tensor.
        * ``qfeat`` -- long tensor of question token ids.
        * ``Lvec`` -- long mask of length 100 whose first ``L`` entries
          are 1.
        * ``best_idx`` -- index of the region with the highest IoU against
          the ground-truth box.
        * ``correct`` -- flattened boolean tensor, True where IoU > 0.5.
    """
    ent = self.data[idx]
    sent_id = ent['sentence']['sent_id']
    ans = ent['category_id']
    que = ent['sentence']['sent']

    # The annotation stores the ground-truth box as xywh.  The converter is
    # called on a batch of one, so wrap and unwrap the leading dimension.
    gtbox = torch.tensor(ent['gtbox'])
    gtboxorig = convert_xywh_x1y1x2y2(gtbox.unsqueeze(0)).squeeze(0)

    # The loader also returns the image width/height; they are not needed
    # here, so they are discarded.
    L, _, _, box_feats, box_coords_6d, box_coordsorig = self._load_image_coco(
        ent['image_id'])
    box_coords_6d = torch.from_numpy(box_coords_6d)

    # Region boxes from the feature files are already x1 y1 x2 y2, so they
    # can be compared against the converted ground-truth box directly.
    iou = getIOU(gtboxorig.unsqueeze(0), torch.from_numpy(box_coordsorig))
    correct = iou > 0.5
    # Kept under its own name so the ``idx`` argument is not rebound.
    _, best_idx = torch.max(iou, dim=0)

    qfeat = torch.from_numpy(tokenize_ques(self.dictionary, que)).long()

    # NOTE(review): the mask length is hard-coded; this presumably matches
    # the number of region slots produced by ``_load_image_coco`` -- confirm.
    mask_len = 100
    Lvec = torch.zeros(mask_len).long()
    Lvec[:L] = 1

    return (sent_id, ans, box_feats, box_coordsorig, box_coords_6d.float(),
            gtboxorig.float(), qfeat, Lvec, best_idx, correct.view(-1))
def __getitem__(self, idx):
    """Build the inputs and targets for one question with 0..N target boxes.

    Args:
        idx: Position of the sample inside ``self.data``.

    Returns:
        A 10-tuple, in this order:

        * ``sent_id`` -- ``ent['question_id']``.
        * ``ans`` -- always ``0``.
        * ``box_feats`` -- region features exactly as returned by
          ``self._load_image_coco``.
        * ``box_coordsorig`` -- region boxes as returned by
          ``self._load_image_coco`` (a NumPy array).
        * ``box_coords_6d`` -- float tensor built from the loader's
          processed box coordinates.
        * ``gtboxorig[0]`` -- the first ground-truth box (or the padding
          box when the sample has none), converted from xywh to
          x1 y1 x2 y2.
        * ``qfeat`` -- long tensor of question token ids.
        * ``Lvec`` -- long mask with one entry per region slot; the first
          ``L`` entries are 1.
        * ``best_idx`` -- one-element tensor holding the index of the
          region that best overlaps the first ground-truth box.
        * ``correct`` -- flattened per-region indicator (0/1) that is 1
          when the region has IoU > 0.5 with at least one ground-truth
          (or padding) box.
    """
    ent = self.data[idx]
    sent_id = ent['question_id']
    img_id = ent['image_id']
    ans = 0
    que = ent['question']

    # A sample may have anywhere from 0 to N ground-truth boxes.  Pad the
    # list up to a fixed count with a dummy xywh box.
    max_boxes = 15  # the max number of boxes in VQD
    pad_box = [0, 0, 1, 1.0]
    gtbox = ent['gtbox']
    # "No boxes" may be encoded either as [] or as [[]]; both are replaced
    # entirely by padding.
    if len(gtbox) == 0 or len(gtbox[0]) == 0:
        gtbox = [pad_box] * max_boxes
    else:
        gtbox = gtbox + [pad_box] * (max_boxes - len(gtbox))
    gtbox = torch.tensor(gtbox).float()

    # Annotation boxes are xywh; convert to x1 y1 x2 y2.
    gtboxorig = convert_xywh_x1y1x2y2(gtbox)

    # The loader also returns the image width/height; they are not needed
    # here, so they are discarded.
    L, _, _, box_feats, box_coords_6d, box_coordsorig = self._load_image_coco(
        img_id)
    box_coords_6d = torch.from_numpy(box_coords_6d)

    # Region boxes from the feature files are already x1 y1 x2 y2.
    # NOTE(review): after the squeeze, ``iou`` is presumably
    # (num_gt_boxes, num_regions) -- confirm against ``getIOU``.
    iou = getIOU(gtboxorig.unsqueeze(1),
                 torch.from_numpy(box_coordsorig)).squeeze(-1)

    # Collapse over the ground-truth boxes: a region counts as correct if it
    # matches any of them.
    correct = (iou > 0.5).sum(dim=0).clamp(max=1)

    # Best region for every ground-truth box; only the match for the first
    # box is used as the target index.
    _, idxall = torch.max(iou, dim=1)
    best_idx = torch.tensor([int(idxall[0])])

    qfeat = torch.from_numpy(tokenize_ques(self.dictionary, que)).long()

    # Total number of region slots.
    N = box_coordsorig.shape[0]
    Lvec = torch.zeros(N).long()
    Lvec[:L] = 1

    return (sent_id, ans, box_feats, box_coordsorig, box_coords_6d.float(),
            gtboxorig[0].float(), qfeat, Lvec, best_idx, correct.view(-1))
def __getitem__(self, idx):
    """Build the inputs and targets for one sample from pickled features.

    Region features are read from ``feats/<split>/<file_name>.pkl``, where
    ``<split>`` is ``train2014`` or ``val2014`` depending on the image file
    name.  Every pickle entry except the last describes one region (keys
    ``'feat'`` and ``'coords'``); the last entry carries the image width
    ``'w'`` and height ``'h'``.

    Args:
        idx: Position of the sample inside ``self.data``.

    Returns:
        A 10-tuple, in this order:

        * ``sent_id`` -- ``ent['sentence']['sent_id']``.
        * ``ans`` -- ``ent['category_id']``.
        * ``box_feats`` -- float32 tensor of shape ``(20, 2048)``, zero
          padded when the file has fewer than 20 regions.
        * ``box_coordsorig`` -- NumPy array of shape ``(20, 4)`` with the
          region boxes, zero padded.
        * ``box_coords_6d`` -- float tensor built from
          ``self._process_boxes``.
        * ``gtboxorig`` -- ground-truth box converted from xywh to
          x1 y1 x2 y2, as a float tensor.
        * ``qfeat`` -- long tensor of question token ids.
        * ``Lvec`` -- long mask of length 20, all ones.
        * ``best_idx`` -- index of the region with the highest IoU against
          the ground-truth box.
        * ``correct`` -- flattened boolean tensor, True where IoU > 0.5.

    Raises:
        ValueError: If the image file name contains neither ``'train'``
            nor ``'val'``, so the feature directory cannot be determined.
    """
    ent = self.data[idx]
    sent_id = ent['sentence']['sent_id']
    file_name = ent['image_info']['file_name']
    ans = ent['category_id']
    que = ent['sentence']['sent']

    # The annotation stores the ground-truth box as xywh.  The converter is
    # called on a batch of one, so wrap and unwrap the leading dimension.
    gtbox = torch.tensor(ent['gtbox'])
    gtboxorig = convert_xywh_x1y1x2y2(gtbox.unsqueeze(0)).squeeze(0)

    if 'train' in file_name:
        split_dir = "train2014"
    elif 'val' in file_name:
        split_dir = "val2014"
    else:
        raise ValueError(
            "cannot determine feature split from file name: %r" % (file_name,))

    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at feature files from a trusted source.
    with open(osp.join("feats", split_dir, file_name + ".pkl"), "rb") as fh:
        pk = pickle.load(fh)

    N = 20  # maximum entries to use
    # The mask length is pinned to N, so every slot is marked valid even when
    # the file holds fewer regions.  Use min(max(len(pk) - 1, 1), N) instead
    # for a per-image count.
    L = N

    box_feats = np.zeros((N, 2048), dtype=np.float32)
    box_coords = np.zeros((N, 4))
    # The last pickle entry is image-level metadata, not a region.
    for i, region in enumerate(pk[:-1][:N]):
        box_feats[i, :] = region['feat'].flatten()
        box_coords[i, :] = np.array(region['coords'])

    image_meta = pk[-1]
    W = image_meta['w']
    H = image_meta['h']

    box_feats = torch.from_numpy(box_feats)
    # The same array object is both returned and handed to _process_boxes.
    box_coordsorig = box_coords
    box_coords_6d = torch.from_numpy(self._process_boxes(box_coords, W, H))

    # Region boxes from the feature files are already x1 y1 x2 y2.
    iou = getIOU(gtboxorig.unsqueeze(0),
                 torch.from_numpy(box_coordsorig).float())
    correct = iou > 0.5
    # Kept under its own name so the ``idx`` argument is not rebound.
    _, best_idx = torch.max(iou, dim=0)

    qfeat = torch.from_numpy(tokenize_ques(self.dictionary, que)).long()

    Lvec = torch.zeros(N).long()
    Lvec[:L] = 1

    return (sent_id, ans, box_feats, box_coordsorig, box_coords_6d.float(),
            gtboxorig.float(), qfeat, Lvec, best_idx, correct.view(-1))