Esempio n. 1
0
    def __getitem__(self, idx):
        """Build one sample for the entry at position ``idx`` of ``self.data``.

        The entry's ground-truth box is converted from xywh to x1y1x2y2 and
        compared (IoU) against the candidate boxes returned by
        ``self._load_image_coco`` for the entry's image.

        Returns a 10-tuple:
            sent_id        -- ``ent['sentence']['sent_id']``
            ans            -- ``ent['category_id']``
            box_feats      -- features as returned by ``_load_image_coco``
            box_coordsorig -- candidate boxes (NumPy array, x1y1x2y2 per the
                              loader's convention)
            box_coords_6d  -- float tensor built from the loader's 6-d boxes
            gtboxorig      -- ground-truth box in x1y1x2y2, float tensor
            qfeat          -- tokenized sentence, long tensor
            Lvec           -- length-100 long mask with the first ``L`` set to 1
            idx            -- index tensor from ``torch.max(iou, dim=0)``
            correct        -- flattened bool tensor, ``iou > 0.5``
        """
        ent = self.data[idx]
        sent_id = ent['sentence']['sent_id']
        img_id = ent['image_id']
        ans = ent['category_id']
        que = ent['sentence']['sent']

        # Ground-truth boxes in the annotations are in xywh format; the
        # candidate boxes below are x1y1x2y2, so convert before comparing.
        gtbox = torch.tensor(ent['gtbox'])
        gtboxorig = convert_xywh_x1y1x2y2(gtbox.unsqueeze(0)).squeeze(0)

        # Width/height returned by the loader are not needed here.
        L, _, _, box_feats, box_coords_6d, box_coordsorig = \
            self._load_image_coco(img_id)
        box_coords_6d = torch.from_numpy(box_coords_6d)

        # Candidate boxes from the feature files are already x1y1x2y2.
        iou = getIOU(gtboxorig.unsqueeze(0), torch.from_numpy(box_coordsorig))
        correct = iou > 0.5
        # Index of the candidate with the highest IoU (kept under its own name
        # instead of rebinding the ``idx`` argument).
        _, best_idx = torch.max(iou, dim=0)

        tokens = tokenize_ques(self.dictionary, que)
        qfeat = torch.from_numpy(tokens).long()

        # NOTE(review): the mask length is hard-coded to 100; presumably this
        # matches the number of candidate slots produced by _load_image_coco --
        # confirm against the loader.
        Lvec = torch.zeros(100).long()
        Lvec[:L] = 1
        return (sent_id, ans, box_feats, box_coordsorig, box_coords_6d.float(),
                gtboxorig.float(), qfeat, Lvec, best_idx, correct.view(-1))
Esempio n. 2
0
    def __getitem__(self, idx):
        """Build one sample for the entry at position ``idx`` of ``self.data``.

        An entry may carry zero or more ground-truth boxes. The list is padded
        with a placeholder box up to ``Max_box`` entries, converted from xywh
        to x1y1x2y2, and compared (IoU) against the candidate boxes returned
        by ``self._load_image_coco``.

        Returns a 10-tuple:
            sent_id        -- ``ent['question_id']``
            ans            -- always 0
            box_feats      -- features as returned by ``_load_image_coco``
            box_coordsorig -- candidate boxes (NumPy array)
            box_coords_6d  -- float tensor built from the loader's 6-d boxes
            gtboxorig[0]   -- first (possibly placeholder) ground-truth box
            qfeat          -- tokenized question, long tensor
            Lvec           -- long mask over the candidates, first ``L`` set to 1
            idx            -- 1-element tensor: best candidate for the first box
            correct        -- flattened per-candidate flag (0/1): IoU > 0.5
                              with at least one (padded) ground-truth box
        """
        ent = self.data[idx]
        sent_id = ent['question_id']
        img_id = ent['image_id']
        ans = 0
        que = ent['question']

        # 0 to N ground-truth boxes per entry.
        gtbox = ent['gtbox']
        Max_box = 15  # the max number of boxes in VQD
        pad_box = [0, 0, 1, 1.0]
        # Treat both ``[]`` and ``[[]]`` as "no boxes"; the bare ``gtbox[0]``
        # lookup used to raise IndexError on an empty list.
        if len(gtbox) == 0 or len(gtbox[0]) == 0:
            gtbox = [pad_box] * Max_box
        else:
            gtbox = gtbox + [pad_box] * (Max_box - len(gtbox))

        gtbox = torch.tensor(gtbox).float()
        # Annotation boxes are in xywh format; candidates are x1y1x2y2.
        gtboxorig = convert_xywh_x1y1x2y2(gtbox)

        # Width/height returned by the loader are not needed here.
        L, _, _, box_feats, box_coords_6d, box_coordsorig = \
            self._load_image_coco(img_id)
        box_coords_6d = torch.from_numpy(box_coords_6d)

        # Candidate boxes from the feature files are already x1y1x2y2.
        iou = getIOU(gtboxorig.unsqueeze(1),
                     torch.from_numpy(box_coordsorig)).squeeze(-1)
        # Collapse over the ground-truth axis: a candidate counts as correct
        # if it overlaps any ground-truth row by more than 0.5.
        correct = (iou > 0.5).sum(dim=0).clamp(max=1)

        _, idxall = torch.max(iou, dim=1)
        # Several ground-truth boxes may each have a best candidate; the index
        # for the first box is the one returned. (A random pick via
        # np.random.choice used to be computed here and immediately
        # overwritten; it only consumed global RNG state and was dropped.)
        best_idx = torch.tensor([int(idxall[0])])

        tokens = tokenize_ques(self.dictionary, que)
        qfeat = torch.from_numpy(tokens).long()

        # Mask over all candidate slots; the first L are flagged.
        N = box_coordsorig.shape[0]
        Lvec = torch.zeros(N).long()
        Lvec[:L] = 1
        return (sent_id, ans, box_feats, box_coordsorig, box_coords_6d.float(),
                gtboxorig[0].float(), qfeat, Lvec, best_idx, correct.view(-1))
Esempio n. 3
0
    def __getitem__(self, idx):
        """Build one sample for the entry at position ``idx`` of ``self.data``.

        Region features are read from a per-image pickle under
        ``feats/train2014`` or ``feats/val2014`` (chosen by whether the file
        name contains ``'train'`` or ``'val'``). All pickle items except the
        last are regions with ``'feat'`` and ``'coords'``; the last item
        carries the image width ``'w'`` and height ``'h'``. At most ``N = 20``
        regions are used; unused slots stay zero.

        Returns a 10-tuple:
            sent_id        -- ``ent['sentence']['sent_id']``
            ans            -- ``ent['category_id']``
            box_feats      -- (N, 2048) float32 tensor of region features
            box_coordsorig -- (N, 4) NumPy array of region boxes
            box_coords_6d  -- float tensor from ``self._process_boxes``
            gtboxorig      -- ground-truth box in x1y1x2y2, float tensor
            qfeat          -- tokenized sentence, long tensor
            Lvec           -- length-N long mask, all ones
            idx            -- index tensor from ``torch.max(iou, dim=0)``
            correct        -- flattened bool tensor, ``iou > 0.5``

        Raises:
            ValueError: if the file name contains neither ``'train'`` nor
                ``'val'`` (previously an unbound-variable error).
        """
        ent = self.data[idx]
        sent_id = ent['sentence']['sent_id']
        file_name = ent['image_info']['file_name']
        ans = ent['category_id']
        que = ent['sentence']['sent']

        # Annotation boxes are in xywh format; region boxes are x1y1x2y2.
        gtbox = torch.tensor(ent['gtbox'])
        gtboxorig = convert_xywh_x1y1x2y2(gtbox.unsqueeze(0)).squeeze(0)

        if 'train' in file_name:
            split_dir = "train2014"
        elif 'val' in file_name:
            split_dir = "val2014"
        else:
            raise ValueError(
                "cannot infer feature split from file name {!r}".format(
                    file_name))

        # NOTE(security): pickle.load can execute arbitrary code; only use
        # feature files produced by a trusted pipeline.
        with open(osp.join("feats", split_dir, file_name + ".pkl"), "rb") as fh:
            pk = pickle.load(fh)

        N = 20  # maximum entries to use

        box_feats = np.zeros((N, 2048), dtype=np.float32)
        box_coords = np.zeros((N, 4))
        # Every item but the last is a region; keep the first N of them.
        for i, region in enumerate(pk[:-1][:N]):
            box_feats[i, :] = region['feat'].flatten()
            box_coords[i, :] = np.array(region['coords'])

        # The final item holds whole-image metadata.
        lastent = pk[-1]
        W = lastent['w']
        H = lastent['h']

        box_feats = torch.from_numpy(box_feats)
        box_coordsorig = box_coords
        box_coords_6d = torch.from_numpy(self._process_boxes(box_coords, W, H))

        # Region boxes in the feature files are in x1y1x2y2 format.
        iou = getIOU(gtboxorig.unsqueeze(0),
                     torch.from_numpy(box_coordsorig).float())
        correct = iou > 0.5
        # Index of the region with the highest IoU (kept under its own name
        # instead of rebinding the ``idx`` argument).
        _, best_idx = torch.max(iou, dim=0)

        tokens = tokenize_ques(self.dictionary, que)
        qfeat = torch.from_numpy(tokens).long()

        # The valid-length was pinned to N (the variable-length computation
        # from len(pk) was always overridden), so every slot -- including
        # zero-padded ones -- is flagged. To mask padding instead, use
        # min(max(len(pk) - 1, 1), N) as the number of leading ones.
        Lvec = torch.ones(N).long()
        return (sent_id, ans, box_feats, box_coordsorig, box_coords_6d.float(),
                gtboxorig.float(), qfeat, Lvec, best_idx, correct.view(-1))