    def __init__(self, opt, split='train', seq_per_img=5):
        self.opt = opt
        self.batch_size = self.opt.batch_size
        self.seq_length = opt.seq_length
        self.split = split
        self.seq_per_img = seq_per_img
        # image processing function.
        if split == 'train':
            self.Resize = transforms.Resize(
                (self.opt.image_size, self.opt.image_size))
        else:
            self.Resize = transforms.Resize(
                (self.opt.image_crop_size, self.opt.image_crop_size))

        self.RandomCropWithBbox = utils.RandomCropWithBbox(opt.image_crop_size)
        self.ToTensor = transforms.ToTensor()
        self.res_Normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                                  [0.229, 0.224, 0.225])
        self.vgg_pixel_mean = np.array([[[103.939, 116.779, 123.68]]])

        self.max_gt_box = 100
        self.max_proposal = 200
        self.glove = vocab.GloVe(name='6B', dim=300)
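        # NOTE: assuming `vocab` is torchtext.vocab, this downloads the 6B
        # GloVe vectors (~800 MB) into the local cache on first use.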

        if opt.det_oracle:
            print('Training and Inference under oracle Mode...')

        # load the json file which contains additional information about the dataset
        print('DataLoader loading json file: ', opt.input_dic)
        self.info = json.load(open(self.opt.input_dic))
        self.itow = self.info['ix_to_word']
        self.wtoi = {w: i for i, w in self.itow.items()}
        self.wtod = {w: i + 1
                     for w, i in self.info['wtod'].items()
                     }  # word to detection
        self.dtoi = {w: i + 1
                     for i, w in enumerate(self.wtod.keys())
                     }  # detection to index
        self.itod = {i + 1: w for i, w in enumerate(self.wtod.keys())}
        self.wtol = self.info['wtol']
        self.ltow = {l: w for w, l in self.wtol.items()}
        self.vocab_size = len(self.itow) + 1  # word indices start from 1
        print('vocab size is ', self.vocab_size)

        # initialize the fg + singular/plural map back to word indices.
        self.st2towidx = np.zeros(len(self.dtoi) * 2 + 1)  # stage 2 to word index
        for w, i in self.dtoi.items():
            s2_idx = i * 2 - 1
            if w not in self.wtoi:
                w = 'UNK'
            w_idx = self.wtoi[w]
            self.st2towidx[s2_idx] = w_idx
            # get the plural idx.
            if w in self.ltow:
                pw = self.ltow[w]
                w_idx = self.wtoi[pw]
            self.st2towidx[s2_idx + 1] = w_idx
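        # layout (illustrative): for a detection word with dtoi index i,
        # slot 2*i - 1 holds the word index of its singular form and slot
        # 2*i the plural form recovered via ltow; e.g. if dtoi['dog'] were 3,
        # slots 5 and 6 would map back to 'dog' and 'dogs'. Slot 0 is unused.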

        # get the glove vector for the fg detections.
        self.glove_fg = np.zeros((len(self.dtoi) + 1, 300))
        for i, word in enumerate(self.dtoi.keys()):
            vector = np.zeros((300))
            count = 0
            for w in word.split(' '):
                count += 1
                if w in self.glove.stoi:
                    glove_vector = self.glove.vectors[self.glove.stoi[w]]
                    vector += glove_vector.numpy()
                else:  # use a random vector instead
                    random_vector = 2 * np.random.rand(300) - 1
                    vector += random_vector
            self.glove_fg[i + 1] = vector / count
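        # row 0 of glove_fg stays all-zero, presumably reserved for the
        # background / padding index.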

        self.glove_w = np.zeros((len(self.wtoi) + 1, 300))
        for i, word in enumerate(self.wtoi.keys()):
            vector = np.zeros((300))
            count = 0
            for w in word.split(' '):
                count += 1
                if w in self.glove.stoi:
                    glove_vector = self.glove.vectors[self.glove.stoi[w]]
                    vector += glove_vector.numpy()
                else:  # use a random vector instead
                    random_vector = 2 * np.random.rand(300) - 1
                    vector += random_vector
            self.glove_w[i + 1] = vector / count

        # open the caption json file
        print('DataLoader loading json file: ', opt.input_json)
        self.caption_file = json.load(open(self.opt.input_json))

        # open the detection proposal file.
        print('DataLoader loading proposal file: ', opt.proposal_h5)
        h5_proposal_file = h5py.File(self.opt.proposal_h5, 'r', driver='core')
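        # driver='core' loads the whole HDF5 file into memory up front,
        # trading RAM for fast per-batch access.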
        self.num_proposals = h5_proposal_file['dets_num'][:]
        self.label_proposals = h5_proposal_file['dets_labels'][:]
        self.num_nms = h5_proposal_file['nms_num'][:]
        h5_proposal_file.close()

        # load the COCO ground-truth bounding boxes.
        det_train_path = '%s/coco/annotations/instances_train2014.json' % (
            opt.data_path)
        det_val_path = '%s/coco/annotations/instances_val2014.json' % (
            opt.data_path)

        self.coco_train = COCO(det_train_path)
        self.coco_val = COCO(det_val_path)

        # category id to labels. +1 because 0 is the background label.
        self.ctol = {c: i + 1 for i, c in enumerate(self.coco_val.cats.keys())}
        self.itoc = {
            i + 1: c['name']
            for i, c in enumerate(self.coco_val.cats.values())
        }
        self.ctoi = {c: i for i, c in self.itoc.items()}

        self.glove_clss = np.zeros((len(self.itoc) + 1, 300))
        for i, word in enumerate(self.itoc.values()):
            vector = np.zeros((300))
            count = 0
            # if we decode novel words, replace the word representation based on the dictionary.
            if opt.decode_noc and word in utils.noc_word_map:
                word = utils.noc_word_map[word]

            for w in word.split(' '):
                count += 1
                if w in self.glove.stoi:
                    glove_vector = self.glove.vectors[self.glove.stoi[w]]
                    vector += glove_vector.numpy()
                else:  # use a random vector instead
                    random_vector = 2 * np.random.rand(300) - 1
                    vector += random_vector
            self.glove_clss[i + 1] = vector / count

        self.detect_size = len(self.ctol)
        self.fg_size = len(self.dtoi)
        # get the fine-grained mask.
        self.fg_mask = np.ones((self.detect_size + 1, self.fg_size + 1))
        for w, det in self.wtod.items():
            self.fg_mask[det, self.dtoi[w]] = 0
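        # fg_mask[det, fg] == 0 marks a valid (coarse detection class,
        # fine-grained word) pair; the remaining entries stay 1 and are
        # presumably used to mask out invalid pairs downstream.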

        # separate out indexes for each of the provided splits
        self.split_ix = []
        for ix in range(len(self.info['images'])):
            img = self.info['images'][ix]
            if img['split'] == split:
                self.split_ix.append(ix)
        print('assigned %d images to split %s' % (len(self.split_ix), split))
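Example #2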
    def __init__(self, opt, split='train', seq_per_img=5):
        
        self.opt = opt
        self.batch_size = self.opt.batch_size
        self.seq_length = opt.seq_length
        self.split = split
        self.seq_per_img = seq_per_img
        
        # image processing function.
        if split == 'train':
            self.Resize = transforms.Resize((self.opt.image_size, self.opt.image_size))
        else:
            self.Resize = transforms.Resize((self.opt.image_crop_size, self.opt.image_crop_size))

        self.RandomCropWithBbox = utils.RandomCropWithBbox(opt.image_crop_size)
        self.ToTensor = transforms.ToTensor()
        self.res_Normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        self.vgg_pixel_mean = np.array([[[102.9801, 115.9465, 122.7717]]])

        self.max_gt_box = 100
        self.max_proposal = 200
        
        if opt.glove_6B_300:
            self.glove = vocab.GloVe(name='6B', dim=300)

        if opt.bert_base_768:
            self.bertmodel, self.bertvocab = Gnlp.model.get_model(
                name='bert_12_768_12',
                dataset_name='book_corpus_wiki_en_uncased',
                pretrained=True,
                ctx=mx.cpu(),
                use_pooler=False,
                use_decoder=False,
                use_classifier=False)
            self.bertembed = BertEmbedding(
                ctx=mx.cpu(),
                dtype='float32',
                model='bert_12_768_12',
                dataset_name='book_corpus_wiki_en_uncased',
                params_path=None,
                max_seq_length=25,
                batch_size=25)
            # only the vocab is used below; the model itself is dropped,
            # presumably to free memory.
            self.bertmodel = None

        if opt.det_oracle:
            print('Training and Inference under oracle Mode...')

        # load the json file which contains additional information about the dataset
        print('DataLoader loading json file: ', opt.input_dic)
        self.info = json.load(open(self.opt.input_dic))
        self.itow = self.info['ix_to_word']
        self.wtoi = {w:i for i,w in self.itow.items()}
        self.wtod = {w:i+1 for w,i in self.info['wtod'].items()} # word to detection
        self.dtoi = {w:i+1 for i,w in enumerate(self.wtod.keys())} # detection to index
        self.itod = {i+1:w for i,w in enumerate(self.wtod.keys())}
        self.wtol = self.info['wtol']
        self.ltow = {l:w for w,l in self.wtol.items()}
        self.vocab_size = len(self.itow) + 1 # word indices start from 1
        print('vocab size is ', self.vocab_size)

        # initialize the fg + singular/plural map back to word indices.
        self.st2towidx = np.zeros(len(self.dtoi)*2+1) # stage 2 to word index
        for w, i in self.dtoi.items():
            s2_idx = i * 2 - 1
            if w not in self.wtoi:
                w = 'UNK'
            w_idx = self.wtoi[w]
            self.st2towidx[s2_idx] = w_idx
            # get the plural idx.
            if w in self.ltow:
                pw = self.ltow[w]
                w_idx = self.wtoi[pw]
            self.st2towidx[s2_idx+1] = w_idx
        

        # get the glove vector for the fg detections.
        if opt.glove_6B_300:
            self.glove_fg = np.zeros((len(self.dtoi)+1, 300))
            for i, word in enumerate(self.dtoi.keys()):
                vector = np.zeros((300))
                count = 0
                for w in word.split(' '):
                    count += 1
                    if w in self.glove.stoi:
                        glove_vector = self.glove.vectors[self.glove.stoi[w]]
                        vector += glove_vector.numpy()
                    else: # use a random vector instead
                        random_vector = 2*np.random.rand(300) - 1
                        vector += random_vector
                self.glove_fg[i+1] = vector / count

            self.glove_w = np.zeros((len(self.wtoi)+1, 300))
            for i, word in enumerate(self.wtoi.keys()):
                vector = np.zeros((300))
                count = 0
                for w in word.split(' '):
                    count += 1
                    if w in self.glove.stoi:
                        glove_vector = self.glove.vectors[self.glove.stoi[w]]
                        vector += glove_vector.numpy()
                    else: # use a random vector instead
                        random_vector = 2*np.random.rand(300) - 1
                        vector += random_vector
                self.glove_w[i+1] = vector / count


        # get the bert vector for the fg detections.
        if opt.bert_base_768:
            self.bert_fg = np.zeros((len(self.dtoi)+1, 768))
            for i, word in enumerate(self.dtoi.keys()):
                vector = np.zeros((768))
                for w in word.split(' '):
                    if w in self.bertvocab.token_to_idx:
                        temp = self.bertembed(w)
                        bert_vec = temp[0][1][0]
                        vector += np.array(bert_vec)
                    else:  # use a random vector instead
                        random_vector = 2*np.random.rand(768) - 1
                        vector += random_vector
                # note: token vectors are summed, not averaged as in the GloVe blocks.
                self.bert_fg[i+1] = vector

            self.bert_w = np.zeros((len(self.wtoi)+1, 768))
            for i, word in enumerate(self.wtoi.keys()):
                vector = np.zeros((768))
                for w in word.split(' '):
                    if w in self.bertvocab.token_to_idx:
                        temp = self.bertembed(w)
                        bert_vec = temp[0][1][0]
                        vector += np.array(bert_vec)
                    else:  # use a random vector instead
                        random_vector = 2*np.random.rand(768) - 1
                        vector += random_vector
                self.bert_w[i+1] = vector

        # open the caption json file
        print('DataLoader loading json file: ', opt.input_json)
        self.caption_file = json.load(open(self.opt.input_json))

        # open the detection proposal file.
        self.dataloader_hdf = HDFSingleDataset(self.opt.proposal_h5)
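        # HDFSingleDataset presumably reads proposals per index on demand,
        # in contrast to the driver='core' full-memory load used elsewhere.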

        # load the COCO ground-truth bounding boxes.
        det_train_path = '%s/coco/annotations/instances_train2014.json' %(opt.data_path)
        det_val_path = '%s/coco/annotations/instances_val2014.json' %(opt.data_path)

        self.coco_train = COCO(det_train_path)
        self.coco_val = COCO(det_val_path)

        # category id to labels. +1 because 0 is the background label.
        self.ctol = {c:i+1 for i, c in enumerate(self.coco_val.cats.keys())}
        self.itoc = {i+1:c['name'] for i, c in enumerate(self.coco_val.cats.values())}
        self.ctoi = {c:i for i, c in self.itoc.items()}



        if opt.bert_base_768:
            self.bert_clss = np.zeros((len(self.itoc)+1, 768))
            for i, word in enumerate(self.itoc.values()):
                vector = np.zeros((768))
                # if we decode novel words, replace the word representation based on the dictionary.
                if opt.decode_noc and word in utils.noc_word_map:
                    word = utils.noc_word_map[word]

                for w in word.split(' '):
                    if w in self.bertvocab.token_to_idx:
                        temp = self.bertembed(w)
                        bert_vec = temp[0][1][0]
                        vector += np.array(bert_vec)
                    else:  # use a random vector instead
                        random_vector = 2*np.random.rand(768) - 1
                        vector += random_vector
                self.bert_clss[i+1] = vector



        if opt.glove_6B_300:
            self.glove_clss = np.zeros((len(self.itoc)+1, 300))
            for i, word in enumerate(self.itoc.values()):
                vector = np.zeros((300))
                count = 0
                # if we decode novel words, replace the word representation based on the dictionary.
                if opt.decode_noc and word in utils.noc_word_map:
                    word = utils.noc_word_map[word]

                for w in word.split(' '):
                    count += 1
                    if w in self.glove.stoi:
                        glove_vector = self.glove.vectors[self.glove.stoi[w]]
                        vector += glove_vector.numpy()
                    else: # use a random vector instead
                        random_vector = 2*np.random.rand(300) - 1
                        vector += random_vector
                self.glove_clss[i+1] = vector / count



        self.detect_size = len(self.ctol)
        self.fg_size = len(self.dtoi)
        # get the fine-grained mask.
        self.fg_mask = np.ones((self.detect_size+1, self.fg_size+1))
        for w, det in self.wtod.items():
            self.fg_mask[det, self.dtoi[w]] = 0

        # separate out indexes for each of the provided splits
        self.split_ix = []
        for ix in range(len(self.info['images'])):
            img = self.info['images'][ix]
            if img['split'] == split:
                self.split_ix.append(ix)
        print('assigned %d images to split %s' %(len(self.split_ix), split))
Example #3
    def __init__(self, opt, split='train', seq_per_img=5):
        self.opt = opt
        self.batch_size = self.opt.batch_size
        self.seq_length = opt.seq_length
        self.split = split
        self.aug_gt_det = opt.aug_gt_det
        self.seq_per_img = seq_per_img
        self.att_feat_size = opt.att_feat_size
        self.vis_attn = opt.vis_attn
        self.feature_root = opt.feature_root
        # image processing function.
        if split == 'train':
            self.Resize = transforms.Resize(
                (self.opt.image_size, self.opt.image_size))
        else:
            self.Resize = transforms.Resize(
                (self.opt.image_crop_size, self.opt.image_crop_size))
        self.RandomCropWithBbox = utils.RandomCropWithBbox(opt.image_crop_size)
        self.ToTensor = transforms.ToTensor()
        self.res_Normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                                  [0.229, 0.224, 0.225])
        self.vgg_pixel_mean = np.array([[[102.9801, 115.9465, 122.7717]]])
        self.exclude_bgd_det = opt.exclude_bgd_det
        self.prop_thresh = opt.prop_thresh

        self.max_gt_box = 100
        self.max_proposal = 200
        self.glove = vocab.GloVe(name='6B', dim=300)

        # load the json file which contains additional information about the dataset
        print('DataLoader loading json file: ', opt.input_dic)
        self.info = json.load(open(self.opt.input_dic))
        self.itow = self.info['ix_to_word']
        self.wtoi = {w: i for i, w in self.itow.items()}
        self.wtod = {w: i + 1
                     for w, i in self.info['wtod'].items()
                     }  # word to detection
        self.dtoi = self.wtod  # detection to index
        self.itod = {i: w for w, i in self.dtoi.items()}
        self.wtol = self.info['wtol']
        self.ltow = {l: w for w, l in self.wtol.items()}
        self.vocab_size = len(self.itow) + 1  # word indices start from 1
        print('vocab size is ', self.vocab_size)
        self.itoc = self.itod

        # get the glove vector for the vg detection cls
        obj_cls_file = 'data/vg_object_vocab.txt'  # From Peter's repo
        with open(obj_cls_file) as f:
            data = f.readlines()
            classes = ['__background__']
            classes.extend([i.strip() for i in data])

        # for VG classes
        self.vg_cls = classes
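        # entries in vg_object_vocab.txt may hold comma-separated synonyms,
        # hence the replace(',', ' ') below; per-token GloVe vectors (random
        # for OOV tokens) are averaged into one 300-d embedding per class.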
        self.glove_vg_cls = np.zeros((len(classes), 300))
        for i, w in enumerate(classes):
            split_word = w.replace(',', ' ').split(' ')
            vector = []
            for word in split_word:
                if word in self.glove.stoi:
                    vector.append(
                        self.glove.vectors[self.glove.stoi[word]].numpy())
                else:  # use a random vector instead
                    vector.append(2 * np.random.rand(300) - 1)

            avg_vector = np.zeros((300))
            for v in vector:
                avg_vector += v

            self.glove_vg_cls[i] = avg_vector / len(vector)

        # open the caption json file
        print('DataLoader loading json file: ', opt.input_json)
        self.caption_file = json.load(open(self.opt.input_json))

        # open the detection proposal file.
        print('DataLoader loading proposal file: ', opt.proposal_h5)
        h5_proposal_file = h5py.File(self.opt.proposal_h5, 'r', driver='core')
        self.label_proposals = h5_proposal_file['dets_labels'][:]
        self.num_nms = h5_proposal_file['nms_num'][:]
        h5_proposal_file.close()

        # category id to labels. +1 because 0 is the background label.
        self.glove_clss = np.zeros((len(self.itod) + 1, 300))
        self.glove_clss[0] = 2 * np.random.rand(300) - 1  # background
        for i, word in enumerate(self.itod.values()):
            if word in self.glove.stoi:
                # .numpy() keeps glove_clss consistently a float ndarray,
                # matching the other embedding tables built above.
                vector = self.glove.vectors[self.glove.stoi[word]].numpy()
            else:  # use a random vector instead
                vector = 2 * np.random.rand(300) - 1
            self.glove_clss[i + 1] = vector

        if self.aug_gt_det:
            self.glove_vg_cls = np.concatenate(
                (self.glove_clss, self.glove_vg_cls), axis=0)
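            # presumably so the GT (COCO-style) class vectors occupy the first
            # rows, followed by the VG classes, matching the augmented labels.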

        self.glove_w = np.zeros((len(self.wtoi) + 1, 300))
        for i, word in enumerate(self.wtoi.keys()):
            vector = np.zeros((300))
            count = 0
            for w in word.split(' '):
                count += 1
                if w in self.glove.stoi:
                    glove_vector = self.glove.vectors[self.glove.stoi[w]]
                    vector += glove_vector.numpy()
                else:  # use a random vector instead
                    random_vector = 2 * np.random.rand(300) - 1
                    vector += random_vector
            self.glove_w[i + 1] = vector / count

        self.detect_size = len(self.itod)

        # separate out indexes for each of the provided splits
        self.split_ix = []
        for ix in range(len(self.info['images'])):
            img = self.info['images'][ix]
            if img['split'] == split:
                if opt.vis_attn:
                    # randomly sample 1% of the segments to visualize
                    if random.random() < 0.01:
                        self.split_ix.append(ix)
                else:
                    self.split_ix.append(ix)
        print('assigned %d images to split %s' % (len(self.split_ix), split))
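
A minimal usage sketch for any of the three variants, assuming the enclosing class subclasses torch.utils.data.Dataset and implements __getitem__/__len__ elsewhere in the file; the class name CaptionDataset and opt.num_workers are illustrative, not from the source:

    import torch.utils.data as data

    # build the dataset; the constructor loads dictionaries, proposals,
    # COCO annotations, and the word-embedding tables shown above.
    dataset = CaptionDataset(opt, split='train', seq_per_img=5)  # hypothetical name

    loader = data.DataLoader(dataset,
                             batch_size=opt.batch_size,
                             shuffle=True,
                             num_workers=opt.num_workers)  # assumed option

    for batch in loader:
        pass  # unpack images / proposals / captions here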