def __init__(self, opt, split='train', seq_per_img=5):
    """Load vocabularies, GloVe tables, proposals and COCO boxes for a split.

    Args:
        opt: Options object. Attributes read here: ``batch_size``,
            ``seq_length``, ``image_size``, ``image_crop_size``,
            ``det_oracle``, ``input_dic``, ``input_json``, ``proposal_h5``,
            ``data_path`` and ``decode_noc``.
        split: Images whose ``'split'`` field equals this value are kept in
            ``self.split_ix``. ``'train'`` resizes to ``opt.image_size``;
            any other value resizes to ``opt.image_crop_size``.
        seq_per_img: Stored as ``self.seq_per_img``.
    """
    self.opt = opt
    self.batch_size = self.opt.batch_size
    self.seq_length = opt.seq_length
    self.split = split
    # The keyword argument (not opt.seq_per_img) decides the value.
    self.seq_per_img = seq_per_img

    # image processing function.
    if split == 'train':
        resize_to = (self.opt.image_size, self.opt.image_size)
    else:
        resize_to = (self.opt.image_crop_size, self.opt.image_crop_size)
    self.Resize = transforms.Resize(resize_to)
    self.RandomCropWithBbox = utils.RandomCropWithBbox(opt.image_crop_size)
    self.ToTensor = transforms.ToTensor()
    self.res_Normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                              [0.229, 0.224, 0.225])
    self.vgg_pixel_mean = np.array([[[103.939, 116.779, 123.68]]])
    self.max_gt_box = 100
    self.max_proposal = 200
    self.glove = vocab.GloVe(name='6B', dim=300)

    def mean_glove(phrase):
        # Average the GloVe vectors of the space-separated tokens of
        # `phrase`. A token missing from GloVe contributes a random vector
        # drawn uniformly from [-1, 1) instead.
        tokens = phrase.split(' ')
        total = np.zeros(300)
        for token in tokens:
            if token in self.glove.stoi:
                total += self.glove.vectors[self.glove.stoi[token]].numpy()
            else:
                total += 2 * np.random.rand(300) - 1
        return total / len(tokens)

    if opt.det_oracle:
        print('Training and Inference under oracle Mode...')

    # load the json file which contains additional information about the
    # dataset
    print('DataLoader loading json file: ', opt.input_dic)
    with open(self.opt.input_dic) as dic_file:
        self.info = json.load(dic_file)
    self.itow = self.info['ix_to_word']
    self.wtoi = {w: i for i, w in self.itow.items()}
    # word to detection
    self.wtod = {w: i + 1 for w, i in self.info['wtod'].items()}
    # detection to index: 1-based position of each word in wtod
    self.dtoi = {w: i + 1 for i, w in enumerate(self.wtod.keys())}
    self.itod = {row: w for w, row in self.dtoi.items()}
    self.wtol = self.info['wtol']
    self.ltow = {l: w for w, l in self.wtol.items()}
    self.vocab_size = len(self.itow) + 1  # since it start from 1
    print('vocab size is ', self.vocab_size)

    # initialize the fg+s/p map back to word idx (stage 2 to word index).
    # Slot 2*i-1 holds the word index for detection word i; slot 2*i holds
    # the index of the form found through ltow (the plural, per the original
    # comment), or the same index when ltow has no entry.
    self.st2towidx = np.zeros(len(self.dtoi) * 2 + 1)
    for w, i in self.dtoi.items():
        s2_idx = i * 2 - 1
        if w not in self.wtoi:
            w = 'UNK'
        w_idx = self.wtoi[w]
        self.st2towidx[s2_idx] = w_idx
        # get the plural idx.
        if w in self.ltow:
            w_idx = self.wtoi[self.ltow[w]]
        self.st2towidx[s2_idx + 1] = w_idx

    # get the glove vector for the fg detections (row 0 stays zero).
    self.glove_fg = np.zeros((len(self.dtoi) + 1, 300))
    for word, row in self.dtoi.items():
        self.glove_fg[row] = mean_glove(word)

    # glove vector for every vocabulary word (row 0 stays zero).
    # NOTE(review): rows follow the iteration order of wtoi, not the index
    # values stored in it -- confirm ix_to_word is listed in index order.
    self.glove_w = np.zeros((len(self.wtoi) + 1, 300))
    for row, word in enumerate(self.wtoi, start=1):
        self.glove_w[row] = mean_glove(word)

    # open the caption json file
    print('DataLoader loading json file: ', opt.input_json)
    with open(self.opt.input_json) as caption_file:
        self.caption_file = json.load(caption_file)

    # open the detection json file; the context manager closes it even if
    # one of the reads fails.
    print('DataLoader loading proposal file: ', opt.proposal_h5)
    with h5py.File(self.opt.proposal_h5, 'r',
                   driver='core') as h5_proposal_file:
        self.num_proposals = h5_proposal_file['dets_num'][:]
        self.label_proposals = h5_proposal_file['dets_labels'][:]
        self.num_nms = h5_proposal_file['nms_num'][:]

    # load the coco grounding truth bounding box.
    det_train_path = '%s/coco/annotations/instances_train2014.json' % (
        opt.data_path)
    det_val_path = '%s/coco/annotations/instances_val2014.json' % (
        opt.data_path)
    self.coco_train = COCO(det_train_path)
    self.coco_val = COCO(det_val_path)

    # category id to labels. +1 because 0 is the background label.
    self.ctol = {c: i + 1 for i, c in enumerate(self.coco_val.cats.keys())}
    self.itoc = {
        i + 1: c['name']
        for i, c in enumerate(self.coco_val.cats.values())
    }
    self.ctoi = {c: i for i, c in self.itoc.items()}

    self.glove_clss = np.zeros((len(self.itoc) + 1, 300))
    for row, name in self.itoc.items():
        # if we decode novel word, replace the word representation based on
        # the dictionary.
        if opt.decode_noc and name in utils.noc_word_map:
            name = utils.noc_word_map[name]
        self.glove_clss[row] = mean_glove(name)

    self.detect_size = len(self.ctol)
    self.fg_size = len(self.dtoi)

    # get the fine-grained mask: 0 where a fine-grained word belongs to a
    # detection class, 1 everywhere else.
    self.fg_mask = np.ones((self.detect_size + 1, self.fg_size + 1))
    for w, det in self.wtod.items():
        self.fg_mask[det, self.dtoi[w]] = 0

    # separate out indexes for each of the provided splits
    self.split_ix = [
        ix for ix, img in enumerate(self.info['images'])
        if img['split'] == split
    ]
    print('assigned %d images to split %s' % (len(self.split_ix), split))
def __init__(self, opt, split='train', seq_per_img=5):
    """Load vocabularies, GloVe/BERT tables, proposals and COCO boxes.

    GloVe tables (``glove_fg``, ``glove_w``, ``glove_clss``) are built only
    when ``opt.glove_6B_300`` is set; BERT tables (``bert_fg``, ``bert_w``,
    ``bert_clss``) only when ``opt.bert_base_768`` is set.

    Args:
        opt: Options object. Attributes read here: ``batch_size``,
            ``seq_length``, ``image_size``, ``image_crop_size``,
            ``glove_6B_300``, ``bert_base_768``, ``det_oracle``,
            ``input_dic``, ``input_json``, ``proposal_h5``, ``data_path``
            and ``decode_noc``.
        split: Images whose ``'split'`` field equals this value are kept in
            ``self.split_ix``. ``'train'`` resizes to ``opt.image_size``;
            any other value resizes to ``opt.image_crop_size``.
        seq_per_img: Stored as ``self.seq_per_img``.
    """
    self.opt = opt
    self.batch_size = self.opt.batch_size
    self.seq_length = opt.seq_length
    self.split = split
    # The keyword argument (not opt.seq_per_img) decides the value.
    self.seq_per_img = seq_per_img

    # image processing function.
    if split == 'train':
        resize_to = (self.opt.image_size, self.opt.image_size)
    else:
        resize_to = (self.opt.image_crop_size, self.opt.image_crop_size)
    self.Resize = transforms.Resize(resize_to)
    self.RandomCropWithBbox = utils.RandomCropWithBbox(opt.image_crop_size)
    self.ToTensor = transforms.ToTensor()
    self.res_Normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                              [0.229, 0.224, 0.225])
    self.vgg_pixel_mean = np.array([[[102.9801, 115.9465, 122.7717]]])
    self.max_gt_box = 100
    self.max_proposal = 200

    if opt.glove_6B_300:
        self.glove = vocab.GloVe(name='6B', dim=300)

    # The loaded network itself is not kept: only its vocabulary and the
    # embedding wrapper are used below, so bertmodel is always None.
    self.bertmodel = None
    if opt.bert_base_768:
        _, self.bertvocab = Gnlp.model.get_model(
            name='bert_12_768_12',
            dataset_name='book_corpus_wiki_en_uncased',
            pretrained=True,
            ctx=mx.cpu(),
            use_pooler=False,
            use_decoder=False,
            use_classifier=False)
        self.bertembed = BertEmbedding(
            ctx=mx.cpu(),
            dtype='float32',
            model='bert_12_768_12',
            dataset_name='book_corpus_wiki_en_uncased',
            params_path=None,
            max_seq_length=25,
            batch_size=25)

    def glove_mean(phrase):
        # Average the GloVe vectors of the space-separated tokens of
        # `phrase`. A token missing from GloVe contributes a random vector
        # drawn uniformly from [-1, 1) instead.
        tokens = phrase.split(' ')
        total = np.zeros(300)
        for token in tokens:
            if token in self.glove.stoi:
                total += self.glove.vectors[self.glove.stoi[token]].numpy()
            else:
                total += 2 * np.random.rand(300) - 1
        return total / len(tokens)

    def bert_sum(phrase):
        # Sum (not average) the BERT vectors of the space-separated tokens
        # of `phrase`. Only a token missing from the BERT vocabulary gets a
        # random vector from [-1, 1); the fg, word and class tables all
        # share this rule.
        total = np.zeros(768)
        for token in phrase.split(' '):
            if token in self.bertvocab.token_to_idx:
                # NOTE(review): the token is passed as a bare str. If
                # BertEmbedding expects a list of sentences, a str would be
                # consumed character by character -- confirm against the
                # BertEmbedding API.
                total += np.array(self.bertembed(token)[0][1][0])
            else:
                total += 2 * np.random.rand(768) - 1
        return total

    if opt.det_oracle:
        print('Training and Inference under oracle Mode...')

    # load the json file which contains additional information about the
    # dataset
    print('DataLoader loading json file: ', opt.input_dic)
    with open(self.opt.input_dic) as dic_file:
        self.info = json.load(dic_file)
    self.itow = self.info['ix_to_word']
    self.wtoi = {w: i for i, w in self.itow.items()}
    # word to detection
    self.wtod = {w: i + 1 for w, i in self.info['wtod'].items()}
    # detection to index: 1-based position of each word in wtod
    self.dtoi = {w: i + 1 for i, w in enumerate(self.wtod.keys())}
    self.itod = {row: w for w, row in self.dtoi.items()}
    self.wtol = self.info['wtol']
    self.ltow = {l: w for w, l in self.wtol.items()}
    self.vocab_size = len(self.itow) + 1  # since it start from 1
    print('vocab size is ', self.vocab_size)

    # initialize the fg+s/p map back to word idx (stage 2 to word index).
    # Slot 2*i-1 holds the word index for detection word i; slot 2*i holds
    # the index of the form found through ltow (the plural, per the original
    # comment), or the same index when ltow has no entry.
    self.st2towidx = np.zeros(len(self.dtoi) * 2 + 1)
    for w, i in self.dtoi.items():
        s2_idx = i * 2 - 1
        if w not in self.wtoi:
            w = 'UNK'
        w_idx = self.wtoi[w]
        self.st2towidx[s2_idx] = w_idx
        # get the plural idx.
        if w in self.ltow:
            w_idx = self.wtoi[self.ltow[w]]
        self.st2towidx[s2_idx + 1] = w_idx

    # Row 0 of every table below stays zero.
    # NOTE(review): the *_w tables use the iteration order of wtoi as the
    # row number, not the index values stored in it -- confirm ix_to_word
    # is listed in index order.
    if opt.glove_6B_300:
        # get the glove vector for the fg detections.
        self.glove_fg = np.zeros((len(self.dtoi) + 1, 300))
        for word, row in self.dtoi.items():
            self.glove_fg[row] = glove_mean(word)

        self.glove_w = np.zeros((len(self.wtoi) + 1, 300))
        for row, word in enumerate(self.wtoi, start=1):
            self.glove_w[row] = glove_mean(word)

    if opt.bert_base_768:
        # get the bert vector for the fg detections.
        self.bert_fg = np.zeros((len(self.dtoi) + 1, 768))
        for word, row in self.dtoi.items():
            self.bert_fg[row] = bert_sum(word)

        self.bert_w = np.zeros((len(self.wtoi) + 1, 768))
        for row, word in enumerate(self.wtoi, start=1):
            self.bert_w[row] = bert_sum(word)

    # open the caption json file
    print('DataLoader loading json file: ', opt.input_json)
    with open(self.opt.input_json) as caption_file:
        self.caption_file = json.load(caption_file)

    # open the detection json file.
    self.dataloader_hdf = HDFSingleDataset(self.opt.proposal_h5)

    # load the coco grounding truth bounding box.
    det_train_path = '%s/coco/annotations/instances_train2014.json' % (
        opt.data_path)
    det_val_path = '%s/coco/annotations/instances_val2014.json' % (
        opt.data_path)
    self.coco_train = COCO(det_train_path)
    self.coco_val = COCO(det_val_path)

    # category id to labels. +1 because 0 is the background label.
    self.ctol = {c: i + 1 for i, c in enumerate(self.coco_val.cats.keys())}
    self.itoc = {
        i + 1: c['name']
        for i, c in enumerate(self.coco_val.cats.values())
    }
    self.ctoi = {c: i for i, c in self.itoc.items()}

    def class_phrase(name):
        # if we decode novel word, replace the word representation based on
        # the dictionary.
        if opt.decode_noc and name in utils.noc_word_map:
            return utils.noc_word_map[name]
        return name

    if opt.bert_base_768:
        self.bert_clss = np.zeros((len(self.itoc) + 1, 768))
        for row, name in self.itoc.items():
            self.bert_clss[row] = bert_sum(class_phrase(name))

    if opt.glove_6B_300:
        self.glove_clss = np.zeros((len(self.itoc) + 1, 300))
        for row, name in self.itoc.items():
            self.glove_clss[row] = glove_mean(class_phrase(name))

    self.detect_size = len(self.ctol)
    self.fg_size = len(self.dtoi)

    # get the fine-grained mask: 0 where a fine-grained word belongs to a
    # detection class, 1 everywhere else.
    self.fg_mask = np.ones((self.detect_size + 1, self.fg_size + 1))
    for w, det in self.wtod.items():
        self.fg_mask[det, self.dtoi[w]] = 0

    # separate out indexes for each of the provided splits
    self.split_ix = [
        ix for ix, img in enumerate(self.info['images'])
        if img['split'] == split
    ]
    print('assigned %d images to split %s' % (len(self.split_ix), split))
def __init__(self, opt, split='train', seq_per_img=5):
    """Load vocabularies, GloVe tables and proposal labels for a split.

    Args:
        opt: Options object. Attributes read here: ``batch_size``,
            ``seq_length``, ``aug_gt_det``, ``att_feat_size``, ``vis_attn``,
            ``feature_root``, ``image_size``, ``image_crop_size``,
            ``exclude_bgd_det``, ``prop_thresh``, ``input_dic``,
            ``input_json`` and ``proposal_h5``.
        split: Images whose ``'split'`` field equals this value are
            candidates for ``self.split_ix``. ``'train'`` resizes to
            ``opt.image_size``; any other value resizes to
            ``opt.image_crop_size``.
        seq_per_img: Stored as ``self.seq_per_img``.
    """
    self.opt = opt
    self.batch_size = self.opt.batch_size
    self.seq_length = opt.seq_length
    self.split = split
    self.aug_gt_det = opt.aug_gt_det
    # The keyword argument (not opt.seq_per_img) decides the value.
    self.seq_per_img = seq_per_img
    self.att_feat_size = opt.att_feat_size
    self.vis_attn = opt.vis_attn
    self.feature_root = opt.feature_root

    # image processing function.
    if split == 'train':
        resize_to = (self.opt.image_size, self.opt.image_size)
    else:
        resize_to = (self.opt.image_crop_size, self.opt.image_crop_size)
    self.Resize = transforms.Resize(resize_to)
    self.RandomCropWithBbox = utils.RandomCropWithBbox(opt.image_crop_size)
    self.ToTensor = transforms.ToTensor()
    self.res_Normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                              [0.229, 0.224, 0.225])
    self.vgg_pixel_mean = np.array([[[102.9801, 115.9465, 122.7717]]])
    self.exclude_bgd_det = opt.exclude_bgd_det
    self.prop_thresh = opt.prop_thresh

    self.max_gt_box = 100
    self.max_proposal = 200
    self.glove = vocab.GloVe(name='6B', dim=300)

    def mean_glove(tokens):
        # Average the GloVe vectors of `tokens`. A token missing from GloVe
        # contributes a random vector drawn uniformly from [-1, 1) instead.
        total = np.zeros(300)
        for token in tokens:
            if token in self.glove.stoi:
                total += self.glove.vectors[self.glove.stoi[token]].numpy()
            else:
                total += 2 * np.random.rand(300) - 1
        return total / len(tokens)

    # load the json file which contains additional information about the
    # dataset
    print('DataLoader loading json file: ', opt.input_dic)
    with open(self.opt.input_dic) as dic_file:
        self.info = json.load(dic_file)
    self.itow = self.info['ix_to_word']
    self.wtoi = {w: i for i, w in self.itow.items()}
    # word to detection
    self.wtod = {w: i + 1 for w, i in self.info['wtod'].items()}
    self.dtoi = self.wtod  # detection to index (same dict object as wtod)
    self.itod = {i: w for w, i in self.dtoi.items()}
    self.wtol = self.info['wtol']
    self.ltow = {l: w for w, l in self.wtol.items()}
    self.vocab_size = len(self.itow) + 1  # since it start from 1
    print('vocab size is ', self.vocab_size)

    self.itoc = self.itod

    # get the glove vector for the vg detection cls
    obj_cls_file = 'data/vg_object_vocab.txt'  # From Peter's repo
    with open(obj_cls_file) as f:
        classes = ['__background__']
        classes.extend(line.strip() for line in f)

    # for VG classes; commas in a class entry are treated as separators.
    self.vg_cls = classes
    self.glove_vg_cls = np.zeros((len(classes), 300))
    for row, entry in enumerate(classes):
        self.glove_vg_cls[row] = mean_glove(
            entry.replace(',', ' ').split(' '))

    # open the caption json file
    print('DataLoader loading json file: ', opt.input_json)
    with open(self.opt.input_json) as caption_file:
        self.caption_file = json.load(caption_file)

    # open the detection json file; the context manager closes it even if
    # one of the reads fails.
    print('DataLoader loading proposal file: ', opt.proposal_h5)
    with h5py.File(self.opt.proposal_h5, 'r',
                   driver='core') as h5_proposal_file:
        self.label_proposals = h5_proposal_file['dets_labels'][:]
        self.num_nms = h5_proposal_file['nms_num'][:]

    # One row per detection word, looked up as a whole (no token split);
    # row 0 is a random background vector.
    # NOTE(review): rows follow the iteration order of itod, not its keys
    # -- confirm the 'wtod' values are consecutive and in order.
    self.glove_clss = np.zeros((len(self.itod) + 1, 300))
    self.glove_clss[0] = 2 * np.random.rand(300) - 1  # background
    for row, word in enumerate(self.itod.values(), start=1):
        if word in self.glove.stoi:
            vector = self.glove.vectors[self.glove.stoi[word]]
        else:
            # use a random vector instead
            vector = 2 * np.random.rand(300) - 1
        self.glove_clss[row] = vector

    # With aug_gt_det the detection-word rows are stacked in front of the
    # VG class rows.
    if self.aug_gt_det:
        self.glove_vg_cls = np.concatenate(
            (self.glove_clss, self.glove_vg_cls), axis=0)

    # glove vector for every vocabulary word (row 0 stays zero).
    # NOTE(review): rows follow the iteration order of wtoi, not the index
    # values stored in it -- confirm ix_to_word is listed in index order.
    self.glove_w = np.zeros((len(self.wtoi) + 1, 300))
    for row, word in enumerate(self.wtoi, start=1):
        self.glove_w[row] = mean_glove(word.split(' '))

    self.detect_size = len(self.itod)

    # separate out indexes for each of the provided splits
    self.split_ix = []
    for ix, img in enumerate(self.info['images']):
        if img['split'] != split:
            continue
        # randomly sample 1% segments to visualize; random.random() is only
        # drawn when opt.vis_attn is set.
        if opt.vis_attn and random.random() >= 0.01:
            continue
        self.split_ix.append(ix)
    print('assigned %d images to split %s' % (len(self.split_ix), split))