def __getitem_detector__(self, index):
    # Build a two-image training instance whose bounding boxes come from a
    # pre-trained detector.
    item_key = self.item_keys[index]
    item = self.items[item_key]
    sample = {}
    image_file_name_0 = item['image1']
    image_file_name_1 = item['image2']
    masks_0 = self.get_image_masks_by_training_index(index, 0)
    masks_1 = self.get_image_masks_by_training_index(index, 1)
    image_0, sample_0 = self.__getimage_detector__(image_file_name_0, masks_0)
    image_1, sample_1 = self.__getimage_detector__(image_file_name_1, masks_1)
    image = torch.stack((image_0, image_1), dim=0)

    sample["boxes_0"] = ArrayTensorField(sample_0["boxes"])
    sample["boxes_1"] = ArrayTensorField(sample_1["boxes"])
    sample["objects_0"] = sample_0["objects"]
    sample["objects_1"] = sample_1["objects"]

    if item.get("label", None) is not None:
        sample["next_image_label"] = np.array([1 if item["label"] else 0])
    else:
        sample["next_image_label"] = np.array([0])
    sample["next_image_label"] = IntArrayField(sample["next_image_label"])
    sample["is_random_next"] = sample["next_image_label"]
    return image, Instance(sample)
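# A DataLoader over this dataset needs a custom collate function, because
# __getitem_detector__ returns a raw (image, Instance) pair instead of a dict.
# Minimal sketch, assuming the custom ArrayTensorField / IntArrayField fields
# follow AllenNLP's Field API; the helper name and the Batch import path are
# assumptions (the import path varies across AllenNLP versions).
def collate_detector_batch(samples):
    from allennlp.data import Batch, Vocabulary
    # Assumes resize_image pads all images in a batch to the same size.
    images = torch.stack([image for image, _ in samples], dim=0)
    batch = Batch([instance for _, instance in samples])
    batch.index_instances(Vocabulary())  # array fields need no real vocabulary
    return images, batch.as_tensor_dict(batch.get_padding_lengths())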
def __getimage__(self, image_file_path):
    sample = {}
    ###################################################################
    # Mostly adapted from VCR.
    # Load the image and rescale it; mean subtraction and normalization
    # happen in to_tensor_and_normalize.
    if '.npz' in image_file_path:
        image_file_path = os.path.splitext(image_file_path)[0]
    image = load_image(image_file_path)
    image, window, img_scale, padding = resize_image(
        image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    c, h, w = image.shape
    ###################################################################
    # Treat the entire image as a single detected box.
    boxes = np.array([window])
    obj_labels = [0]
    if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        import ipdb
        ipdb.set_trace()
    sample["objects"] = IntArrayField(np.array(obj_labels))
    sample['boxes'] = torch.Tensor(boxes)
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
    assert np.all(boxes[:, 2] <= w)
    assert np.all(boxes[:, 3] <= h)
    return image, sample
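# Both __getimage__ and __getimage_detector__ enforce the same [x1, y1, x2, y2]
# box invariants through repeated asserts. A hypothetical helper that factors
# them out (not part of the original code):
def validate_boxes(boxes, w, h):
    # Boxes are [x1, y1, x2, y2] in pixel coordinates of the resized image.
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))  # 0 <= x1 < x2
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))  # 0 <= y1 < y2
    assert np.all(boxes[:, 2] <= w)  # x2 within image width
    assert np.all(boxes[:, 3] <= h)  # y2 within image height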
def __getitem__(self, index):
    if self.image_feature_type == "r2c":
        return self.__getitem_detector__(index)

    item = self.items[index]
    sample = {}
    if not self.text_only:
        image_feat_variable, image_boxes, image_dim_variable = \
            self.get_image_features_by_training_index(index)
        image_feat_variable = ArrayField(image_feat_variable)
        image_dim_variable = IntArrayField(np.array(image_dim_variable))
        sample["image_feat_variable"] = image_feat_variable
        sample["image_dim_variable"] = image_dim_variable
        # Placeholder so every sample carries a label field; the masked-LM /
        # next-sentence targets added below hold the real supervision.
        sample["label"] = image_dim_variable
    else:
        sample["label"] = IntArrayField(np.array([0]))

    caption_a = item["caption"]
    imageID = item["image_id"]

    if self.expanded and index >= self.train_size:
        coco = self.coco_val
    else:
        coco = self.coco
    rest_anns = coco.loadAnns(
        [i for i in coco.getAnnIds(imgIds=imageID) if i != item['id']])

    if self.args.get("two_sentence", True):
        if random.random() > 0.5:
            # Negative pair: a caption drawn from a different image.
            item_b = self.items[random.randint(0, len(self.items) - 1)]
            while item_b["image_id"] == imageID:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
            flag = False
        else:
            # Positive pair: another caption of the same image.
            item_b = rest_anns[random.randint(0, len(rest_anns) - 1)]
            flag = True

        caption_b = item_b["caption"]
        subword_tokens_a = self.tokenizer.tokenize(caption_a)
        subword_tokens_b = self.tokenizer.tokenize(caption_b)
        bert_example = InputExample(unique_id=index,
                                    text_a=subword_tokens_a,
                                    text_b=subword_tokens_b,
                                    is_correct=flag,
                                    max_seq_length=self.max_seq_length)
    elif not self.args.get("no_next_sentence", False):
        if random.random() < self.args.false_caption_ratio:
            # Mismatched caption from a different image.
            item_b = self.items[random.randint(0, len(self.items) - 1)]
            while item_b["image_id"] == imageID:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
            flag = False
        else:
            item_b = item
            flag = True

        caption_b = item_b["caption"]
        subword_tokens_b = self.tokenizer.tokenize(caption_b)
        bert_example = InputExample(unique_id=index,
                                    text_a=subword_tokens_b,
                                    text_b=None,
                                    is_correct=flag,
                                    max_seq_length=self.max_seq_length)
    else:
        caption_b = item["caption"]
        subword_tokens_b = self.tokenizer.tokenize(caption_b)
        bert_example = InputExample(unique_id=index,
                                    text_a=subword_tokens_b,
                                    text_b=None,
                                    is_correct=None,
                                    max_seq_length=self.max_seq_length)

    bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
        example=bert_example,
        tokenizer=self.tokenizer,
        probability=self.masked_lm_prob)
    bert_feature.insert_field_into_dict(sample)
    return Instance(sample)
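# The two-sentence branch above implements sentence-image matching: with
# probability 0.5 the image is paired with a caption from a different image
# (is_correct=False), otherwise with another caption of the same image
# (is_correct=True). A standalone sketch of that sampling rule; the helper
# name sample_caption_pair is hypothetical:
def sample_caption_pair(items, rest_anns, image_id):
    if random.random() > 0.5:
        # Negative: caption drawn from a different image.
        item_b = random.choice(items)
        while item_b["image_id"] == image_id:
            item_b = random.choice(items)
        return item_b["caption"], False
    # Positive: another human caption of the same image.
    return random.choice(rest_anns)["caption"], True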
def __getitem__(self, index):
    entry = self.entries[index]
    sentence = entry['sentence']
    e_pos = entry['entity_indices']
    e_num = entry['entity_num']
    target = entry['target_indices']
    entity_ids = entry['entity_ids']
    entity_types = entry['entity_types']

    if self.use_visual_genome:
        features = self.features[
            self.pos_boxes[entry['image']][0]:self.pos_boxes[entry['image']][1], :]
        spatials = self.spatials[
            self.pos_boxes[entry['image']][0]:self.pos_boxes[entry['image']][1], :]
    else:
        image_id = entry["image"]
        features, cls_boxes, max_conf, image_h, image_w = \
            self.features_chunk[image_id]

    if self.add_spatial_features:
        # Spatial features are only loaded on the visual-genome path above.
        features = np.concatenate((features, spatials), axis=1)
    else:
        spatials = None

    sample = {}
    image_feat_variable = ArrayField(features)
    image_dim_variable = IntArrayField(np.array(len(features)))
    sample["image_feat_variable"] = image_feat_variable
    sample["image_dim_variable"] = image_dim_variable

    tokenized_sentence, alignment = retokenize_with_alignment(
        sentence.split(" "), self.tokenizer)

    # Map each entity's word position onto the position of its last sub-word.
    e_pos_after_subword = []
    for position in e_pos:
        for token_index, word_index in enumerate(alignment):
            if word_index == position:
                if token_index == len(alignment) - 1 or alignment[token_index + 1] != word_index:
                    # +1 because a [CLS] token is prepended during feature
                    # conversion.
                    e_pos_after_subword.append(token_index + 1)
    assert len(e_pos_after_subword) == len(e_pos) == len(target), \
        "entity-to-sub-word alignment failed"

    # Convert the target box indices into soft scores over all boxes.
    target_len = features.shape[0]
    new_target = []
    for i in target:
        new_i = [0.0] * target_len
        if len(i) != 0:
            score = 1.0 / len(i)
            for j in i:
                new_i[j] = score
        new_target.append(new_i)

    # target: entity_num x num_boxes
    target = ArrayField(np.array(new_target, dtype="float"), padding_value=0.0)
    original_position = IntArrayField(
        np.array(e_pos_after_subword, dtype="int"), padding_value=-1)
    # The label row can be all zeros for an entity when none of the provided
    # boxes matches it.
    sample["label"] = target
    sample["flickr_position"] = original_position

    bert_example = InputExample(unique_id=-1,
                                text_a=tokenized_sentence,
                                text_b=None,
                                is_correct=None,
                                max_seq_length=self.max_seq_length)
    if self.pretraining:
        bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
            example=bert_example,
            tokenizer=self.tokenizer,
            probability=self.masked_lm_prob)
    else:
        bert_feature = InputFeatures.convert_one_example_to_features(
            example=bert_example, tokenizer=self.tokenizer)
    bert_feature.insert_field_into_dict(sample)
    return Instance(sample)
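# The entity-position remapping above relies on retokenize_with_alignment
# returning, for every sub-word token, the index of the source word it came
# from. A minimal sketch of that assumed contract (not the repo's actual
# implementation):
def retokenize_with_alignment_sketch(words, tokenizer):
    tokens, alignment = [], []
    for word_index, word in enumerate(words):
        for subword in tokenizer.tokenize(word):
            tokens.append(subword)
            alignment.append(word_index)
    return tokens, alignment
# Example: ["a", "snowboarder"] -> tokens ["a", "snow", "##board", "##er"],
# alignment [0, 1, 1, 1]. Entity word 1 then maps to its last sub-word
# (index 3), and the +1 offset accounts for the prepended [CLS].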
def __getitem__(self, index):
    iminfo = self.items[index]
    image_feat_variable, image_boxes, image_dim_variable = \
        self.get_image_features_by_training_index(index)

    sample = {}
    image_feat_variable = ArrayField(image_feat_variable)
    image_dim_variable = IntArrayField(np.array(image_dim_variable))
    sample["image_feat_variable"] = image_feat_variable
    sample["image_dim_variable"] = image_dim_variable

    answer = None
    valid_answers_idx = np.zeros(10, np.int32)
    valid_answers_idx.fill(-1)
    answer_scores = np.zeros(self.answer_dict.num_vocab, np.float32)
    if 'answer' in iminfo:
        answer = iminfo['answer']
    elif 'valid_answers' in iminfo:
        valid_answers = iminfo['valid_answers']
        answer = np.random.choice(valid_answers)
        valid_answers_idx[:len(valid_answers)] = [
            self.answer_dict.word2idx(ans) for ans in valid_answers
        ]
        ans_idx = [self.answer_dict.word2idx(ans) for ans in valid_answers]
        answer_scores = compute_answer_scores(ans_idx,
                                              self.answer_dict.num_vocab,
                                              self.answer_dict.UNK_idx)

    if answer is not None:
        answer_idx = self.answer_dict.word2idx(answer)

    if self.advanced_vqa:
        # Predict the answer as a sequence of [MASK] tokens appended after
        # the question.
        new_answer = self.tokenized_list[self.answer_dict.word2idx(answer)]
        subword_tokens = self.tokenizer.tokenize(
            " ".join(iminfo['question_tokens']))
        subword_tokens = ["[CLS]"] + subword_tokens + ["?"]
        masked_lm_labels = [-1] * len(subword_tokens)
        for i in new_answer:
            subword_tokens.append("[MASK]")
            masked_lm_labels.append(self.tokenizer.vocab[i])
        subword_tokens.append("[SEP]")
        masked_lm_labels.append(-1)
        input_ids = [self.tokenizer.vocab[i] for i in subword_tokens]
        bert_feature = InputFeatures(unique_id=-1,
                                     tokens=subword_tokens,
                                     input_ids=input_ids,
                                     input_mask=[1] * len(input_ids),
                                     input_type_ids=[0] * len(input_ids),
                                     is_correct=1,
                                     lm_label_ids=masked_lm_labels)
        bert_feature.insert_field_into_dict(sample)
    else:
        if self.pretraining:
            item = iminfo
            if self.no_next_sentence:
                subword_tokens_a = self.tokenizer.tokenize(
                    " ".join(item['question_tokens'])) + ["?"]
                subword_tokens_b = self.tokenizer.tokenize(" ".join(answer))
                bert_example = InputExample(
                    unique_id=index,
                    text_a=subword_tokens_a + subword_tokens_b,
                    text_b=None,
                    is_correct=None,
                    max_seq_length=self.max_seq_length)
                bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
                    example=bert_example,
                    tokenizer=self.tokenizer,
                    probability=0.15)
            else:
                # This negative-sampling path is disabled.
                assert 0
                '''
                if random.random() > self.false_caption_ratio:
                    answer = answer
                    label = 1
                else:
                    while True:
                        wrong_answer = np.random.choice(self.answer_dict.word_list)
                        if wrong_answer not in valid_answers:
                            wrong_answer = answer
                            label = 0
                            break
                subword_tokens_a = self.tokenizer.tokenize(
                    " ".join(item['question_tokens'])) + ["?"]
                subword_tokens_b = self.tokenizer.tokenize(" ".join(answer))
                bert_example = InputExample(unique_id=index,
                                            text_a=subword_tokens_a,
                                            text_b=subword_tokens_b,
                                            is_correct=label,
                                            max_seq_length=self.max_seq_length)
                bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
                    example=bert_example,
                    tokenizer=self.tokenizer,
                    probability=0.15)
                '''
            bert_feature.insert_field_into_dict(sample)
        else:
            item = iminfo
            subword_tokens = self.tokenizer.tokenize(
                " ".join(item['question_tokens']))
            if self.no_next_sentence:
                # The final [MASK] token is used to predict the answer.
                subword_tokens = subword_tokens + ["?", "[MASK]"]
                subwords_b = None
            else:
                subword_tokens = subword_tokens + ["?"]
                subwords_b = ["[MASK]"]
            bert_example = InputExample(unique_id=-1,
                                        text_a=subword_tokens,
                                        text_b=subwords_b,
                                        max_seq_length=self.max_seq_length)
            bert_feature = InputFeatures.convert_one_example_to_features(
                bert_example, tokenizer=self.tokenizer)
            bert_feature.insert_field_into_dict(sample)

    if answer is not None:
        sample['label'] = ArrayField(np.array(answer_scores))
    return Instance(sample)
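# compute_answer_scores above turns the valid answers into soft scores over
# the answer vocabulary. A hypothetical sketch following the standard VQA
# accuracy rule (an answer given by k of the 10 annotators scores
# min(k / 3, 1)); the repo's actual implementation may differ:
def compute_answer_scores_sketch(answer_indices, num_vocab, unk_idx):
    scores = np.zeros(num_vocab, np.float32)
    for idx in set(answer_indices):
        if idx == unk_idx:
            continue  # ignore out-of-vocabulary answers
        scores[idx] = min(1.0, answer_indices.count(idx) / 3.0)
    return scores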
def __getimage_detector__(self, image_file_path, metadata):
    sample = {}
    ###################################################################
    # Mostly adapted from VCR.
    # Load the image and rescale it; mean subtraction and normalization
    # happen in to_tensor_and_normalize.
    if '.npz' in image_file_path:
        image_file_path = os.path.splitext(image_file_path)[0]
    image = load_image(image_file_path)
    image, window, img_scale, padding = resize_image(
        image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    c, h, w = image.shape
    ###################################################################
    # Use all detections.
    dets2use = np.arange(len(metadata['cls_boxes']))

    boxes = np.array(metadata['cls_boxes'])
    # Rescale the detector boxes into the resized, padded image coordinates.
    boxes /= img_scale
    boxes[:, :2] += np.array(padding[:2])[None]
    boxes[:, 2:] += np.array(padding[:2])[None]

    # Keep only detections with a positive object label, then prepend the
    # whole image (the resize window) as box 0 with label 0.
    obj_labels = np.asarray(metadata['objects'])
    keep_boxes = np.where(obj_labels > 0)
    boxes = boxes[keep_boxes]
    obj_labels = [0] + [int(a) for a in obj_labels[keep_boxes]]
    boxes = np.row_stack((window, boxes))

    sample["objects"] = IntArrayField(np.array(obj_labels))

    if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        import ipdb
        ipdb.set_trace()

    # If any box spills past the image boundary, shrink them all to fit.
    if np.amax(boxes[:, 2]) >= w or np.amax(boxes[:, 3]) >= h:
        scale_w = (w - 1) / np.amax(boxes[:, 2])
        scale_h = (h - 1) / np.amax(boxes[:, 3])
        boxes *= min(scale_w, scale_h)

    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
    assert np.all(boxes[:, 2] <= w)
    assert np.all(boxes[:, 3] <= h)
    sample['boxes'] = torch.Tensor(boxes)
    return image, sample
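# __getimage_detector__ expects `metadata` to provide per-detection boxes and
# integer class labels. A hypothetical usage sketch; the values, file path,
# and `dataset` variable are illustrative only:
#
#     metadata = {
#         "cls_boxes": np.array([[12.0, 30.5, 200.0, 180.0],
#                                [50.0, 60.0, 120.0, 150.0]]),
#         "objects": np.array([17, 0]),  # label 0 is dropped by keep_boxes
#     }
#     image, sample = dataset.__getimage_detector__("images/000001.jpg", metadata)
#     # sample["boxes"][0] is the whole-image window, with object label 0.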