# Module-level imports assumed by these loaders (not shown in this excerpt):
#   import random
#   import cv2
#   import numpy as np
#   from PIL import Image, ImageDraw
def __getitem__(self, idx):
    if self.testmode:
        img, phrase = self.pull_item(idx)
    else:
        img, phrase, bbox = self.pull_item(idx)
    phrase = phrase.lower()
    if self.augment:
        augment_flip, augment_hsv, augment_affine = True, True, True

    ## torch's resize transform seems buggy here, so resizing is handled
    ## separately in advance
    h, w = img.shape[0], img.shape[1]
    if self.augment:
        ## random horizontal flip
        if augment_flip and random.random() > 0.5:
            img = cv2.flip(img, 1)
            bbox[0], bbox[2] = w - bbox[2] - 1, w - bbox[0] - 1
            # swap 'left'/'right' in the phrase to match the flipped image
            phrase = phrase.replace('right', '*&^special^&*').replace(
                'left', 'right').replace('*&^special^&*', 'left')
        ## random intensity, saturation change
        if augment_hsv:
            fraction = 0.50
            img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR),
                                   cv2.COLOR_BGR2HSV)
            S = img_hsv[:, :, 1].astype(np.float32)
            V = img_hsv[:, :, 2].astype(np.float32)
            a = (random.random() * 2 - 1) * fraction + 1
            S *= a  # this line was missing; without it the saturation jitter is a no-op
            if a > 1:
                np.clip(S, a_min=0, a_max=255, out=S)
            a = (random.random() * 2 - 1) * fraction + 1
            V *= a
            if a > 1:
                np.clip(V, a_min=0, a_max=255, out=V)
            img_hsv[:, :, 1] = S.astype(np.uint8)
            img_hsv[:, :, 2] = V.astype(np.uint8)
            img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR),
                               cv2.COLOR_BGR2RGB)
        img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
        bbox[0], bbox[2] = bbox[0] * ratio + dw, bbox[2] * ratio + dw
        bbox[1], bbox[3] = bbox[1] * ratio + dh, bbox[3] * ratio + dh
        ## random affine transformation
        if augment_affine:
            img, _, bbox, M = random_affine(
                img, None, bbox,
                degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10))
    else:
        ## should be inference, or specified training
        img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
        if not self.testmode:
            bbox[0], bbox[2] = bbox[0] * ratio + dw, bbox[2] * ratio + dw
            bbox[1], bbox[3] = bbox[1] * ratio + dh, bbox[3] * ratio + dh

    ## Norm, to tensor
    if self.transform is not None:
        img = self.transform(img)
    phrase = self.tokenize_phrase(phrase)
    if not self.testmode:
        # rasterize the gt box, then downsample it into binary maps at the
        # three detection strides (32/16/8); note img.size(1/2) is (H, W)
        # while PIL expects (W, H), which only coincide for a square imsize
        object_map = Image.new("L", (img.size(1), img.size(2)))
        object_map_ = ImageDraw.Draw(object_map)
        bbox_ = list(map(int, list(bbox)))
        object_map_.rectangle(bbox_, fill="white")
        object_map_8 = (np.array(object_map.resize(
            (img.size(1) // 8, img.size(2) // 8),
            Image.BILINEAR)) > 1).astype(int)
        object_map_16 = (np.array(object_map.resize(
            (img.size(1) // 16, img.size(2) // 16),
            Image.BILINEAR)) > 1).astype(int)
        object_map_32 = (np.array(object_map.resize(
            (img.size(1) // 32, img.size(2) // 32),
            Image.BILINEAR)) > 1).astype(int)
    if self.testmode:
        return img, np.array(phrase, dtype=int), \
            np.array(ratio, dtype=np.float32), \
            np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), \
            self.images[idx][0]
    else:
        return img, object_map_32, object_map_16, object_map_8, \
            np.array(phrase, dtype=int), np.array(bbox, dtype=np.float32)
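# ---------------------------------------------------------------------------
# The loaders in this file all call a `letterbox` helper that is not shown in
# the excerpt. The sketch below is a minimal, assumed implementation in the
# common YOLO style -- scale so the longer side equals `size`, then pad to a
# square canvas -- matching the (img, mask, ratio, dw, dh) interface used
# above. The gray fill value (128) and the rounding details are assumptions,
# not the repo's confirmed behavior.
# ---------------------------------------------------------------------------
import cv2

def letterbox(img, mask, size, fill=128):
    h, w = img.shape[:2]
    ratio = float(size) / max(h, w)                    # uniform scale factor
    new_w, new_h = round(w * ratio), round(h * ratio)
    dw, dh = (size - new_w) / 2, (size - new_h) / 2    # padding per side
    img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    top, bottom = round(dh - 0.1), round(dh + 0.1)
    left, right = round(dw - 0.1), round(dw + 0.1)
    img = cv2.copyMakeBorder(img, top, bottom, left, right,
                             cv2.BORDER_CONSTANT, value=(fill, fill, fill))
    if mask is not None:
        # nearest-neighbor keeps the mask binary; pad with background (0)
        mask = cv2.resize(mask, (new_w, new_h),
                          interpolation=cv2.INTER_NEAREST)
        mask = cv2.copyMakeBorder(mask, top, bottom, left, right,
                                  cv2.BORDER_CONSTANT, value=0)
    return img, mask, ratio, dw, dh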
def __getitem__(self, idx):
    img, phrase, bbox = self.pull_item(idx)
    # phrase = phrase.decode("utf-8").encode().lower()
    phrase = phrase.lower()
    if self.augment:
        augment_flip, augment_hsv, augment_affine = True, True, True

    ## torch's resize transform seems buggy here, so resizing is handled
    ## separately in advance
    h, w = img.shape[0], img.shape[1]
    if self.augment:
        ## random horizontal flip
        if augment_flip and random.random() > 0.5:
            img = cv2.flip(img, 1)
            bbox[0], bbox[2] = w - bbox[2] - 1, w - bbox[0] - 1
            # swap 'left'/'right' in the phrase to match the flipped image
            phrase = phrase.replace('right', '*&^special^&*').replace(
                'left', 'right').replace('*&^special^&*', 'left')
        ## random intensity, saturation change
        if augment_hsv:
            fraction = 0.50
            img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR),
                                   cv2.COLOR_BGR2HSV)
            S = img_hsv[:, :, 1].astype(np.float32)
            V = img_hsv[:, :, 2].astype(np.float32)
            a = (random.random() * 2 - 1) * fraction + 1
            S *= a  # this line was missing; restores the saturation jitter
            if a > 1:
                np.clip(S, a_min=0, a_max=255, out=S)
            a = (random.random() * 2 - 1) * fraction + 1
            V *= a
            if a > 1:
                np.clip(V, a_min=0, a_max=255, out=V)
            img_hsv[:, :, 1] = S.astype(np.uint8)
            img_hsv[:, :, 2] = V.astype(np.uint8)
            img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR),
                               cv2.COLOR_BGR2RGB)
        img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
        bbox[0], bbox[2] = bbox[0] * ratio + dw, bbox[2] * ratio + dw
        bbox[1], bbox[3] = bbox[1] * ratio + dh, bbox[3] * ratio + dh
        ## random affine transformation
        if augment_affine:
            img, _, bbox, M = random_affine(
                img, None, bbox,
                degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10))
    else:
        ## should be inference, or specified training
        img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
        bbox[0], bbox[2] = bbox[0] * ratio + dw, bbox[2] * ratio + dw
        bbox[1], bbox[3] = bbox[1] * ratio + dh, bbox[3] * ratio + dh

    ## Norm, to tensor
    if self.transform is not None:
        img = self.transform(img)
    if self.lstm:
        phrase = self.tokenize_phrase(phrase)
        word_id = phrase
        # word_mask = np.zeros(word_id.shape)
        word_mask = np.array(word_id > 0, dtype=int)
    else:
        ## encode phrase to bert input
        examples = read_examples(phrase, idx)
        features = convert_examples_to_features(
            examples=examples, seq_length=self.query_len,
            tokenizer=self.tokenizer)
        word_id = features[0].input_ids
        word_mask = features[0].input_mask
    if self.testmode:
        return img, np.array(word_id, dtype=int), \
            np.array(word_mask, dtype=int), \
            np.array(bbox, dtype=np.float32), \
            np.array(ratio, dtype=np.float32), \
            np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), \
            self.images[idx][0]
    else:
        return img, np.array(word_id, dtype=int), \
            np.array(word_mask, dtype=int), np.array(bbox, dtype=np.float32)
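# ---------------------------------------------------------------------------
# The BERT branch above relies on read_examples / convert_examples_to_features
# from the old pytorch-pretrained-bert examples. As a point of reference, the
# sketch below produces an equivalent word_id / word_mask pair with the modern
# Hugging Face `transformers` API; the function name `encode_phrase` and the
# checkpoint are illustrative assumptions, not part of this repo.
# ---------------------------------------------------------------------------
from transformers import BertTokenizer

def encode_phrase(phrase, tokenizer, seq_length):
    # pad/truncate to a fixed query length; attention_mask is 1 on real
    # tokens and 0 on padding, mirroring features[0].input_mask above
    enc = tokenizer(phrase, max_length=seq_length,
                    padding='max_length', truncation=True)
    return enc['input_ids'], enc['attention_mask']

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# word_id, word_mask = encode_phrase('man on the left', tokenizer, 20)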
def __getitem__(self, idx):
    # this variant additionally loads a segmentation mask, the image path,
    # and a cluster label; module-level `from scipy import ndimage` assumed
    img, phrase, bbox, mask, img_path, clust_label = self.pull_item(idx)
    # phrase = phrase.decode("utf-8").encode().lower()
    phrase = phrase.lower()
    # mask center of mass, flipped from (y, x) to (x, y); the deprecated
    # ndimage.measurements namespace is replaced by its modern alias
    center_gt = np.array(ndimage.center_of_mass(mask)[::-1], dtype=np.float32)
    mask_origin = mask
    image_origin = img
    if self.augment:
        augment_flip, augment_hsv, augment_affine = True, True, True

    ## torch's resize transform seems buggy here, so resizing is handled
    ## separately in advance
    h, w = img.shape[0], img.shape[1]
    if self.augment:
        ## random horizontal flip
        if augment_flip and random.random() > 0.5:
            img = cv2.flip(img, 1)
            mask = cv2.flip(mask, 1)
            bbox[0], bbox[2] = w - bbox[2] - 1, w - bbox[0] - 1
            center_gt[0] = w - center_gt[0] - 1
            # swap 'left'/'right' in the phrase to match the flipped image
            phrase = phrase.replace('right', '*&^special^&*').replace(
                'left', 'right').replace('*&^special^&*', 'left')
        ## random intensity, saturation change
        if augment_hsv:
            fraction = 0.50
            img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR),
                                   cv2.COLOR_BGR2HSV)
            S = img_hsv[:, :, 1].astype(np.float32)
            V = img_hsv[:, :, 2].astype(np.float32)
            a = (random.random() * 2 - 1) * fraction + 1
            S *= a  # this line was missing; restores the saturation jitter
            if a > 1:
                np.clip(S, a_min=0, a_max=255, out=S)
            a = (random.random() * 2 - 1) * fraction + 1
            V *= a
            if a > 1:
                np.clip(V, a_min=0, a_max=255, out=V)
            img_hsv[:, :, 1] = S.astype(np.uint8)
            img_hsv[:, :, 2] = V.astype(np.uint8)
            img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR),
                               cv2.COLOR_BGR2RGB)
        img, mask, ratio, dw, dh = letterbox(img, mask, self.imsize)
        bbox[0], bbox[2] = bbox[0] * ratio + dw, bbox[2] * ratio + dw
        bbox[1], bbox[3] = bbox[1] * ratio + dh, bbox[3] * ratio + dh
        center_gt[0], center_gt[1] = (center_gt[0] * ratio + dw,
                                      center_gt[1] * ratio + dh)
        center = center_gt
        ## random affine transformation
        if augment_affine:
            img, mask, bbox, center, M = random_affine(
                img, mask, bbox, center,
                degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10))
    else:
        ## should be inference, or specified training
        img, mask, ratio, dw, dh = letterbox(img, mask, self.imsize)
        bbox[0], bbox[2] = bbox[0] * ratio + dw, bbox[2] * ratio + dw
        bbox[1], bbox[3] = bbox[1] * ratio + dh, bbox[3] * ratio + dh
        center_gt[0], center_gt[1] = (center_gt[0] * ratio + dw,
                                      center_gt[1] * ratio + dh)
        center = center_gt
    if self.gaussian is not None:
        ## jitter the center point with gaussian noise, then clamp it back
        ## inside the letterboxed image content
        gap = np.random.normal(0, 1, 2) * self.gaussian
        center = center + gap
        center[0] = min(max(center[0], dw), self.imsize - dw)
        center[1] = min(max(center[1], dh), self.imsize - dh)
    # earlier experiments, kept for reference:
    # center[0] = (bbox[0] + bbox[2]) / 2
    # center[1] = (bbox[1] + bbox[3]) / 2
    # center_gt[0], center_gt[1] = center_gt[0] * ratio + dw, center_gt[1] * ratio + dh
    # center = center_gt  # ndimage.center_of_mass(mask)[::-1]  # (y, x)
    # bbox[0] = center[1]
    # bbox[1] = center[0]
    # center = center[::-1]

    ## Norm, to tensor
    if self.transform is not None:
        img = self.transform(img)
    if self.lstm:
        phrase = self.tokenize_phrase(phrase)
        word_id = phrase
        # note: all-zero mask here, unlike the loader above, which uses
        # np.array(word_id > 0, dtype=int)
        word_mask = np.zeros(word_id.shape)
    else:
        ## encode phrase to bert input
        examples = read_examples(phrase, idx)
        features = convert_examples_to_features(
            examples=examples, seq_length=self.query_len,
            tokenizer=self.tokenizer)
        word_id = features[0].input_ids
        word_mask = features[0].input_mask
    if self.testmode:
        return img, np.array(word_id, dtype=int), \
            np.array(word_mask, dtype=int), \
            np.array(bbox, dtype=np.float32), \
            np.array(ratio, dtype=np.float32), \
            np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), \
            self.images[idx][0], mask, np.array(center, dtype=np.float32), \
            phrase, mask_origin, image_origin
    else:
        return img, np.array(word_id, dtype=int), \
            np.array(word_mask, dtype=int), \
            np.array(bbox, dtype=np.float32), mask, \
            np.array(center, dtype=np.float32), \
            np.array(clust_label, dtype=np.float32)
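# ---------------------------------------------------------------------------
# `random_affine` is also not shown in the excerpt. Below is a minimal sketch
# inferred from the call sites above: a small rotation and isotropic scale
# about the image center followed by a random translation, applied to the
# image, the optional mask, the box (via its four warped corners), and the
# optional center point. The gray border fill and the corner-based box
# refitting follow the common YOLO-style implementation and are assumptions,
# not the repo's confirmed code.
# ---------------------------------------------------------------------------
import random
import cv2
import numpy as np

def random_affine(img, mask, bbox, center=None,
                  degrees=(-5, 5), translate=(0.10, 0.10),
                  scale=(0.90, 1.10)):
    h, w = img.shape[:2]
    # rotation and isotropic scale about the image center
    R = np.eye(3)
    R[:2] = cv2.getRotationMatrix2D((w / 2, h / 2),
                                    random.uniform(*degrees),
                                    random.uniform(*scale))
    # random translation as a fraction of the image size
    T = np.eye(3)
    T[0, 2] = random.uniform(-1, 1) * translate[0] * w
    T[1, 2] = random.uniform(-1, 1) * translate[1] * h
    M = T @ R
    img = cv2.warpPerspective(img, M, (w, h), flags=cv2.INTER_LINEAR,
                              borderValue=(128, 128, 128))
    if mask is not None:
        mask = cv2.warpPerspective(mask, M, (w, h),
                                   flags=cv2.INTER_NEAREST, borderValue=0)
    # warp the four box corners and refit an axis-aligned box
    corners = np.array([[bbox[0], bbox[1], 1.], [bbox[2], bbox[1], 1.],
                        [bbox[0], bbox[3], 1.], [bbox[2], bbox[3], 1.]]).T
    warped = M @ corners
    bbox = np.array([warped[0].min(), warped[1].min(),
                     warped[0].max(), warped[1].max()], dtype=np.float32)
    bbox[0::2] = bbox[0::2].clip(0, w - 1)
    bbox[1::2] = bbox[1::2].clip(0, h - 1)
    if center is None:
        # matches the detection loaders: img, _, bbox, M = random_affine(...)
        return img, mask, bbox, M
    center = (M @ np.array([center[0], center[1], 1.]))[:2]
    return img, mask, bbox, center.astype(np.float32), M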