def __getitem__(self, index):
    """Load one (image, label) pair, augment it, and return a training crop.

    Returns (CHW float32 BGR image, HxW uint8 train-id label map,
    original resized image shape, sample name).
    """
    record = self.files[index]
    name = record["name"]
    image = Image.open(record["img"]).convert('RGB')
    label = Image.open(record["label"])

    # Resize; optionally jitter scale uniformly in [0.8, 1.2).
    if self.scale:
        scale_factor = 0.8 + random.random() * 0.4  # 0.8 - 1.2
        target_w = round(self.resize_size[0] * scale_factor)
        target_h = round(self.resize_size[1] * scale_factor)
        image = image.resize((target_w, target_h), Image.BICUBIC)
        label = label.resize((target_w, target_h), Image.NEAREST)
    else:
        image = image.resize((self.resize_size[0], self.resize_size[1]), Image.BICUBIC)
        label = label.resize((self.resize_size[0], self.resize_size[1]), Image.NEAREST)

    label = np.asarray(label, np.uint8)
    # Re-assign labels to the Cityscapes train-id format; unmapped ids -> 255 (ignore).
    remapped = 255 * np.ones(label.shape, dtype=np.uint8)
    for raw_id, train_id in list(self.id_to_trainid.items()):
        remapped[label == raw_id] = train_id

    if self.autoaug:
        image = ImageNetPolicy()(image)

    image = np.asarray(image, np.float32)
    size = image.shape
    image = image[:, :, ::-1]  # RGB -> BGR
    image -= self.mean
    image = image.transpose((2, 0, 1))  # HWC -> CHW

    if self.set == 'train':
        # Retry the random crop up to 10 times looking for a "hard" sample
        # (more than 4 distinct label values); keep the last attempt otherwise.
        for attempt in range(10):
            top = random.randint(0, image.shape[1] - self.h)
            left = random.randint(0, image.shape[2] - self.w)
            crop_label = remapped[top:top + self.h, left:left + self.w]
            crop_image = image[:, top:top + self.h, left:left + self.w]
            if len(np.unique(crop_label)) > 4:
                break
            print('RB: Too young too naive for %d times!' % attempt)
    else:
        top = random.randint(0, image.shape[1] - self.h)
        left = random.randint(0, image.shape[2] - self.w)
        crop_image = image[:, top:top + self.h, left:left + self.w]
        crop_label = remapped[top:top + self.h, left:left + self.w]

    image = crop_image
    remapped = crop_label

    # Horizontal mirror with 50% probability.
    if self.is_mirror and random.random() < 0.5:
        image = np.flip(image, axis=2)
        remapped = np.flip(remapped, axis=1)

    return image.copy(), remapped.copy(), np.array(size), name
def __getitem__(self, index):
    """Load one (image, label) pair, resize, augment and crop it.

    Returns (CHW float32 BGR image, HxW uint8 label map,
    resized image shape, sample name).
    """
    record = self.files[index]
    name = record["name"]
    image = Image.open(record["img"]).convert('RGB')
    label = Image.open(record["label"])

    # Resize both modalities to the configured size.
    image = image.resize(self.resize_size, Image.BICUBIC)
    label = label.resize(self.resize_size, Image.NEAREST)

    if self.autoaug:
        image = ImageNetPolicy()(image)

    image = np.asarray(image, np.float32)
    label = np.asarray(label, np.uint8)

    size = image.shape
    image = image[:, :, ::-1]  # RGB -> BGR
    image -= self.mean
    image = image.transpose((2, 0, 1))  # HWC -> CHW

    # Random crop of size (h, w).
    top = random.randint(0, image.shape[1] - self.h)
    left = random.randint(0, image.shape[2] - self.w)
    image = image[:, top:top + self.h, left:left + self.w]
    label = label[top:top + self.h, left:left + self.w]

    # Horizontal mirror with 50% probability.
    if self.is_mirror and random.random() < 0.5:
        image = np.flip(image, axis=2)
        label = np.flip(label, axis=1)

    return image.copy(), label.copy(), np.array(size), name
def transform123(image, target, split):
    """Apply photometric and geometric augmentations to an image and its targets.

    :param image: PIL Image
    :param target: dict with "boxes" (n_instances, 4) boundary-coordinate boxes,
        "masks" (n_instances, HxW) instance masks, and "labels" (n_instances,)
    :param split: unused (kept for interface compatibility)
    :return: transformed image tensor and transformed target dict
    """
    # ImageNet statistics the torchvision VGG base was trained with
    # (see https://pytorch.org/docs/stable/torchvision/models.html).
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    out_image = image  # PIL image (H, W, C)
    out_targets = target
    boxes = target["boxes"]
    labels = target["labels"]
    masks = target["masks"]

    # Photometric distortion via AutoAugment, 50% chance.
    if random.random() < 0.5:
        out_image = ImageNetPolicy()(out_image)

    # Horizontal flip, 50% chance (boxes and masks are updated together).
    if random.random() < 0.5:
        out_image, out_targets["boxes"], out_targets["masks"] = flip(
            out_image, boxes, masks
        )

    # PIL (H, W, C) -> tensor (C, H, W) with pixel values in [0, 1].
    out_image = FT.to_tensor(out_image)

    # RandomRotate may drop boxes/masks/labels together; at least one is kept.
    # Border is filled with mean = [0.485, 0.456, 0.406].
    if random.random() < 0.5:
        out_image, out_targets = RandomRotate(out_image, out_targets)

    # Random crop (zoom in), 50% chance.
    if random.random() < 0.5:
        out_image, out_targets = random_crop(out_image, out_targets)

    return out_image, out_targets
def __getitem__(self, index):
    """Load one unlabeled image, augment and crop it.

    Returns (CHW float32 BGR image, resized image shape, image path).
    """
    img_path = self.files[index]
    image = Image.open(img_path).convert('RGB')

    # Resize to the configured size.
    image = image.resize(self.resize_size, Image.BICUBIC)

    if self.autoaug:
        image = ImageNetPolicy()(image)

    image = np.asarray(image, np.float32)
    size = image.shape
    image = image[:, :, ::-1]  # RGB -> BGR
    image -= self.mean
    image = image.transpose((2, 0, 1))  # HWC -> CHW

    # Random crop of size (h, w).
    top = random.randint(0, image.shape[1] - self.h)
    left = random.randint(0, image.shape[2] - self.w)
    image = image[:, top:top + self.h, left:left + self.w]

    # Horizontal mirror with 50% probability.
    if self.is_mirror and random.random() < 0.5:
        image = np.flip(image, axis=2)

    return image.copy(), np.array(size), img_path
def __getitem__(self, index):
    """Load one (image, label) pair where the label is read via OpenCV,
    remap it to a binary-style map, and return a hard training crop.

    Label remapping: everything starts at 1, `id_to_trainid` entries are
    applied, then pixels equal to `self.class_index` are forced to 0.
    Returns (CHW float32 BGR image, HxW uint8 label map,
    resized image shape, sample name).
    """
    record = self.files[index]
    name = record["name"]
    image = Image.open(record["img"]).convert('RGB')
    # Last channel of the raw label image read unchanged (-1 = IMREAD_UNCHANGED).
    label = cv2.imread(record["label"], -1)[:, :, -1]

    # Resize; optionally jitter scale uniformly in [0.8, 1.2).
    if self.scale:
        scale_factor = 0.8 + random.random() * 0.4  # 0.8 - 1.2
        target_w = round(self.resize_size[0] * scale_factor)
        target_h = round(self.resize_size[1] * scale_factor)
        image = image.resize((target_w, target_h), Image.BICUBIC)
        label = cv2.resize(label, (target_w, target_h),
                           interpolation=cv2.INTER_NEAREST)
    else:
        image = image.resize((self.resize_size[0], self.resize_size[1]), Image.BICUBIC)
        label = cv2.resize(label, (self.resize_size[0], self.resize_size[1]),
                           interpolation=cv2.INTER_NEAREST)

    if self.autoaug:
        image = ImageNetPolicy()(image)

    image = np.asarray(image, np.float32)

    # Re-assign labels: default 1, then mapped train ids, then class_index -> 0.
    remapped = np.ones(label.shape, dtype=np.uint8)
    for raw_id, train_id in list(self.id_to_trainid.items()):
        remapped[label == raw_id] = train_id
    remapped[label == self.class_index] = 0

    size = image.shape
    image = image[:, :, ::-1]  # RGB -> BGR
    image -= self.mean
    image = image.transpose((2, 0, 1))  # HWC -> CHW

    # Retry the random crop up to 10 times looking for a "hard" sample
    # (more than 10 distinct label values); keep the last attempt otherwise.
    for _ in range(10):
        top = random.randint(0, image.shape[1] - self.h)
        left = random.randint(0, image.shape[2] - self.w)
        crop_image = image[:, top:top + self.h, left:left + self.w]
        crop_label = remapped[top:top + self.h, left:left + self.w]
        if len(np.unique(crop_label)) > 10:
            break

    image = crop_image
    remapped = crop_label

    # Horizontal mirror with 50% probability.
    if self.is_mirror and random.random() < 0.5:
        image = np.flip(image, axis=2)
        remapped = np.flip(remapped, axis=1)

    return image.copy(), remapped.copy(), np.array(size), name
def main():
    """Train a triplet-embedding network for VCDB retrieval.

    Parses hyper-parameters from the command line, builds the model,
    loss, optimizer, scheduler and data loaders, then runs the training
    loop, checkpointing the embedding network after every epoch.
    """
    parser = argparse.ArgumentParser(description="Train for VCDB Retrieval.")
    parser.add_argument('-lr', '--learning_rate', type=float, default=1e-4)
    parser.add_argument('-wd', '--weight_decay', type=float, default=0)
    parser.add_argument('-m', '--margin', type=float, default=0.3)
    parser.add_argument('-c', '--comment', type=str, default='')
    parser.add_argument('-e', '--epoch', type=int, default=50)
    parser.add_argument('-b', '--batch', type=int, default=64)
    parser.add_argument('-o', '--optim', type=str, default='sgd')
    args = parser.parse_args()

    margin = args.margin
    learning_rate = args.learning_rate
    weight_decay = args.weight_decay  # 5e-5
    ckpt = None

    vcdb_positives_path = 'sampling/data/vcdb_positive.csv'
    train_triplets_path = 'sampling/data/fivr_triplet_0810.csv'  # 'sampling/fivr_triplet.csv'
    valid_triplets_path = 'sampling/data/vcdb_triplet_0806.csv'

    ckpt_dir = init_logger(args.comment)
    logger.info(args)
    logger.info(f'lr: {learning_rate}, margin: {margin}')
    logger.info(
        f'train_triplets_path: {train_triplets_path}, valid_triplets_path: {valid_triplets_path}'
    )

    # Model
    embed_net = MobileNet_RMAC()
    net = TripletNet(embed_net).cuda()
    writer.add_graph(net, [
        torch.rand((2, 3, 224, 224)).cuda(),
        torch.rand((2, 3, 224, 224)).cuda(),
        torch.rand((2, 3, 224, 224)).cuda()
    ])
    logger.info(net)

    if torch.cuda.device_count() > 1:
        net = nn.DataParallel(net)

    # Loss / optimizer
    criterion = nn.TripletMarginLoss(margin)
    l2_dist = nn.PairwiseDistance()
    optimizer = optim.SGD(net.parameters(), lr=learning_rate,
                          weight_decay=weight_decay, momentum=0.9)
    if args.optim == 'adam':
        optimizer = optim.Adam(net.parameters(), lr=learning_rate,
                               weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[100],
                                                     gamma=0.1)

    # Data
    transform = {
        'train':
        trn.Compose([
            trn.Resize((224, 224)),
            ImageNetPolicy(),
            trn.ToTensor(),
            trn.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
        ]),
        'valid':
        trn.Compose([
            trn.Resize((224, 224)),
            trn.ToTensor(),
            trn.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
        ]),
    }
    logger.info(transform)

    # FIX: honor the -b/--batch flag (was hard-coded to 64, silently
    # ignoring the argument; 64 remains the default).
    train_triplets = read_triplets(train_triplets_path)
    train_triplets_loader = DataLoader(TripletDataset(
        train_triplets, '/MLVD/FIVR/frames', transform=transform['train']),
                                       batch_size=args.batch,
                                       shuffle=True,
                                       num_workers=4)

    valid_triplets = read_triplets(valid_triplets_path)
    valid_triplets_loader = DataLoader(TripletDataset(
        valid_triplets, '/MLVD/VCDB/frames', transform=transform['valid']),
                                       batch_size=args.batch,
                                       shuffle=False,
                                       num_workers=4)

    vcdb_core = np.load('/MLVD/VCDB/meta/vcdb_core.pkl', allow_pickle=True)
    vcdb_positives = read_positive_csv(vcdb_positives_path)
    vcdb_annotation, vcdb_frame_annotation = scan_vcdb_annotation(
        '/MLVD/VCDB/annotation')
    vcdb_all_frames = np.array([
        os.path.join('/MLVD/VCDB/frames', k, f)
        for k, frames in vcdb_core.items() for f in frames
    ])
    vcdb_all_frames_loader = DataLoader(ListDataset(
        vcdb_all_frames, transform=transform['valid']),
                                        batch_size=128,
                                        shuffle=False,
                                        num_workers=4)

    # FIX: train exactly args.epoch epochs (range(1, epoch) ran epoch-1).
    for e in range(1, args.epoch + 1):
        train(net, train_triplets_loader, optimizer, criterion, l2_dist, e)
        # valid(net, valid_triplets_loader, criterion, l2_dist, e)
        positive_ranking2(net, vcdb_all_frames_loader, vcdb_frame_annotation,
                          e, 2, 1000)
        scheduler.step()

        # FIX: net.module only exists when DataParallel wrapped the model;
        # on a single-GPU machine the old code raised AttributeError here.
        embedding = (net.module.embedding_net
                     if isinstance(net, nn.DataParallel) else net.embedding_net)
        torch.save(
            {
                'epoch': e,
                'model_state_dict': embedding.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, f'{ckpt_dir}/epoch_{e}_ckpt.pth')