    # Assumed imports (not shown in this excerpt): random, numpy as np,
    # PIL.Image, and ImageNetPolicy from an AutoAugment implementation.
    def __getitem__(self, index):
        datafiles = self.files[index]
        name = datafiles["name"]

        image = Image.open(datafiles["img"]).convert('RGB')
        label = Image.open(datafiles["label"])

        if self.scale:
            random_scale = 0.8 + random.random() * 0.4  # 0.8 - 1.2
            image = image.resize((round(self.resize_size[0] * random_scale),
                                  round(self.resize_size[1] * random_scale)),
                                 Image.BICUBIC)
            label = label.resize((round(self.resize_size[0] * random_scale),
                                  round(self.resize_size[1] * random_scale)),
                                 Image.NEAREST)
        else:
            image = image.resize((self.resize_size[0], self.resize_size[1]),
                                 Image.BICUBIC)
            label = label.resize((self.resize_size[0], self.resize_size[1]),
                                 Image.NEAREST)

        label = np.asarray(label, np.uint8)
        # re-assign labels to match the format of Cityscapes
        label_copy = 255 * np.ones(label.shape, dtype=np.uint8)
        for k, v in list(self.id_to_trainid.items()):
            label_copy[label == k] = v
        if self.autoaug:
            policy = ImageNetPolicy()
            image = policy(image)
        image = np.asarray(image, np.float32)
        size = image.shape
        image = image[:, :, ::-1]  # change to BGR
        image -= self.mean
        image = image.transpose((2, 0, 1))
        if self.set == 'train':
            for i in range(10):  # find hard samples: re-crop until the patch covers more than 4 label classes
                x1 = random.randint(0, image.shape[1] - self.h)
                y1 = random.randint(0, image.shape[2] - self.w)
                tmp_label_copy = label_copy[x1:x1 + self.h, y1:y1 + self.w]
                tmp_image = image[:, x1:x1 + self.h, y1:y1 + self.w]
                u = np.unique(tmp_label_copy)
                if len(u) > 4:
                    break
                else:
                    print('crop attempt %d had too few label classes, resampling' % i)
        else:
            x1 = random.randint(0, image.shape[1] - self.h)
            y1 = random.randint(0, image.shape[2] - self.w)
            tmp_image = image[:, x1:x1 + self.h, y1:y1 + self.w]
            tmp_label_copy = label_copy[x1:x1 + self.h, y1:y1 + self.w]

        image = tmp_image
        label_copy = tmp_label_copy

        if self.is_mirror and random.random() < 0.5:
            image = np.flip(image, axis=2)
            label_copy = np.flip(label_copy, axis=1)

        return image.copy(), label_copy.copy(), np.array(size), name
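The retry loop above ("find hard samples") is the interesting part of this loader. Below is a minimal self-contained sketch of the same idea; the crop logic and the more-than-4-classes threshold mirror the snippet, while the function name, array shapes, and the 19-class label range are illustrative assumptions.

import random
import numpy as np

def hard_crop(image, label, crop_h, crop_w, min_classes=5, tries=10):
    # Re-sample the crop window until it covers at least `min_classes`
    # distinct label ids, or give up after `tries` attempts.
    for _ in range(tries):
        x1 = random.randint(0, image.shape[1] - crop_h)  # row offset
        y1 = random.randint(0, image.shape[2] - crop_w)  # column offset
        lab = label[x1:x1 + crop_h, y1:y1 + crop_w]
        img = image[:, x1:x1 + crop_h, y1:y1 + crop_w]
        if len(np.unique(lab)) >= min_classes:
            break
    return img, lab

image = np.zeros((3, 512, 1024), np.float32)             # (C, H, W), as in the snippet
label = np.random.randint(0, 19, (512, 1024)).astype(np.uint8)
img, lab = hard_crop(image, label, crop_h=256, crop_w=512)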
Example 2
    def __getitem__(self, index):
        datafiles = self.files[index]
        name = datafiles["name"]

        image, label = Image.open(datafiles["img"]).convert('RGB'), Image.open(datafiles["label"])
        # resize
        image, label = image.resize(self.resize_size, Image.BICUBIC), label.resize(self.resize_size, Image.NEAREST)
        if self.autoaug:
            policy = ImageNetPolicy()
            image = policy(image)

        image, label = np.asarray(image, np.float32), np.asarray(label, np.uint8)
        
        '''
        # re-assign labels to match the format of Cityscapes
        label_copy = 255 * np.ones(label.shape, dtype=np.uint8)
        for k, v in list(self.id_to_trainid.items()):
            label_copy[label == k] = v
        '''

        size = image.shape
        image = image[:, :, ::-1]  # change to BGR
        image -= self.mean
        image = image.transpose((2, 0, 1))
        x1 = random.randint(0, image.shape[1] - self.h)
        y1 = random.randint(0, image.shape[2] - self.w)
        image = image[:, x1:x1+self.h, y1:y1+self.w]
        label = label[x1:x1+self.h, y1:y1+self.w]

        if self.is_mirror and random.random() < 0.5:
            image = np.flip(image, axis=2)
            label = np.flip(label, axis=1)

        return image.copy(), label.copy(), np.array(size), name
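Both examples share the same preprocessing chain: RGB PIL image to float array, channel reversal to BGR, per-channel mean subtraction, then HWC to CHW. A standalone sketch of just that chain; the mean values here are the widely used Caffe/Cityscapes BGR means and are an assumption, not taken from the snippets.

import numpy as np
from PIL import Image

def preprocess(pil_img, mean_bgr):
    arr = np.asarray(pil_img, np.float32)  # (H, W, 3), RGB
    arr = arr[:, :, ::-1] - mean_bgr       # reverse channels to BGR, subtract mean
    return arr.transpose((2, 0, 1))        # channel-first (3, H, W)

mean = np.array([104.00698793, 116.66876762, 122.67891434], np.float32)  # assumed BGR mean
chw = preprocess(Image.new('RGB', (1024, 512)), mean)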
Example 3
def transform123(image, target, split):
    """
    Apply the transformations above.
    :param image: PIL image
    :param target: dict with
        boxes (n_instances, 4): bounding boxes in boundary coordinates
        masks (n_instances, H, W): one binary mask per instance in the image
        labels (n_instances,): labels of the objects
    :param split: (deprecated) one of 'TRAIN' or 'TEST', since different sets of transformations are applied
    :return: transformed image and transformed target dict (boxes, masks, labels)
    """

    # Mean and standard deviation of ImageNet data that our base VGG from torchvision was trained on
    # see: https://pytorch.org/docs/stable/torchvision/models.html
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    new_image = image  # image: PIL image (H,W,C)
    new_targets = target
    new_boxes = target["boxes"]
    new_labels = target["labels"]
    new_masks = target["masks"]

    # A series of photometric distortions in random order, each with 50% chance of occurrence, as in Caffe repo
    if random.random() < 0.5:
        #         new_image = photometric_distort(new_image)
        new_image = ImageNetPolicy()(new_image)

    # Flip image with a 50% chance
    if random.random() < 0.5:
        new_image, new_targets["boxes"], new_targets["masks"] = flip(
            new_image, new_boxes, new_masks
        )

    #####
    # Tensor Operation
    # image: PIL(H,W,C) to tensor (C,H,W),  pixel value (0,1)
    #####
    new_image = FT.to_tensor(new_image)

    # RandomRotate may drop boxes, masks, and labels (all three together); at least one instance is kept.
    if random.random() < 0.5:
        new_image, new_targets = RandomRotate(
            new_image, new_targets
        )  # rotate and fill borderValue as mean = [0.485, 0.456, 0.406]

    # Randomly crop image (zoom in)
    if random.random() < 0.5:
        new_image, new_targets = random_crop(new_image, new_targets)

    return new_image, new_targets
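The `flip` helper is called above but not shown. A hedged reconstruction, assuming boxes are (x_min, y_min, x_max, y_max) pixel coordinates in a tensor and masks are an (n_instances, H, W) tensor; the real helper may differ.

import torch
import torchvision.transforms.functional as FT

def flip(image, boxes, masks):
    # Mirror the PIL image, its boxes, and its per-instance masks horizontally.
    new_image = FT.hflip(image)
    w = image.width
    new_boxes = boxes.clone()
    new_boxes[:, 0] = w - boxes[:, 2]        # new x_min = W - old x_max
    new_boxes[:, 2] = w - boxes[:, 0]        # new x_max = W - old x_min
    new_masks = torch.flip(masks, dims=[2])  # flip along the width axis
    return new_image, new_boxes, new_masks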
Example 4
    def __getitem__(self, index):
        img_path = self.files[index]

        image = Image.open(img_path).convert('RGB')
        # resize
        image = image.resize(self.resize_size, Image.BICUBIC)
        if self.autoaug:
            policy = ImageNetPolicy()
            image = policy(image)

        image = np.asarray(image, np.float32)

        size = image.shape
        image = image[:, :, ::-1]  # change to BGR
        image -= self.mean
        image = image.transpose((2, 0, 1))
        x1 = random.randint(0, image.shape[1] - self.h)
        y1 = random.randint(0, image.shape[2] - self.w)
        image = image[:, x1:x1 + self.h, y1:y1 + self.w]

        if self.is_mirror and random.random() < 0.5:
            image = np.flip(image, axis=2)
        return image.copy(), np.array(size), img_path
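Since this test-time loader returns mean-subtracted BGR arrays, inspecting its output visually requires undoing the preprocessing. A small sketch; `mean_bgr` stands in for whatever `self.mean` holds.

import numpy as np

def deprocess(chw_bgr, mean_bgr):
    # Undo mean subtraction, go back to (H, W, 3), and reverse BGR -> RGB.
    hwc = chw_bgr.transpose((1, 2, 0)) + mean_bgr
    rgb = hwc[:, :, ::-1]
    return np.clip(rgb, 0, 255).astype(np.uint8)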
Example 5
    def __getitem__(self, index):
        datafiles = self.files[index]

        name = datafiles["name"]
        image = Image.open(datafiles["img"]).convert('RGB')
        #label = Image.open(datafiles["label"]).convert('RGB')
        #depth = Image.open(datafiles["depth"])
        label = cv2.imread(datafiles["label"], -1)[:, :, -1]

        # resize
        if self.scale:
            random_scale = 0.8 + random.random() * 0.4  # 0.8 - 1.2
            image = image.resize((round(self.resize_size[0] * random_scale),
                                  round(self.resize_size[1] * random_scale)),
                                 Image.BICUBIC)
            #label = label.resize( ( round(self.resize_size[0] * random_scale), round(self.resize_size[1] * random_scale)) , Image.NEAREST)
            #depth = depth.resize( ( round(self.resize_size[0] * random_scale), round(self.resize_size[1] * random_scale)) , Image.NEAREST)
            label = cv2.resize(label,
                               (round(self.resize_size[0] * random_scale),
                                round(self.resize_size[1] * random_scale)),
                               interpolation=cv2.INTER_NEAREST)
        else:
            image = image.resize((self.resize_size[0], self.resize_size[1]),
                                 Image.BICUBIC)
            #label = label.resize( ( self.resize_size[0], self.resize_size[1] ) , Image.NEAREST)
            #depth = depth.resize( ( self.resize_size[0], self.resize_size[1] ) , Image.NEAREST)
            label = cv2.resize(label,
                               (self.resize_size[0], self.resize_size[1]),
                               interpolation=cv2.INTER_NEAREST)

        if self.autoaug:
            policy = ImageNetPolicy()
            image = policy(image)

        image = np.asarray(image, np.float32)
        #label = np.asarray(label, np.uint8)[:,:,2]
        #depth = np.asarray(depth, np.float32)[:,:,0]
        #depth = (65536.0 / (depth + 1.0)) # inverse depth
        #depth /= np.amax(depth)

        # map dataset ids to train ids, then binarise: the selected class
        # becomes 0, everything else defaults to 1
        label_copy = np.ones(label.shape, dtype=np.uint8)
        for k, v in list(self.id_to_trainid.items()):
            label_copy[label == k] = v
        label_copy[label == self.class_index] = 0

        size = image.shape
        image = image[:, :, ::-1]  # change to BGR
        image -= self.mean
        image = image.transpose((2, 0, 1))

        for i in range(10):  # find hard samples: re-crop until the patch covers more than 10 label classes
            x1 = random.randint(0, image.shape[1] - self.h)
            y1 = random.randint(0, image.shape[2] - self.w)
            tmp_image = image[:, x1:x1 + self.h, y1:y1 + self.w]
            tmp_label_copy = label_copy[x1:x1 + self.h, y1:y1 + self.w]
            #tmp_depth = depth[x1:x1+self.h, y1:y1+self.w]
            u = np.unique(tmp_label_copy)

            if len(u) > 10:
                break

        image = tmp_image
        label_copy = tmp_label_copy
        #depth = tmp_depth

        if self.is_mirror and random.random() < 0.5:
            image = np.flip(image, axis=2)
            label_copy = np.flip(label_copy, axis=1)
            #depth = np.flip(depth, axis = 1)

        return image.copy(), label_copy.copy(), np.array(size), name
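Unlike Example 1, the relabeling here initialises to ones instead of 255 and then zeroes out `self.class_index`, producing a this-class-vs-rest mask. The same logic isolated as a function; the names and the toy mapping are illustrative.

import numpy as np

def remap_binary(label, id_to_trainid, class_index):
    out = np.ones(label.shape, dtype=np.uint8)  # ids missing from the map stay 1
    for k, v in id_to_trainid.items():          # dataset ids -> train ids
        out[label == k] = v
    out[label == class_index] = 0               # the one target class -> 0
    return out

label = np.random.randint(0, 34, (512, 1024)).astype(np.uint8)
mask = remap_binary(label, {7: 1, 8: 1}, class_index=7)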
Example 6
def main():
    parser = argparse.ArgumentParser(description="Train for VCDB Retrieval.")
    parser.add_argument('-lr', '--learning_rate', type=float, default=1e-4)
    parser.add_argument('-wd', '--weight_decay', type=float, default=0)
    parser.add_argument('-m', '--margin', type=float, default=0.3)
    parser.add_argument('-c', '--comment', type=str, default='')
    parser.add_argument('-e', '--epoch', type=int, default=50)
    parser.add_argument('-b', '--batch', type=int, default=64)
    parser.add_argument('-o', '--optim', type=str, default='sgd')
    args = parser.parse_args()

    margin = args.margin
    learning_rate = args.learning_rate
    weight_decay = args.weight_decay  # 5e-5
    ckpt = None

    vcdb_positives_path = 'sampling/data/vcdb_positive.csv'
    train_triplets_path = 'sampling/data/fivr_triplet_0810.csv'  # 'sampling/fivr_triplet.csv'
    valid_triplets_path = 'sampling/data/vcdb_triplet_0806.csv'

    ckpt_dir = init_logger(args.comment)
    logger.info(args)
    logger.info(f'lr: {learning_rate}, margin: {margin}')
    logger.info(
        f'train_triplets_path: {train_triplets_path}, valid_triplets_path: {valid_triplets_path}'
    )

    # Model
    embed_net = MobileNet_RMAC()
    net = TripletNet(embed_net).cuda()
    writer.add_graph(net, [
        torch.rand((2, 3, 224, 224)).cuda(),
        torch.rand((2, 3, 224, 224)).cuda(),
        torch.rand((2, 3, 224, 224)).cuda()
    ])
    logger.info(net)
    # logger.info(net.summary((3, 3, 224, 224)))
    if torch.cuda.device_count() > 1:
        net = nn.DataParallel(net)
    # for n,p in net.named_parameters():
    #     print(n, p.requires_grad)

    # Optimizer
    criterion = nn.TripletMarginLoss(margin)
    l2_dist = nn.PairwiseDistance()
    if args.optim == 'adam':
        optimizer = optim.Adam(net.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_decay)
    else:
        optimizer = optim.SGD(net.parameters(),
                              lr=learning_rate,
                              weight_decay=weight_decay,
                              momentum=0.9)

    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 30, 50], gamma=0.1)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[100],
                                                     gamma=0.1)
    # Data
    transform = {
        'train':
        trn.Compose([
            # trn.RandomResizedCrop(224),
            # trn.RandomRotation(30),
            # trn.RandomHorizontalFlip(p=0.3),
            # trn.RandomVerticalFlip(p=0.1),
            trn.Resize((224, 224)),
            ImageNetPolicy(),
            trn.ToTensor(),
            trn.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
        ]),
        'valid':
        trn.Compose([
            trn.Resize((224, 224)),
            trn.ToTensor(),
            trn.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
        ]),
    }
    logger.info(transform)
    train_triplets = read_triplets(train_triplets_path)
    train_triplets_loader = DataLoader(TripletDataset(
        train_triplets, '/MLVD/FIVR/frames', transform=transform['train']),
                                       batch_size=args.batch,  # honour the -b flag instead of hard-coding 64
                                       shuffle=True,
                                       num_workers=4)

    valid_triplets = read_triplets(valid_triplets_path)
    valid_triplets_loader = DataLoader(TripletDataset(
        valid_triplets, '/MLVD/VCDB/frames', transform=transform['valid']),
                                       batch_size=args.batch,
                                       shuffle=False,
                                       num_workers=4)

    vcdb_core = np.load('/MLVD/VCDB/meta/vcdb_core.pkl', allow_pickle=True)
    vcdb_positives = read_positive_csv(vcdb_positives_path)
    vcdb_annotation, vcdb_frame_annotation = scan_vcdb_annotation(
        '/MLVD/VCDB/annotation')
    vcdb_all_frames = np.array([
        os.path.join('/MLVD/VCDB/frames', k, f)
        for k, frames in vcdb_core.items() for f in frames
    ])

    vcdb_all_frames_loader = DataLoader(ListDataset(
        vcdb_all_frames, transform=transform['valid']),
                                        batch_size=128,
                                        shuffle=False,
                                        num_workers=4)

    # valid(net, valid_triplets_loader, criterion, l2_dist, 0)
    #positive_ranking2(net, vcdb_all_frames_loader, vcdb_frame_annotation, 0, 2, 1000)

    for e in range(1, args.epoch + 1):  # epochs 1..args.epoch inclusive
        train(net, train_triplets_loader, optimizer, criterion, l2_dist, e)
        # valid(net, valid_triplets_loader, criterion, l2_dist, e)
        # positive_ranking(net, vcdb_all_frames_loader, vcdb_positives, e)
        positive_ranking2(net, vcdb_all_frames_loader, vcdb_frame_annotation,
                          e, 2, 1000)
        scheduler.step()

        # print(f'[EPOCH {e}] {d}')
        # net.module only exists when wrapped in nn.DataParallel above
        model = net.module if isinstance(net, nn.DataParallel) else net
        torch.save(
            {
                'epoch': e,
                'model_state_dict': model.embedding_net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, f'{ckpt_dir}/epoch_{e}_ckpt.pth')
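A hedged sketch of reloading a checkpoint written by the loop above. Only the saved dict keys come from the snippet; the embedding-network class lives in the project's own (unshown) modules, so the import is an assumption.

import torch
# from model import MobileNet_RMAC  # assumed project-local import

def load_embedding_net(path, embed_net):
    # Restore the embedding network and report which epoch it came from.
    ckpt = torch.load(path, map_location='cpu')
    embed_net.load_state_dict(ckpt['model_state_dict'])
    embed_net.eval()
    return ckpt['epoch']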