Example #1
def check_images(data_dir: str):
    '''Iterate through a directory of images and find corrupt images

    Args:
        data_dir: Path to the directory containing the images

    Returns:
        (healthy_images, corrupt_images)
    '''
    dataset = LightlyDataset(from_folder=data_dir)
    filenames = dataset.get_filenames()

    def _is_corrupt(filename):
        # open inside the try block so files that fail already at
        # Image.open (e.g. truncated or unreadable) also count as corrupt
        try:
            image = Image.open(os.path.join(data_dir, filename))
            image.load()
        except IOError:
            return True
        else:
            return False

    mapped = concurrent.thread_map(
        _is_corrupt,
        filenames,
        chunksize=min(32, len(filenames))
    )
    healthy_images = [f for f, is_corrupt
                      in zip(filenames, mapped) if not is_corrupt]
    corrupt_images = [f for f, is_corrupt
                      in zip(filenames, mapped) if is_corrupt]
    return healthy_images, corrupt_images
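
A minimal usage sketch for check_images (the directory path is a placeholder; the imports used above, e.g. os, PIL.Image, LightlyDataset and the thread_map helper, are assumed to be in scope):

healthy, corrupt = check_images('/path/to/images')
print(f'{len(healthy)} healthy images, {len(corrupt)} corrupt images')
for filename in corrupt:
    print('corrupt image:', filename)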
Example #2
    def test_create_lightly_dataset_from_folder(self):
        n_subfolders = 5
        n_samples_per_subfolder = 10
        n_tot_files = n_subfolders * n_samples_per_subfolder

        dataset_dir, folder_names, sample_names = self.create_dataset(
            n_subfolders, n_samples_per_subfolder)

        dataset = LightlyDataset(from_folder=dataset_dir)
        filenames = dataset.get_filenames()

        fnames = []
        for dir_name in folder_names:
            for fname in sample_names:
                fnames.append(os.path.join(dir_name, fname))

        self.assertEqual(len(filenames), n_tot_files)
        self.assertEqual(len(dataset), n_tot_files)
        self.assertListEqual(sorted(fnames), sorted(filenames))

        out_dir = tempfile.mkdtemp()
        dataset.dump(out_dir)
        self.assertEqual(
            sum(
                len(os.listdir(os.path.join(out_dir, subdir)))
                for subdir in os.listdir(out_dir)),
            len(dataset),
        )

        shutil.rmtree(dataset_dir)
        shutil.rmtree(out_dir)
Example #3
    def test_create_lightly_dataset_from_folder_nosubdir(self):

        # create a dataset
        n_tot = 100
        dataset = torchvision.datasets.FakeData(size=n_tot,
                                                image_size=(3, 32, 32))

        tmp_dir = tempfile.mkdtemp()
        sample_names = [f'img_{i}.jpg' for i in range(n_tot)]
        for sample_idx in range(n_tot):

            data = dataset[sample_idx]
            path = os.path.join(tmp_dir, sample_names[sample_idx])
            data[0].save(path)

        # create lightly dataset
        dataset = LightlyDataset(from_folder=tmp_dir)
        filenames = dataset.get_filenames()

        # tests
        self.assertEqual(len(filenames), n_tot)
        self.assertEqual(len(dataset), n_tot)
        self.assertListEqual(sorted(sample_names), sorted(filenames))

        for i in range(n_tot):
            sample, target, fname = dataset[i]
Example #4
    def test_filenames_dataset_with_subdir(self):
        tmp_dir, folder_names, sample_names = self.create_dataset()
        folder_name_to_target = {
            folder_name: i
            for i, folder_name in enumerate(folder_names)
        }
        all_filenames = [
            os.path.join(folder_name, sample_name)
            for folder_name in folder_names for sample_name in sample_names
        ]
        n_samples = int(len(all_filenames) / 2)
        for i in range(5):
            np.random.seed(i)
            filenames = np.random.choice(all_filenames,
                                         n_samples,
                                         replace=False)

            dataset = LightlyDataset(input_dir=tmp_dir, filenames=filenames)
            filenames_dataset = dataset.get_filenames()
            self.assertEqual(len(filenames_dataset), len(dataset))
            self.assertEqual(len(filenames_dataset), len(filenames))
            self.assertEqual(set(filenames_dataset), set(filenames))
            filenames_dataset = set(filenames_dataset)
            for image, target, filename in dataset:
                self.assertIsInstance(image, Image)
                folder_name = filename.split(sep=os.sep)[0]
                self.assertEqual(target, folder_name_to_target[folder_name])
                self.assertIsInstance(filename, str)
                self.assertIn(filename, filenames_dataset)
Example #5
    def test_filenames_dataset_no_subdir(self):
        # create a dataset
        n_tot = 100
        dataset = torchvision.datasets.FakeData(size=n_tot,
                                                image_size=(3, 32, 32))

        tmp_dir = tempfile.mkdtemp()
        all_filenames = [f'img_{i}.jpg' for i in range(n_tot)]
        for sample_idx in range(n_tot):
            data = dataset[sample_idx]
            path = os.path.join(tmp_dir, all_filenames[sample_idx])
            data[0].save(path)

        n_samples = len(all_filenames) // 2
        for i in range(5):
            np.random.seed(i)
            filenames = np.random.choice(all_filenames,
                                         n_samples,
                                         replace=False)

            dataset = LightlyDataset(input_dir=tmp_dir, filenames=filenames)
            filenames_dataset = dataset.get_filenames()
            self.assertEqual(len(filenames_dataset), len(dataset))
            self.assertEqual(len(filenames_dataset), len(filenames))
            self.assertEqual(set(filenames_dataset), set(filenames))
            filenames_dataset = set(filenames_dataset)
            for image, target, filename in dataset:
                self.assertIsInstance(image, Image)
                self.assertEqual(target, 0)
                self.assertIsInstance(filename, str)
                self.assertIn(filename, filenames_dataset)
Example #6
    def test_create_lightly_with_indices(self):
        n_subfolders = 5
        n_samples_per_subfolder = 10
        n_tot_files = n_subfolders * n_samples_per_subfolder

        indices = random.sample(range(n_tot_files), 20)

        dataset_dir, folder_names, sample_names = self.create_dataset(
            n_subfolders, n_samples_per_subfolder)

        dataset = LightlyDataset(from_folder=dataset_dir, indices=indices)
        self.assertEqual(len(dataset), 20)
        self.assertLess(len(dataset), n_tot_files)

        filenames = dataset.get_filenames()
        self.assertEqual(len(filenames), 20)

        fnames = []
        for dir_name in folder_names:
            for fname in sample_names:
                fnames.append(os.path.join(dir_name, fname))

        fnames = [fnames[i] for i in indices]

        self.assertListEqual(sorted(fnames), sorted(filenames))

        shutil.rmtree(dataset_dir)
Example #7
    def test_transform_setter(self, dataset: LightlyDataset = None):

        if dataset is None:
            tmp_dir, _, _ = self.create_dataset()
            dataset = LightlyDataset(input_dir=tmp_dir)
        # the transform of both datasets should be None
        self.assertIsNone(dataset.transform)
        self.assertIsNone(dataset.dataset.transform)
        # use the setter
        dataset.transform = torchvision.transforms.ToTensor()
        # assert that the transform is set in the nested dataset
        self.assertIsNotNone(dataset.transform)
        self.assertIsNotNone(dataset.dataset.transform)
Example #8
    def test_from_torch_dataset_with_transform(self):
        dataset_ = torchvision.datasets.FakeData(size=1,
                                                 image_size=(3, 32, 32))
        dataset = LightlyDataset.from_torch_dataset(
            dataset_, transform=torchvision.transforms.ToTensor())
        self.assertIsNotNone(dataset.transform)
        self.assertIsNotNone(dataset.dataset.transform)
Example #9
def _upload_cli(cfg, is_cli_call=True):
    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)

    path_to_embeddings = cfg['embeddings']
    if path_to_embeddings and is_cli_call:
        path_to_embeddings = fix_input_path(path_to_embeddings)

    dataset_id = cfg['dataset_id']
    token = cfg['token']
    new_dataset_name = cfg['new_dataset_name']

    cli_api_args_wrong = False
    if not token:
        print_as_warning('Please specify your access token.')
        cli_api_args_wrong = True

    dataset_id_ok = dataset_id and len(dataset_id) > 0
    new_dataset_name_ok = new_dataset_name and len(new_dataset_name) > 0
    if new_dataset_name_ok and not dataset_id_ok:
        api_workflow_client = ApiWorkflowClient(token=token)
        api_workflow_client.create_dataset(dataset_name=new_dataset_name)
    elif dataset_id_ok and not new_dataset_name_ok:
        api_workflow_client = ApiWorkflowClient(token=token, dataset_id=dataset_id)
    else:
        print_as_warning('Please specify either the dataset_id of an existing dataset or a new_dataset_name.')
        cli_api_args_wrong = True

    if cli_api_args_wrong:
        print_as_warning('For help, try: lightly-upload --help')
        return

    size = cfg['resize']
    if not isinstance(size, int):
        size = tuple(size)
    transform = None
    if isinstance(size, tuple) or size > 0:
        transform = torchvision.transforms.Resize(size)

    if input_dir:
        mode = cfg['upload']
        dataset = LightlyDataset(input_dir=input_dir, transform=transform)
        api_workflow_client.upload_dataset(
            input=dataset, mode=mode, max_workers=cfg['loader']['num_workers']
        )
        print(f"Finished the upload of the dataset.")

    if path_to_embeddings:
        name = cfg['embedding_name']
        print("Starting upload of embeddings.")
        api_workflow_client.upload_embeddings(
            path_to_embeddings_csv=path_to_embeddings, name=name
        )
        print("Finished upload of embeddings.")

    if new_dataset_name_ok:
        print(f'The dataset_id of the newly created dataset is '
              f'{bcolors.OKBLUE}{api_workflow_client.dataset_id}{bcolors.ENDC}')
Example #10
    def test_create_lightly_dataset_from_torchvision(self):
        tmp_dir = tempfile.mkdtemp()

        for dataset_name in self.available_dataset_names:
            dataset = LightlyDataset(root=tmp_dir, name=dataset_name)
            self.assertIsNotNone(dataset)

        shutil.rmtree(tmp_dir)
Example #11
    def test_create_lightly_dataset_from_folder_nosubdir(self):

        # create a dataset
        n_tot = 100
        tmp_dir, sample_names = self.create_dataset_no_subdir(n_tot)

        # create lightly dataset
        dataset = LightlyDataset(input_dir=tmp_dir)
        filenames = dataset.get_filenames()

        # tests
        self.assertEqual(len(filenames), n_tot)
        self.assertEqual(len(dataset), n_tot)
        self.assertListEqual(sorted(sample_names), sorted(filenames))

        for i in range(n_tot):
            sample, target, fname = dataset[i]
Example #12
    def test_not_existing_torchvision_dataset(self):
        list_of_non_existing_names = [
            'a-random-dataset', 'cifar-100', 'googleset_ 200'
        ]
        tmp_dir = tempfile.mkdtemp()
        for dataset_name in list_of_non_existing_names:
            with self.assertRaises(ValueError):
                LightlyDataset(root=tmp_dir, name=dataset_name)
Example #13
    def test_dataset_get_filenames(self):
        self.create_video_dataset()
        dataset = LightlyDataset(input_dir=self.input_dir)
        video_dataset = dataset.dataset

        # Get filenames using VideoDataset.get_filenames.
        video_dataset_filenames = video_dataset.get_filenames()

        # Get filenames using calls to VideoDataset.get_filename(index).
        # This removes the optimization introduced in VideoDataset.get_filenames.
        # Both methods should give the same result.
        get_filenames = VideoDataset.get_filenames
        del VideoDataset.get_filenames
        lightly_dataset_filenames = dataset.get_filenames()
        VideoDataset.get_filenames = get_filenames

        assert video_dataset_filenames == lightly_dataset_filenames
Example #14
    def test_dataset_plain(self):
        tmp_dir, _ = self.create_dataset_no_subdir(100)

        with self.subTest("no read rights files"):
            for subdir, dirs, files in os.walk(tmp_dir):
                for filename in files:
                    filepath = os.path.join(tmp_dir, filename)
                    os.chmod(filepath, 0o000)
            dataset = LightlyDataset(input_dir=tmp_dir)
            self.assertGreater(len(dataset.get_filenames()), 0)
            with self.assertRaises(PermissionError):
                for _ in dataset:
                    pass

        with self.subTest("no read rights root"):
            os.chmod(tmp_dir, 0o000)
            with self.assertRaises(PermissionError):
                dataset = LightlyDataset(input_dir=tmp_dir)
Example #15
def _crop_cli(cfg, is_cli_call=True):
    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)
    output_dir = cfg['output_dir']
    if output_dir and is_cli_call:
        output_dir = fix_input_path(output_dir)
    label_dir = cfg['label_dir']
    if label_dir and is_cli_call:
        label_dir = fix_input_path(label_dir)
    label_names_file = cfg['label_names_file']
    if label_names_file and len(label_names_file) > 0:
        if is_cli_call:
            label_names_file = fix_input_path(label_names_file)
        with open(label_names_file, 'r') as file:
            label_names_file_dict = yaml.full_load(file)
        class_names = label_names_file_dict['names']
    else:
        class_names = None

    dataset = LightlyDataset(input_dir)

    class_indices_list_list: List[List[int]] = []
    bounding_boxes_list_list: List[List[BoundingBox]] = []

    # YOLO-Specific
    for filename_image in dataset.get_filenames():
        filepath_image_base, image_extension = os.path.splitext(filename_image)
        filepath_label = os.path.join(label_dir, filename_image).replace(
            image_extension, '.txt')
        class_indices, bounding_boxes = read_yolo_label_file(
            filepath_label, float(cfg['crop_padding']))
        class_indices_list_list.append(class_indices)
        bounding_boxes_list_list.append(bounding_boxes)

    cropped_images_list_list = crop_dataset_by_bounding_boxes_and_save(
        dataset, output_dir, bounding_boxes_list_list,
        class_indices_list_list, class_names)

    print(
        f'Cropped images are stored at: {bcolors.OKBLUE}{output_dir}{bcolors.ENDC}'
    )
    return cropped_images_list_list
Example #16
    def test_video_dataset_no_read_rights(self):
        self.create_dataset()

        with self.subTest("no read rights files"):
            for subdir, dirs, files in os.walk(self.input_dir):
                for filename in files:
                    filepath = os.path.join(self.input_dir, filename)
                    os.chmod(filepath, 0o000)
            with self.assertRaises(PermissionError):
                dataset = LightlyDataset(self.input_dir)

        with self.subTest("no read rights subdirs"):
            for subdir, dirs, files in os.walk(self.input_dir):
                os.chmod(subdir, 0o000)
            with self.assertRaises(PermissionError):
                dataset = LightlyDataset(self.input_dir)

        with self.subTest("no read rights root"):
            os.chmod(self.input_dir, 0o000)
            with self.assertRaises(PermissionError):
                dataset = LightlyDataset(self.input_dir)
Example #17
    def test_video_dataset(self):

        if not VIDEO_DATASET_AVAILABLE:
            tmp_dir = tempfile.mkdtemp()
            # simulate a video: the video dataset only checks whether a file
            # with a video extension exists, so it's enough to fake one here
            path = os.path.join(tmp_dir, 'my_file.png')
            dataset = torchvision.datasets.FakeData(size=1,
                                                    image_size=(3, 32, 32))
            image, _ = dataset[0]
            image.save(path)
            os.rename(path, os.path.join(tmp_dir, 'my_file.avi'))
            with self.assertRaises(ImportError):
                dataset = LightlyDataset(from_folder=tmp_dir)

            warnings.warn(
                'Did not test video dataset because of missing requirements')
            shutil.rmtree(tmp_dir)
            return

        self.create_video_dataset()
        dataset = LightlyDataset(from_folder=self.input_dir)

        out_dir = tempfile.mkdtemp()
        dataset.dump(out_dir)
        self.assertEqual(len(os.listdir(out_dir)), len(dataset))
Example #18
def upload_images_from_folder(path_to_folder: str,
                              dataset_id: str,
                              token: str,
                              max_workers: int = 8,
                              mode: str = 'thumbnails',
                              size: int = -1,
                              verbose: bool = True):
    """Uploads images from a directory to the Lightly cloud solution.

    Args:
        path_to_folder:
            Path to the folder which holds the input images.
        dataset_id:
            The unique identifier for the dataset.
        token:
            Token for authentication.
        max_workers:
            Maximum number of workers uploading images in parallel.
        verbose:
            Whether to print updates on the upload progress.
        mode:
            One of [full, thumbnails, metadata]. Whether to upload thumbnails,
            full images, or metadata only.
        size:
            Desired output size. If negative, default output size is used.
            If size is a sequence like (h, w), the output size will be matched
            to it. If size is an int, the smaller edge of the image will be
            matched to this number, i.e., if height > width, the image will be
            rescaled to (size * height / width, size).

    Raises:
        ValueError: If the dataset is too large.
        RuntimeError: If the connection to the server failed.
        RuntimeError: If the dataset already has an initial tag.

    """

    transform = None
    if isinstance(size, tuple) or size > 0:
        transform = torchvision.transforms.Resize(size)

    dataset = LightlyDataset(input_dir=path_to_folder, transform=transform)
    upload_dataset(
        dataset,
        dataset_id,
        token,
        max_workers=max_workers,
        mode=mode,
        verbose=verbose,
    )
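
A hedged usage sketch for upload_images_from_folder; the path, dataset_id, and token are placeholders. Per the docstring, an int size rescales the smaller image edge, while a (h, w) tuple resizes to exactly that shape:

upload_images_from_folder(
    '/path/to/images',
    dataset_id='YOUR_DATASET_ID',
    token='YOUR_TOKEN',
    mode='thumbnails',
    size=128,  # or e.g. size=(128, 128) for an exact (h, w) resize
)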
Example #19
    def test_crop_dataset_by_bounding_boxes_and_save(self):
        dataset = LightlyDataset(self.cfg.input_dir)
        output_dir = self.cfg.output_dir
        no_files = len(dataset.get_filenames())
        bounding_boxes_list_list = [[BoundingBox(0, 0, 1, 1)]] * no_files
        class_indices_list_list = [[1]] * no_files
        class_names = ["class_0", "class_1"]
        with self.subTest("all_correct"):
            crop_dataset_by_bounding_boxes_and_save(dataset, output_dir,
                                                    bounding_boxes_list_list,
                                                    class_indices_list_list,
                                                    class_names)
        with self.subTest("wrong length of bounding_boxes_list_list"):
            with self.assertRaises(ValueError):
                crop_dataset_by_bounding_boxes_and_save(
                    dataset, output_dir, bounding_boxes_list_list[:-1],
                    class_indices_list_list, class_names)
        with self.subTest("wrong internal length of class_indices_list_list"):
            with self.assertWarns(UserWarning):
                class_indices_list_list[0] *= 2
                crop_dataset_by_bounding_boxes_and_save(
                    dataset, output_dir, bounding_boxes_list_list,
                    class_indices_list_list, class_names)
Example #20
    def test_video_dataset_unavailable(self):
        tmp_dir = tempfile.mkdtemp()
        # simulate a video: the video dataset only checks whether a file
        # with a video extension exists, so it's enough to fake one here
        path = os.path.join(tmp_dir, 'my_file.png')
        dataset = torchvision.datasets.FakeData(size=1, image_size=(3, 32, 32))
        image, _ = dataset[0]
        image.save(path)
        os.rename(path, os.path.join(tmp_dir, 'my_file.avi'))
        with self.assertRaises(ImportError):
            dataset = LightlyDataset(input_dir=tmp_dir)

        shutil.rmtree(tmp_dir)
        return
Example #21
    def test_video_dataset_filenames(self):
        self.create_video_dataset()
        all_filenames = self.filenames

        def filename_img_fits_video(filename_img: str):
            for filename_video in all_filenames:
                filename_video = filename_video[:-1 * len('.avi')]
                if filename_video in filename_img:
                    return True
            return False

        n_samples = int(len(all_filenames) / 2)
        np.random.seed(42)
        filenames = np.random.choice(all_filenames, n_samples, replace=False)

        dataset = LightlyDataset(input_dir=self.input_dir, filenames=filenames)

        filenames_dataset = dataset.get_filenames()
        for image, target, filename in dataset:
            self.assertIsInstance(image, Image)
            self.assertTrue(filename_img_fits_video(filename))

            self.assertIsInstance(filename, str)
            self.assertIn(filename, filenames_dataset)
Example #22
    def test_video_dataset_available(self):
        self.create_video_dataset()
        dataset = LightlyDataset(input_dir=self.input_dir)

        out_dir = tempfile.mkdtemp()
        dataset.dump(out_dir, dataset.get_filenames()[(len(dataset) // 2):])
        self.assertEqual(len(os.listdir(out_dir)), len(dataset) // 2)
        for filename in os.listdir(out_dir):
            self.assertIn(filename,
                          dataset.get_filenames()[(len(dataset) // 2):])
Example #23
def _upload_cli(cfg, is_cli_call=True):
    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)

    path_to_embeddings = cfg['embeddings']
    if path_to_embeddings and is_cli_call:
        path_to_embeddings = fix_input_path(path_to_embeddings)

    dataset_id = cfg['dataset_id']
    token = cfg['token']
    new_dataset_name = cfg['new_dataset_name']

    if not token:
        warnings.warn('Please specify your access token. For help, try: lightly-upload --help')
        return

    dataset_id_ok = dataset_id and len(dataset_id) > 0
    new_dataset_name_ok = new_dataset_name and len(new_dataset_name) > 0
    if new_dataset_name_ok and not dataset_id_ok:
        api_workflow_client = ApiWorkflowClient(token=token)
        api_workflow_client.create_dataset(dataset_name=new_dataset_name)
    elif dataset_id_ok and not new_dataset_name_ok:
        api_workflow_client = ApiWorkflowClient(token=token, dataset_id=dataset_id)
    else:
        warnings.warn('Please specify either the dataset_id of an existing dataset or a new_dataset_name. '
                      'For help, try: lightly-upload --help')
        return

    size = cfg['resize']
    if not isinstance(size, int):
        size = tuple(size)
    transform = None
    if isinstance(size, tuple) or size > 0:
        transform = torchvision.transforms.Resize(size)

    if input_dir:
        mode = cfg['upload']
        dataset = LightlyDataset(input_dir=input_dir, transform=transform)
        api_workflow_client.upload_dataset(
            input=dataset, mode=mode, max_workers=cfg['loader']['num_workers']
        )

    if path_to_embeddings:
        name = cfg['embedding_name']
        api_workflow_client.upload_embeddings(
            path_to_embeddings_csv=path_to_embeddings, name=name
        )
Example #24
def _embed_cli(cfg, is_cli_call=True):
    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    transform = torchvision.transforms.Compose([
        torchvision.transforms.Resize(
            (cfg['collate']['input_size'], cfg['collate']['input_size'])),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225]),
    ])

    dataset = LightlyDataset(input_dir, transform=transform)

    # disable drop_last and shuffle
    cfg['loader']['drop_last'] = False
    cfg['loader']['shuffle'] = False
    cfg['loader']['batch_size'] = min(cfg['loader']['batch_size'],
                                      len(dataset))

    # determine the number of available cores
    if cfg['loader']['num_workers'] < 0:
        cfg['loader']['num_workers'] = cpu_count()

    dataloader = torch.utils.data.DataLoader(dataset, **cfg['loader'])

    encoder = get_model_from_config(cfg, is_cli_call)

    embeddings, labels, filenames = encoder.embed(dataloader, device=device)

    if is_cli_call:
        path = os.path.join(os.getcwd(), 'embeddings.csv')
        save_embeddings(path, embeddings, labels, filenames)
        print(f'Embeddings are stored at {bcolors.OKBLUE}{path}{bcolors.ENDC}')
        os.environ[cfg['environment_variable_names']
                   ['lightly_last_embedding_path']] = path
        return path

    return embeddings, labels, filenames
Example #25
    def test_create_lightly_dataset_with_invalid_char_in_filename(self):

        # create a dataset
        n_tot = 100
        dataset = torchvision.datasets.FakeData(size=n_tot,
                                                image_size=(3, 32, 32))

        for invalid_char in INVALID_FILENAME_CHARACTERS:
            with self.subTest(msg=f"invalid_char: {invalid_char}"):
                tmp_dir = tempfile.mkdtemp()
                sample_names = [
                    f'img_{invalid_char}_{i}.jpg' for i in range(n_tot)
                ]
                for sample_idx in range(n_tot):
                    data = dataset[sample_idx]
                    path = os.path.join(tmp_dir, sample_names[sample_idx])
                    data[0].save(path)

                # creating the lightly dataset must raise because of the
                # invalid character in the filenames
                with self.assertRaises(ValueError):
                    LightlyDataset(input_dir=tmp_dir)
Example #26
def _upload_cli(cfg, is_cli_call=True):

    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)

    path_to_embeddings = cfg['embeddings']
    if path_to_embeddings and is_cli_call:
        path_to_embeddings = fix_input_path(path_to_embeddings)

    dataset_id = cfg['dataset_id']
    token = cfg['token']

    size = cfg['resize']
    if not isinstance(size, int):
        size = tuple(size)
    transform = None
    if isinstance(size, tuple) or size > 0:
        transform = torchvision.transforms.Resize(size)

    if not token or not dataset_id:
        print('Please specify your access token and dataset id.')
        print('For help, try: lightly-upload --help')
        return

    api_workflow_client = ApiWorkflowClient(token=token, dataset_id=dataset_id)

    if input_dir:
        mode = cfg['upload']
        dataset = LightlyDataset(input_dir=input_dir, transform=transform)
        api_workflow_client.upload_dataset(input=dataset, mode=mode)

    if path_to_embeddings:
        name = cfg['embedding_name']
        api_workflow_client.upload_embeddings(
            path_to_embeddings_csv=path_to_embeddings, name=name)
Example #27
        loss = self.criterion(teacher_out, student_out,
                              epoch=self.current_epoch)
        return loss

    def configure_optimizers(self):
        optim = torch.optim.Adam(self.parameters(), lr=0.001)
        return optim


model = DINO()

# we ignore object detection annotations by setting target_transform to return 0
pascal_voc = torchvision.datasets.VOCDetection("datasets/pascal_voc",
                                               download=True,
                                               target_transform=lambda t: 0)
dataset = LightlyDataset.from_torch_dataset(pascal_voc)
# or create a dataset from a folder containing images or videos:
# dataset = LightlyDataset("path/to/folder")

collate_fn = DINOCollateFunction()

dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=64,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=8,
)

gpus = torch.cuda.device_count()
Example #28
def _upload_cli(cfg, is_cli_call=True):
    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)

    path_to_embeddings = cfg['embeddings']
    if path_to_embeddings and is_cli_call:
        path_to_embeddings = fix_input_path(path_to_embeddings)

    dataset_id = cfg['dataset_id']
    token = cfg['token']
    new_dataset_name = cfg['new_dataset_name']

    cli_api_args_wrong = False
    if not token:
        print_as_warning('Please specify your access token.')
        cli_api_args_wrong = True

    if dataset_id:
        if new_dataset_name:
            print_as_warning(
                'Please specify either the dataset_id of an existing dataset '
                'or a new_dataset_name, but not both.')
            cli_api_args_wrong = True
        else:
            api_workflow_client = \
                ApiWorkflowClient(token=token, dataset_id=dataset_id)
    else:
        if new_dataset_name:
            api_workflow_client = ApiWorkflowClient(token=token)
            api_workflow_client.create_dataset(dataset_name=new_dataset_name)
        else:
            print_as_warning(
                'Please specify either the dataset_id of an existing dataset '
                'or a new_dataset_name.')
            cli_api_args_wrong = True
    # delete the dataset_id as it might be an empty string
    # Use api_workflow_client.dataset_id instead
    del dataset_id

    if cli_api_args_wrong:
        print_as_warning('For help, try: lightly-upload --help')
        return

    # potentially load custom metadata
    custom_metadata = None
    if cfg['custom_metadata']:
        path_to_custom_metadata = fix_input_path(cfg['custom_metadata'])
        print('Loading custom metadata from '
              f'{bcolors.OKBLUE}{path_to_custom_metadata}{bcolors.ENDC}')
        with open(path_to_custom_metadata, 'r') as f:
            custom_metadata = json.load(f)

    # set the number of workers if unset
    if cfg['loader']['num_workers'] < 0:
        # set the number of workers to the number of CPUs available,
        # clamped to the range [8, 32]
        num_workers = max(8, cpu_count())
        num_workers = min(32, num_workers)
        cfg['loader']['num_workers'] = num_workers

    size = cfg['resize']
    if not isinstance(size, int):
        size = tuple(size)
    transform = None
    if isinstance(size, tuple) or size > 0:
        transform = torchvision.transforms.Resize(size)

    if input_dir:
        mode = cfg['upload']
        dataset = LightlyDataset(input_dir=input_dir, transform=transform)
        api_workflow_client.upload_dataset(
            input=dataset,
            mode=mode,
            max_workers=cfg['loader']['num_workers'],
            custom_metadata=custom_metadata,
        )
        print('Finished the upload of the dataset.')

    if path_to_embeddings:
        name = cfg['embedding_name']
        print('Starting upload of embeddings.')
        api_workflow_client.upload_embeddings(
            path_to_embeddings_csv=path_to_embeddings, name=name)
        print('Finished upload of embeddings.')

    if custom_metadata is not None and not input_dir:
        # upload custom metadata separately
        api_workflow_client.upload_custom_metadata(
            custom_metadata,
            verbose=True,
            max_workers=cfg['loader']['num_workers'],
        )

    if new_dataset_name:
        print(
            f'The dataset_id of the newly created dataset is '
            f'{bcolors.OKBLUE}{api_workflow_client.dataset_id}{bcolors.ENDC}')

    os.environ[cfg['environment_variable_names']
               ['lightly_last_dataset_id']] = api_workflow_client.dataset_id
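
The function above is normally driven by the lightly-upload command line. As a sketch only (all values are placeholders, and a plain nested dict is assumed to be an acceptable stand-in for the Hydra config), it could be invoked programmatically like this:

cfg = {
    'input_dir': '/path/to/images',
    'embeddings': '',             # empty: skip the embedding upload
    'dataset_id': '',             # empty: create a new dataset instead
    'new_dataset_name': 'my-new-dataset',
    'token': 'MY_LIGHTLY_TOKEN',
    'custom_metadata': '',
    'resize': -1,                 # negative: keep the original image size
    'upload': 'thumbnails',
    'embedding_name': 'default',
    'loader': {'num_workers': -1},
    'environment_variable_names': {
        'lightly_last_dataset_id': 'LIGHTLY_LAST_DATASET_ID',
    },
}
_upload_cli(cfg, is_cli_call=False)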
Example #29
def crop_dataset_by_bounding_boxes_and_save(
        dataset: LightlyDataset,
        output_dir: str,
        bounding_boxes_list_list: List[List[BoundingBox]],
        class_indices_list_list: List[List[int]],
        class_names: List[str] = None) -> List[List[str]]:
    """Crops all images in a dataset by the bounding boxes and saves them in the output dir

    Args:
        dataset:
            The dataset with the images to be cropped. Must contain M images.
        output_dir:
            The output directory to save the cropped images to.
        bounding_boxes_list_list:
            The bounding boxes of the detections for each image. Must have M sublists, one for each image.
            Each sublist contains the bounding boxes for each detection, thus N_m elements.
        class_indices_list_list:
            The object class ids of the detections for each image. Must have M sublists, one for each image.
            Each sublist contains the class index for each detection, thus N_m elements.
        class_names:
            The names of the classes, used to map the class id to the class name.


    Returns:
        The filepaths to all saved cropped images. Has M sublists, one for each image.
        Each sublist contains the filepath of the crop for each detection, thus N_m elements.

    """
    filenames_images = dataset.get_filenames()
    if (len(filenames_images) != len(bounding_boxes_list_list)
            or len(filenames_images) != len(class_indices_list_list)):
        raise ValueError(
            "There must be one bounding box and class index list for each "
            "image in the dataset, but the lengths don't align.")

    cropped_image_filepath_list_list: List[List[str]] = []

    print(f"Cropping objects out of {len(filenames_images)} images...")
    for filename_image, class_indices, bounding_boxes in \
            tqdm(zip(filenames_images, class_indices_list_list, bounding_boxes_list_list)):

        if len(class_indices) != len(bounding_boxes):
            warnings.warn(
                UserWarning(
                    f"Length of class indices ({len(class_indices)}) does not "
                    f"equal length of bounding boxes ({len(bounding_boxes)}). "
                    f"This is an error in the input arguments. "
                    f"Skipping image {filename_image}."))
            continue

        filepath_image = dataset.get_filepath_from_filename(filename_image)
        filepath_image_base, image_extension = os.path.splitext(filepath_image)

        filepath_out_dir = os.path.join(output_dir, filename_image)\
            .replace(image_extension, '')
        Path(filepath_out_dir).mkdir(parents=True, exist_ok=True)

        image = Image.open(filepath_image)

        cropped_images_filepaths = []
        # For every image, crop out multiple cropped images, one for each
        # bounding box
        for index, (class_index, bbox) in \
                enumerate(zip(class_indices, bounding_boxes)):

            # determine the filename and filepath of the cropped image
            if class_names:
                class_name = class_names[class_index]
            else:
                class_name = f"class{class_index}"
            cropped_image_last_filename = f'{index}_{class_name}{image_extension}'
            cropped_image_filepath = os.path.join(filepath_out_dir,
                                                  cropped_image_last_filename)

            # crop out the image and save it
            w, h = image.size
            crop_box = (w * bbox.x0, h * bbox.y0, w * bbox.x1, h * bbox.y1)
            crop_box = tuple(int(i) for i in crop_box)
            cropped_image = image.crop(crop_box)
            cropped_image.save(cropped_image_filepath)

            # add the filename of the cropped image to the corresponding list
            cropped_image_filename: str = os.path.join(
                filename_image.replace(image_extension, ''),
                cropped_image_last_filename)
            cropped_images_filepaths.append(cropped_image_filename)

        cropped_image_filepath_list_list.append(cropped_images_filepaths)

    return cropped_image_filepath_list_list
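
A short usage sketch for crop_dataset_by_bounding_boxes_and_save (paths and class names are placeholders). Note that the BoundingBox coordinates are treated as relative values in [0, 1], since the crop box above multiplies them by the image width and height:

dataset = LightlyDataset(input_dir='/path/to/images')
n_images = len(dataset.get_filenames())
bounding_boxes_list_list = [[BoundingBox(0.1, 0.1, 0.5, 0.5)]
                            for _ in range(n_images)]
class_indices_list_list = [[0] for _ in range(n_images)]
crop_filenames = crop_dataset_by_bounding_boxes_and_save(
    dataset, '/path/to/crops', bounding_boxes_list_list,
    class_indices_list_list, class_names=['person'])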
Example #30
    def forward(self, x):
        x = self.backbone(x).flatten(start_dim=1)
        z = self.projection_head(x)
        return z


resnet = torchvision.models.resnet18()
backbone = nn.Sequential(*list(resnet.children())[:-1])
model = BarlowTwins(backbone)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

cifar10 = torchvision.datasets.CIFAR10("datasets/cifar10", download=True)
dataset = LightlyDataset.from_torch_dataset(cifar10)
# or create a dataset from a folder containing images or videos:
# dataset = LightlyDataset("path/to/folder")

collate_fn = ImageCollateFunction(input_size=32)

dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=256,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=8,
)

criterion = BarlowTwinsLoss()