Esempio n. 1
0
    def test_video_dataset_available(self):
        """Dump the second half of a video dataset and check the written files.

        The expected number of files is taken from the selected filenames
        themselves, so the check also holds when the dataset has an odd
        number of samples. The temporary output directory is removed again
        even if dumping fails.
        """
        self.create_video_dataset()
        dataset = LightlyDataset(input_dir=self.input_dir)

        # everything from the midpoint onwards gets dumped
        selected_filenames = dataset.get_filenames()[(len(dataset) // 2):]

        with tempfile.TemporaryDirectory() as out_dir:
            dataset.dump(out_dir, selected_filenames)
            dumped_filenames = os.listdir(out_dir)

        self.assertEqual(len(dumped_filenames), len(selected_filenames))
        for filename in dumped_filenames:
            self.assertIn(filename, selected_filenames)
Esempio n. 2
0
    def test_embed_correct_order(self):
        """Embed the same dataset through two shuffled dataloaders.

        One loader uses no worker processes and the other uses four, each
        with a different seed. Embeddings, labels and filenames of both runs
        must agree, and the filenames must match the dataset's own order.
        """
        dataset = LightlyDataset(
            self.folder_path,
            transform=torchvision.transforms.ToTensor(),
        )
        encoder = get_model_from_config(self.cfg)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        def embed_shuffled(seed, num_workers):
            # seed the RNG before the loader is built and iterated
            manual_seed(seed)
            loader = DataLoader(
                dataset, shuffle=True, num_workers=num_workers, batch_size=4
            )
            return encoder.embed(loader, device=device)

        embeddings_a, labels_a, filenames_a = embed_shuffled(42, 0)
        embeddings_b, labels_b, filenames_b = embed_shuffled(43, 4)

        np.testing.assert_allclose(embeddings_a, embeddings_b, rtol=1e-5)
        np.testing.assert_allclose(labels_a, labels_b, rtol=1e-5)

        self.assertListEqual(filenames_a, filenames_b)
        self.assertListEqual(filenames_a, dataset.get_filenames())
Esempio n. 3
0
    def test_create_lightly_dataset_from_folder(self):
        """Build a dataset from a folder with subfolders, then dump it.

        Verifies the number and names of the discovered files and that the
        dump contains as many files across its subdirectories as the dataset
        has samples.
        """
        n_subfolders, n_samples_per_subfolder = 5, 10
        expected_total = n_subfolders * n_samples_per_subfolder

        dataset_dir, folder_names, sample_names = self.create_dataset(
            n_subfolders, n_samples_per_subfolder)

        dataset = LightlyDataset(from_folder=dataset_dir)
        found_filenames = dataset.get_filenames()

        # every sample name is expected once inside every subfolder
        expected_filenames = [
            os.path.join(folder, sample)
            for folder in folder_names
            for sample in sample_names
        ]

        self.assertEqual(len(found_filenames), expected_total)
        self.assertEqual(len(dataset), expected_total)
        self.assertListEqual(sorted(expected_filenames),
                             sorted(found_filenames))

        out_dir = tempfile.mkdtemp()
        dataset.dump(out_dir)

        # count the dumped files over all first-level subdirectories
        n_dumped = 0
        for subdir in os.listdir(out_dir):
            n_dumped += len(os.listdir(os.path.join(out_dir, subdir)))
        self.assertEqual(n_dumped, len(dataset))

        shutil.rmtree(dataset_dir)
        shutil.rmtree(out_dir)
Esempio n. 4
0
    def test_create_lightly_dataset_from_folder_nosubdir(self):
        """Build a dataset from a flat folder of images and index into it."""
        n_tot = 100

        # write fake images into a flat temporary directory
        fake_data = torchvision.datasets.FakeData(size=n_tot,
                                                  image_size=(3, 32, 32))
        tmp_dir = tempfile.mkdtemp()
        sample_names = [f'img_{i}.jpg' for i in range(n_tot)]
        for idx, name in enumerate(sample_names):
            # element 0 of each FakeData item is the object that gets saved
            fake_data[idx][0].save(os.path.join(tmp_dir, name))

        lightly_dataset = LightlyDataset(from_folder=tmp_dir)
        filenames = lightly_dataset.get_filenames()

        # the dataset must list exactly the files written above
        self.assertEqual(len(filenames), n_tot)
        self.assertEqual(len(lightly_dataset), n_tot)
        self.assertListEqual(sorted(sample_names), sorted(filenames))

        # every index must unpack into a (sample, target, filename) triple
        for idx in range(n_tot):
            sample, target, fname = lightly_dataset[idx]
Esempio n. 5
0
    def test_filenames_dataset_with_subdir(self):
        """Restrict a dataset with subfolders to random filename subsets.

        For five seeded random halves of all files, the dataset must expose
        exactly the requested filenames, and the target of each sample must
        be the index of the folder its filename starts with.
        """
        tmp_dir, folder_names, sample_names = self.create_dataset()
        folder_name_to_target = {
            folder_name: i
            for i, folder_name in enumerate(folder_names)
        }
        all_filenames = [
            os.path.join(folder_name, sample_name)
            for folder_name in folder_names for sample_name in sample_names
        ]
        n_samples = len(all_filenames) // 2
        for seed in range(5):
            np.random.seed(seed)
            filenames = np.random.choice(all_filenames,
                                         n_samples,
                                         replace=False)

            dataset = LightlyDataset(input_dir=tmp_dir, filenames=filenames)
            filenames_dataset = dataset.get_filenames()
            self.assertEqual(len(filenames_dataset), len(dataset))
            self.assertEqual(len(filenames_dataset), len(filenames))
            self.assertEqual(set(filenames_dataset), set(filenames))
            filenames_dataset = set(filenames_dataset)
            for image, target, filename in dataset:
                self.assertIsInstance(image, Image)
                # check the type before the string is split below
                self.assertIsInstance(filename, str)
                folder_name = filename.split(sep=os.sep)[0]
                self.assertEqual(target, folder_name_to_target[folder_name])
                # a unittest assertion is not stripped under -O and names
                # the offending filename when it fails
                self.assertIn(filename, filenames_dataset)
Esempio n. 6
0
    def test_filenames_dataset_no_subdir(self):
        """Restrict a flat dataset to random filename subsets and verify it.

        For five seeded random halves of the files, the dataset must list
        exactly the requested filenames and yield target 0 for every sample.
        """
        n_tot = 100
        fake_data = torchvision.datasets.FakeData(size=n_tot,
                                                  image_size=(3, 32, 32))

        # write the fake images into a flat temporary directory
        tmp_dir = tempfile.mkdtemp()
        all_filenames = [f'img_{i}.jpg' for i in range(n_tot)]
        for idx, name in enumerate(all_filenames):
            fake_data[idx][0].save(os.path.join(tmp_dir, name))

        n_samples = len(all_filenames) // 2
        for seed in range(5):
            np.random.seed(seed)
            chosen = np.random.choice(all_filenames,
                                      n_samples,
                                      replace=False)

            lightly_dataset = LightlyDataset(input_dir=tmp_dir,
                                             filenames=chosen)
            listed = lightly_dataset.get_filenames()
            self.assertEqual(len(listed), len(lightly_dataset))
            self.assertEqual(len(listed), len(chosen))
            listed_set = set(listed)
            self.assertEqual(listed_set, set(chosen))

            for image, target, filename in lightly_dataset:
                self.assertIsInstance(image, Image)
                self.assertEqual(target, 0)
                self.assertIsInstance(filename, str)
                self.assertIn(filename, listed_set)
Esempio n. 7
0
    def test_create_lightly_with_indices(self):
        """Build a dataset restricted to 20 randomly drawn sample indices.

        The resulting filenames must be the entries of the full
        folder-by-folder listing at the drawn indices.
        """
        n_subfolders, n_samples_per_subfolder = 5, 10
        n_tot_files = n_subfolders * n_samples_per_subfolder

        # draw the subset of indices to keep
        indices = random.sample(range(n_tot_files), 20)

        dataset_dir, folder_names, sample_names = self.create_dataset(
            n_subfolders, n_samples_per_subfolder)

        dataset = LightlyDataset(from_folder=dataset_dir, indices=indices)
        self.assertEqual(len(dataset), 20)
        self.assertLess(len(dataset), n_tot_files)

        filenames = dataset.get_filenames()
        self.assertEqual(len(filenames), 20)

        # full listing first, then pick the entries at the drawn indices
        all_fnames = [
            os.path.join(folder, sample)
            for folder in folder_names
            for sample in sample_names
        ]
        expected = sorted(all_fnames[i] for i in indices)

        self.assertListEqual(expected, sorted(filenames))

        shutil.rmtree(dataset_dir)
Esempio n. 8
0
def check_images(data_dir: str) -> Tuple[List[str], List[str]]:
    '''Iterate through a directory of images and find corrupt images

    An image counts as corrupt when opening it or loading its pixel data
    raises IOError or UnidentifiedImageError.

    Args:
        data_dir: Path to the directory containing the images

    Returns:
        (healthy_images, corrupt_images)
    '''
    dataset = LightlyDataset(input_dir=data_dir)
    filenames = dataset.get_filenames()

    def _is_corrupt(filename):
        try:
            # the context manager releases the file handle even when
            # loading the pixel data fails
            with Image.open(os.path.join(data_dir, filename)) as image:
                image.load()
        except (IOError, UnidentifiedImageError):
            return True
        return False

    mapped = concurrent.thread_map(_is_corrupt,
                                   filenames,
                                   chunksize=min(32, len(filenames)))

    # split in a single pass so the mapped result is consumed only once
    healthy_images: List[str] = []
    corrupt_images: List[str] = []
    for filename, is_corrupt in zip(filenames, mapped):
        if is_corrupt:
            corrupt_images.append(filename)
        else:
            healthy_images.append(filename)
    return healthy_images, corrupt_images
Esempio n. 9
0
    def test_dataset_get_filenames(self):
        """Compare filename lookup with and without VideoDataset.get_filenames.

        The filenames returned by the video dataset's own get_filenames must
        equal those LightlyDataset reports while that method is temporarily
        removed from the VideoDataset class.
        """
        self.create_video_dataset()
        dataset = LightlyDataset(input_dir=self.input_dir)
        video_dataset = dataset.dataset

        # Get filenames using VideoDataset.get_filenames.
        video_dataset_filenames = video_dataset.get_filenames()

        # Get filenames using calls to VideoDataset.get_filename(index).
        # This removes the optimization introduced in
        # VideoDataset.get_filenames. Both methods should give the same
        # result.
        get_filenames = VideoDataset.get_filenames
        del VideoDataset.get_filenames
        try:
            lightly_dataset_filenames = dataset.get_filenames()
        finally:
            # Restore the class attribute even when the call above raises,
            # otherwise every later user of VideoDataset would be affected.
            VideoDataset.get_filenames = get_filenames

        assert video_dataset_filenames == lightly_dataset_filenames
Esempio n. 10
0
    def test_create_lightly_dataset_from_folder_nosubdir(self):
        """Build a dataset from a flat image folder and index every sample."""
        n_tot = 100
        tmp_dir, sample_names = self.create_dataset_no_subdir(n_tot)

        lightly_dataset = LightlyDataset(input_dir=tmp_dir)
        listed_filenames = lightly_dataset.get_filenames()

        # the dataset must list exactly the files that were created
        self.assertEqual(len(listed_filenames), n_tot)
        self.assertEqual(len(lightly_dataset), n_tot)
        self.assertListEqual(sorted(sample_names), sorted(listed_filenames))

        # each item must unpack into a (sample, target, filename) triple
        for index in range(n_tot):
            sample, target, fname = lightly_dataset[index]
Esempio n. 11
0
    def test_dataset_plain(self):
        """Check the dataset's behavior on unreadable files and directories.

        First all files lose their permissions: the dataset must still list
        them, but iterating must raise PermissionError. Then the root
        directory loses its permissions: creating the dataset must raise
        PermissionError. Permissions are given back at the end so that the
        temporary directory stays removable.
        """
        tmp_dir, _ = self.create_dataset_no_subdir(100)

        # join with the walked directory so nested files resolve correctly
        filepaths = [
            os.path.join(subdir, filename)
            for subdir, _dirs, files in os.walk(tmp_dir)
            for filename in files
        ]

        try:
            with self.subTest("no read rights files"):
                for filepath in filepaths:
                    os.chmod(filepath, 0o000)
                dataset = LightlyDataset(input_dir=tmp_dir)
                self.assertGreater(len(dataset.get_filenames()), 0)
                with self.assertRaises(PermissionError):
                    for _ in dataset:
                        pass

            with self.subTest("no read rights root"):
                os.chmod(tmp_dir, 0o000)
                with self.assertRaises(PermissionError):
                    dataset = LightlyDataset(input_dir=tmp_dir)
        finally:
            # the directory must be accessible again before its files can be
            # touched
            os.chmod(tmp_dir, 0o700)
            for filepath in filepaths:
                os.chmod(filepath, 0o600)
Esempio n. 12
0
def _crop_cli(cfg, is_cli_call=True):
    """Crop the objects described by YOLO label files out of a dataset.

    For every image of the dataset at cfg['input_dir'], the label file with
    the same relative path but a '.txt' extension is read from
    cfg['label_dir'], and the resulting crops are written below
    cfg['output_dir'].

    Args:
        cfg:
            Configuration mapping. The keys 'input_dir', 'output_dir',
            'label_dir', 'label_names_file' and 'crop_padding' are read.
        is_cli_call:
            If True, every path taken from cfg is passed through
            fix_input_path before it is used.

    Returns:
        The value returned by crop_dataset_by_bounding_boxes_and_save.

    """
    input_dir = cfg['input_dir']
    if input_dir and is_cli_call:
        input_dir = fix_input_path(input_dir)
    output_dir = cfg['output_dir']
    if output_dir and is_cli_call:
        output_dir = fix_input_path(output_dir)
    label_dir = cfg['label_dir']
    if label_dir and is_cli_call:
        label_dir = fix_input_path(label_dir)
    label_names_file = cfg['label_names_file']
    if label_names_file:
        if is_cli_call:
            label_names_file = fix_input_path(label_names_file)
        with open(label_names_file, 'r') as file:
            # NOTE(review): yaml.full_load can construct more than plain
            # data; if this file may come from an untrusted source,
            # yaml.safe_load would be the safer choice.
            label_names_file_dict = yaml.full_load(file)
        class_names = label_names_file_dict['names']
    else:
        class_names = None

    dataset = LightlyDataset(input_dir)

    class_indices_list_list: List[List[int]] = []
    bounding_boxes_list_list: List[List[BoundingBox]] = []

    # YOLO-Specific
    for filename_image in dataset.get_filenames():
        # Swap only the trailing extension for '.txt'. A str.replace on the
        # joined path would also rewrite the extension text wherever else it
        # occurs (e.g. in label_dir) and garble the path completely for
        # files without an extension.
        filename_image_base, _ = os.path.splitext(filename_image)
        filepath_label = os.path.join(label_dir, filename_image_base + '.txt')
        class_indices, bounding_boxes = read_yolo_label_file(
            filepath_label, float(cfg['crop_padding']))
        class_indices_list_list.append(class_indices)
        bounding_boxes_list_list.append(bounding_boxes)

    cropped_images_list_list = crop_dataset_by_bounding_boxes_and_save(
        dataset, output_dir, bounding_boxes_list_list,
        class_indices_list_list, class_names)

    print(
        f'Cropped images are stored at: {bcolors.OKBLUE}{output_dir}{bcolors.ENDC}'
    )
    return cropped_images_list_list
Esempio n. 13
0
 def test_crop_dataset_by_bounding_boxes_and_save(self):
     """Exercise cropping with valid and with inconsistent arguments.

     Covers a fully consistent call, a bounding box list that is one entry
     short (ValueError expected) and a class index list whose length differs
     from its bounding box list (UserWarning expected).
     """
     dataset = LightlyDataset(self.cfg.input_dir)
     output_dir = self.cfg.output_dir
     no_files = len(dataset.get_filenames())
     # Build one independent inner list per image. `[[x]] * n` would repeat
     # a reference to a single inner list, so the in-place change further
     # down would silently alter the entry of every image.
     bounding_boxes_list_list = [
         [BoundingBox(0, 0, 1, 1)] for _ in range(no_files)
     ]
     class_indices_list_list = [[1] for _ in range(no_files)]
     class_names = ["class_0", "class_1"]
     with self.subTest("all_correct"):
         crop_dataset_by_bounding_boxes_and_save(dataset, output_dir,
                                                 bounding_boxes_list_list,
                                                 class_indices_list_list,
                                                 class_names)
     with self.subTest("wrong length of bounding_boxes_list_list"):
         with self.assertRaises(ValueError):
             crop_dataset_by_bounding_boxes_and_save(
                 dataset, output_dir, bounding_boxes_list_list[:-1],
                 class_indices_list_list, class_names)
     with self.subTest("wrong internal length of class_indices_list_list"):
         # only the first image gets a class index list of the wrong length
         class_indices_list_list[0] *= 2
         with self.assertWarns(UserWarning):
             crop_dataset_by_bounding_boxes_and_save(
                 dataset, output_dir, bounding_boxes_list_list,
                 class_indices_list_list, class_names)
Esempio n. 14
0
    def test_video_dataset_filenames(self):
        """Restrict a video dataset to a seeded random half of its videos.

        Every sample yielded by the dataset must be an image whose filename
        contains the name (minus its last four characters) of one of the
        created videos and is listed by get_filenames.
        """
        self.create_video_dataset()
        all_filenames = self.filenames
        extension_length = len('.avi')

        def filename_img_fits_video(filename_img: str):
            # compare against every video name with the length of '.avi'
            # cut off its end
            return any(
                video_name[:-extension_length] in filename_img
                for video_name in all_filenames
            )

        n_samples = int(len(all_filenames) / 2)
        np.random.seed(42)
        selected = np.random.choice(all_filenames, n_samples, replace=False)

        dataset = LightlyDataset(input_dir=self.input_dir, filenames=selected)

        known_filenames = dataset.get_filenames()
        for image, target, filename in dataset:
            self.assertIsInstance(image, Image)
            self.assertTrue(filename_img_fits_video(filename))

            self.assertIsInstance(filename, str)
            self.assertIn(filename, known_filenames)
import sys

from lightly.data import LightlyDataset
from lightly.utils import save_custom_metadata

if __name__ == "__main__":
    # Expects exactly two command line arguments: the input directory of the
    # dataset and the file the custom metadata is written to.
    if len(sys.argv) != 1 + 2:
        raise ValueError(
            "ERROR in number of command line arguments, must be 2. "
            "Example: python create_custom_metadata_from_input_dir.py input_dir metadata_filename"
        )
    input_dir, metadata_filename = sys.argv[1:]

    dataset = LightlyDataset(input_dir)

    # create a list of pairs of (filename, metadata); the metadata of each
    # file is its position in the dataset's filename list
    custom_metadata = [
        (filename, {'index': index})
        for index, filename in enumerate(dataset.get_filenames())
    ]

    save_custom_metadata(metadata_filename, custom_metadata)
Esempio n. 16
0
def crop_dataset_by_bounding_boxes_and_save(
        dataset: LightlyDataset,
        output_dir: str,
        bounding_boxes_list_list: List[List[BoundingBox]],
        class_indices_list_list: List[List[int]],
        class_names: List[str] = None) -> List[List[str]]:
    """Crops all images in a dataset by the bounding boxes and saves them in the output dir

    For each image a directory named after the image (without its extension)
    is created below output_dir. The crops are stored in it as
    '<detection index>_<class name><image extension>'.

    Args:
        dataset:
            The dataset with the images to be cropped. Must contain M images.
        output_dir:
            The output directory to save the cropped images to.
        bounding_boxes_list_list:
            The bounding boxes of the detections for each image. Must have M sublists, one for each image.
            Each sublist contains the bounding boxes for each detection, thus N_m elements.
            The box coordinates are multiplied with the image width and height.
        class_indices_list_list:
            The object class ids of the detections for each image. Must have M sublists, one for each image.
            Each sublist contains the class id for each detection, thus N_m elements.
        class_names:
            The names of the classes, used to map the class id to the class name.
            If empty or None, 'class<id>' is used as the class name.

    Returns:
        The filepaths of all saved cropped images, relative to output_dir.
        There is one sublist for each image that was cropped, holding the
        filepath of the crop of each detection, thus N_m elements. An image
        that is skipped because its class indices and bounding boxes differ
        in length contributes no sublist.

    Raises:
        ValueError: If the number of bounding box lists or class index lists
            differs from the number of images in the dataset.

    """
    filenames_images = dataset.get_filenames()
    if len(filenames_images) != len(bounding_boxes_list_list) or len(
            filenames_images) != len(class_indices_list_list):
        raise ValueError(
            "There must be one bounding box and class index list for each image in the datasets, "
            "but the lengths dont align.")

    cropped_image_filepath_list_list: List[List[str]] = []

    print(f"Cropping objects out of {len(filenames_images)} images...")
    for filename_image, class_indices, bounding_boxes in \
            tqdm(zip(filenames_images, class_indices_list_list, bounding_boxes_list_list),
                 total=len(filenames_images)):

        if len(class_indices) != len(bounding_boxes):
            warnings.warn(
                UserWarning(
                    f"Length of class indices ({len(class_indices)}) does not equal length of bounding boxes "
                    f"({len(bounding_boxes)}). This is an error in the input arguments. "
                    f"Skipping this image {filename_image}."))
            continue

        filepath_image = dataset.get_filepath_from_filename(filename_image)
        _, image_extension = os.path.splitext(filepath_image)

        # Strip only the trailing extension. A str.replace would also remove
        # the extension text anywhere else in the path, e.g. inside
        # output_dir or in the middle of the filename.
        filename_image_base = os.path.splitext(filename_image)[0]
        filepath_out_dir = os.path.join(output_dir, filename_image_base)
        Path(filepath_out_dir).mkdir(parents=True, exist_ok=True)

        cropped_images_filepaths = []
        # the context manager closes the image file once all crops are saved
        with Image.open(filepath_image) as image:
            w, h = image.size

            # For every image, crop out multiple cropped images, one for
            # each bounding box
            for index, (class_index, bbox) in \
                    enumerate(zip(class_indices, bounding_boxes)):

                # determine the filename and filepath of the cropped image
                if class_names:
                    class_name = class_names[class_index]
                else:
                    class_name = f"class{class_index}"
                cropped_image_last_filename = f'{index}_{class_name}{image_extension}'
                cropped_image_filepath = os.path.join(
                    filepath_out_dir, cropped_image_last_filename)

                # scale the box by the image size, truncate to whole pixels,
                # then crop out the image and save it
                crop_box = (w * bbox.x0, h * bbox.y0, w * bbox.x1, h * bbox.y1)
                crop_box = tuple(int(i) for i in crop_box)
                cropped_image = image.crop(crop_box)
                cropped_image.save(cropped_image_filepath)

                # add the filename of the cropped image to the corresponding
                # list
                cropped_image_filename: str = os.path.join(
                    filename_image_base, cropped_image_last_filename)
                cropped_images_filepaths.append(cropped_image_filename)

        cropped_image_filepath_list_list.append(cropped_images_filepaths)

    return cropped_image_filepath_list_list
Esempio n. 17
0
def upload_images_from_folder(path_to_folder: str,
                              dataset_id: str,
                              token: str,
                              max_workers: int = 8,
                              max_requests: int = 32,
                              mode: str = 'thumbnails'):
    """Uploads images from a directory to the Lightly cloud solution.

    The files are uploaded in batches of max_requests files by a pool of
    worker threads. A batch with at least one failed upload produces a
    warning but does not abort the remaining uploads. Afterwards the image
    type of the dataset is set and the initial tag is created.

    Args:
        path_to_folder:
            Path to the folder containing the images.
        dataset_id:
            The unique identifier for the dataset.
        token:
            Token for authentication.
        max_workers:
            Maximum number of workers uploading images in parallel.
        max_requests:
            Number of files handed to the workers per batch. The next batch
            is only started once all uploads of the current one finished.
        mode:
            One of [full, thumbnails, metadata]. Whether to upload thumbnails,
            full images, or metadata only. Any other value is treated like
            metadata.

    Raises:
        ValueError if dataset is too large.
        RuntimeError if dataset already has an initial tag.

    """

    bds = LightlyDataset(from_folder=path_to_folder)
    fnames = bds.get_filenames()

    api_max_dataset_size = get_user_quota(token)['maxDatasetSize']
    max_dataset_size = min(api_max_dataset_size, MAXIMUM_DATASET_SIZE)

    if len(fnames) > max_dataset_size:
        msg = f'Your dataset has {len(fnames)} samples which'
        msg += f' is more than the allowed maximum of {max_dataset_size}'
        raise ValueError(msg)

    tags = get_tags(dataset_id, token)
    if len(tags) > 0:
        tag_names = [t['name'] for t in tags]
        msg = 'Forbidden upload to dataset with existing tags: '
        msg += f'{tag_names}'
        raise RuntimeError(msg)

    def _upload_single_image(fname):
        """Upload one file; return True only if every step succeeded."""

        # random delay of uniform[0, 0.01] seconds to prevent API bursts
        rnd_delay = random.random() * 0.01
        time.sleep(rnd_delay)

        image_path = os.path.join(path_to_folder, fname)

        # get metadata and check if corrupted
        metadata, is_corrupted = check_image(image_path)

        # filename is too long, cannot accept this file
        if not metadata:
            return False

        # upload sample
        basename = fname
        thumbname = None
        if mode in ['full', 'thumbnails'] and not is_corrupted:
            thumbname = '.'.join(basename.split('.')[:-1]) + '_thumb.webp'

        try:
            sample_id = upload_sample_with_metadata(basename, thumbname,
                                                    metadata, dataset_id,
                                                    token)
        except RuntimeError:
            # Without a sample id no signed url can be requested, so stop
            # here instead of running into an unbound sample_id below.
            return False

        # corrupted files are registered with their metadata only
        if is_corrupted:
            return True

        try:
            if mode == 'thumbnails':
                # try to get signed url for thumbnail
                signed_url = get_presigned_upload_url(thumbname, dataset_id,
                                                      sample_id, token)
                # try to create thumbnail
                with Image.open(image_path) as temp_image:
                    thumbnail = get_thumbnail_from_img(temp_image)
                # try to upload thumbnail
                upload_file_with_signed_url(
                    PIL_to_bytes(thumbnail, ext='webp', quality=70),
                    signed_url)
            elif mode == 'full':
                # try to get signed url for image
                signed_url = get_presigned_upload_url(basename, dataset_id,
                                                      sample_id, token)
                # try to upload image
                with open(image_path, 'rb') as temp_image:
                    upload_file_with_signed_url(temp_image, signed_url)
        except RuntimeError:
            return False

        return True

    # split the filenames into batches of at most max_requests entries
    n_batches = len(fnames) // max_requests
    n_batches = n_batches + 1 if len(fnames) % max_requests else n_batches
    fname_batches = [
        list(islice(fnames, i * max_requests, (i + 1) * max_requests))
        for i in range(n_batches)
    ]

    chunksize = max(max_requests // max_workers, 1)

    # the context managers shut the thread pool down and close the progress
    # bar, also when an upload raises unexpectedly
    with ThreadPoolExecutor(max_workers=max_workers) as executor, \
            tqdm.tqdm(unit='imgs', total=len(fnames)) as pbar:
        for i, batch in enumerate(fname_batches):
            mapped = list(
                executor.map(_upload_single_image, batch,
                             chunksize=chunksize))
            if not all(mapped):
                msg = 'Warning: Unsuccessful upload(s) in batch {}! '.format(i)
                msg += 'This could cause problems when uploading embeddings. '
                # name the first file of the batch whose upload failed
                msg += 'Failed at file: {}'.format(batch[mapped.index(False)])
                warnings.warn(msg)
            pbar.update(len(batch))

    # set image type of data and create initial tag
    if mode == 'full':
        put_image_type(dataset_id, token, mode)
    elif mode == 'thumbnails':
        put_image_type(dataset_id, token, 'thumbnail')
    else:
        put_image_type(dataset_id, token, 'meta')
    create_initial_tag(dataset_id, token)