Example #1
0
    def create_subset(self) -> Tuple[LightlySubset, List[str]]:
        """Build a LightlySubset from a random half of the input directory.

        Loads a LightlyDataset from ``self.input_dir``, draws half of its
        filenames (rounded down) with ``random.sample`` and wraps them in a
        LightlySubset.

        Returns:
            A tuple of the subset and the list of filenames it was built from.
        """
        full_dataset = LightlyDataset(input_dir=self.input_dir)
        all_filenames = full_dataset.get_filenames()

        # keep half of the samples, rounding down
        subset_size = len(all_filenames) // 2
        chosen_filenames = random.sample(all_filenames, subset_size)

        return (
            LightlySubset(base_dataset=full_dataset,
                          filenames_subset=chosen_filenames),
            chosen_filenames,
        )
Example #2
0
    def create_video_subset(self, seed=0) -> Tuple[LightlySubset, List[str]]:
        """Build a LightlySubset from a random half of a fresh video dataset.

        Seeds the global ``random`` module with ``seed``, creates a video
        dataset (5 videos, 10 frames each) via ``self.create_video_dataset``,
        loads it from ``self.input_dir`` and samples half of its filenames
        (rounded down).

        Args:
            seed:
                Value passed to ``random.seed`` before anything else happens.

        Returns:
            A tuple of the subset and the list of filenames it was built from.
        """
        random.seed(seed)
        self.create_video_dataset(n_videos=5, n_frames_per_video=10)

        video_dataset = LightlyDataset(self.input_dir)
        all_filenames = video_dataset.get_filenames()

        # keep half of the frames, rounding down
        subset_size = len(all_filenames) // 2
        chosen_filenames = random.sample(all_filenames, subset_size)

        return (
            LightlySubset(base_dataset=video_dataset,
                          filenames_subset=chosen_filenames),
            chosen_filenames,
        )
Example #3
0
def create_new_dataset_with_embeddings(path_to_dataset: str,
                                       token: str,
                                       dataset_name: str) -> ApiWorkflowClient:
    """Create a dataset on the platform, upload images and embeddings.

    The images are uploaded through the client's ``upload_dataset``. If
    ``<path_to_dataset>/embeddings.csv`` does not exist yet, random normally
    distributed 32-dimensional embeddings are generated for every sample and
    written there with ``save_embeddings``. The csv is then uploaded under the
    name "embedding_1".

    Args:
        path_to_dataset:
            Directory holding the dataset; also where embeddings.csv lives.
        token:
            Token handed to the ApiWorkflowClient.
        dataset_name:
            Base name passed to ``create_new_dataset_with_unique_name``.

    Returns:
        The ApiWorkflowClient that was used for all the calls above.
    """
    client = ApiWorkflowClient(token=token)
    client.create_new_dataset_with_unique_name(dataset_basename=dataset_name)
    client.upload_dataset(input=path_to_dataset)

    embeddings_csv = f"{path_to_dataset}/embeddings.csv"
    csv_already_there = os.path.isfile(embeddings_csv)
    if not csv_already_there:
        samples = LightlyDataset(input_dir=path_to_dataset).dataset.samples
        embeddings = np.random.normal(size=(len(samples), 32))
        filepaths, labels = zip(*samples)

        # strip the dataset root (and the separator after it) from each path
        root_length = len(path_to_dataset)
        filenames = []
        for filepath in filepaths:
            filenames.append(filepath[root_length:].lstrip('/'))

        print("Starting save of embeddings")
        save_embeddings(embeddings_csv, embeddings, labels, filenames)
        print("Finished save of embeddings")

    client.upload_embeddings(path_to_embeddings_csv=embeddings_csv,
                             name="embedding_1")
    return client
Example #4
0
def t_est_api_with_matrix(path_to_dataset: str,
                          token: str,
                          dataset_name: str = "test_api_from_pip"):
    """Run the active-learning test for every method/tag combination.

    Requires at least 100 samples in the dataset. A new dataset with
    embeddings is created on the server, ``t_est_active_learning`` is called
    for each combination of sampling method, query tag and preselected tag,
    and the dataset is deleted from the server afterwards.

    Args:
        path_to_dataset:
            Directory holding the dataset used for the test.
        token:
            Token forwarded to ``create_new_dataset_with_embeddings``.
        dataset_name:
            Base name of the dataset created on the server.
    """
    samples = LightlyDataset(input_dir=path_to_dataset).dataset.samples
    no_samples = len(samples)
    assert no_samples >= 100, "Test needs at least 100 samples in the dataset!"

    client = create_new_dataset_with_embeddings(
        path_to_dataset=path_to_dataset,
        token=token,
        dataset_name=dataset_name,
    )

    methods = [
        SamplingMethod.CORAL, SamplingMethod.CORESET, SamplingMethod.RANDOM
    ]
    query_tag_names = ['initial-tag', "query_tag_name_xyz"]
    preselected_tag_names = [None, "preselected_tag_name_xyz"]

    # same iteration order as three nested loops: method outermost
    combinations = [
        (method, query_tag_name, preselected_tag_name)
        for method in methods
        for query_tag_name in query_tag_names
        for preselected_tag_name in preselected_tag_names
    ]
    for method, query_tag_name, preselected_tag_name in combinations:
        print(
            f"Starting AL run with method '{method}', query_tag '{query_tag_name}' "
            f"and preselected_tag '{preselected_tag_name}'.")
        t_est_active_learning(client, method,
                              query_tag_name, preselected_tag_name)

    client.delete_dataset_by_id(client.dataset_id)

    print(
        "Success of the complete test suite! The dataset on the server was deleted again."
    )
Example #5
0
def create_new_dataset_with_embeddings(path_to_dataset: str,
                                       token: str,
                                       dataset_name: str) -> ApiWorkflowClient:
    """Create a dataset on the platform, upload images and embeddings.

    The images are uploaded by composing a config (``initialize`` /
    ``compose``) and handing it to ``upload_cli``. If
    ``<path_to_dataset>/embeddings.csv`` does not exist yet, random normally
    distributed 32-dimensional embeddings are generated for every sample and
    written there with ``save_embeddings``. The csv is then uploaded under the
    name "embedding_1".

    Args:
        path_to_dataset:
            Directory holding the dataset; also where embeddings.csv lives.
        token:
            Token handed to the ApiWorkflowClient and to the upload config.
        dataset_name:
            Base name passed to ``create_new_dataset_with_unique_name``.

    Returns:
        The ApiWorkflowClient that was used to create the dataset.
    """
    client = ApiWorkflowClient(token=token)
    client.create_new_dataset_with_unique_name(dataset_basename=dataset_name)

    # upload the images through the CLI entry point
    initialize(config_path="../../lightly/cli/config", job_name="test_app")
    cli_overrides = [
        f"input_dir='{path_to_dataset}'",
        f"token='{token}'",
        f"dataset_id={client.dataset_id}",
    ]
    cfg = compose(config_name="config", overrides=cli_overrides)
    upload_cli(cfg)

    embeddings_csv = f"{path_to_dataset}/embeddings.csv"
    csv_already_there = os.path.isfile(embeddings_csv)
    if not csv_already_there:
        samples = LightlyDataset(input_dir=path_to_dataset).dataset.samples
        embeddings = np.random.normal(size=(len(samples), 32))
        filepaths, labels = zip(*samples)

        # strip the dataset root (and the separator after it) from each path
        root_length = len(path_to_dataset)
        filenames = []
        for filepath in filepaths:
            filenames.append(filepath[root_length:].lstrip('/'))

        print("Starting save of embeddings")
        save_embeddings(embeddings_csv, embeddings, labels, filenames)
        print("Finished save of embeddings")

    print("Starting upload of embeddings.")
    client.upload_embeddings(path_to_embeddings_csv=embeddings_csv,
                             name="embedding_1")
    print("Finished upload of embeddings.")

    return client
Example #6
0
    def test_lightly_subset_dump(self):
        """Dumping a subset writes exactly the files belonging to the subset.

        The subset from ``self.create_subset`` is dumped into a temporary
        directory, the directory is read back as a LightlyDataset, and the
        filenames found there must equal the subset's filenames (as sets).
        """
        subset, _ = self.create_subset()

        # TemporaryDirectory removes the dump when the block exits, so test
        # runs do not leave image copies behind in the system temp folder.
        with tempfile.TemporaryDirectory() as out_dir:
            subset.dump(out_dir)
            files_output_dir = LightlyDataset(input_dir=out_dir).get_filenames()

        assert set(files_output_dir) == set(subset.get_filenames())
Example #7
0
    def __init__(self, base_dataset: LightlyDataset, filenames_subset: List[str]):
        """Creates a subset of a LightlyDataset.

        Stores the base dataset and the requested filenames, then resolves
        every requested filename to its index in the base dataset.

        Args:
            base_dataset:
                The dataset to subset from.
            filenames_subset:
                The filenames of the samples to be part of the subset.

        Raises:
            KeyError:
                If a filename in filenames_subset is not found in the
                base dataset.
        """
        self.base_dataset = base_dataset
        self.filenames_subset = filenames_subset

        # NOTE(review): self.dataset is not assigned in this constructor;
        # presumably the class provides it (e.g. as a property) - confirm.
        index_by_filename: Dict[str, int] = {
            base_dataset.index_to_filename(self.dataset, position): position
            for position in range(len(base_dataset))
        }

        # position i of the subset -> index of that sample in the base dataset
        self.mapping_subset_index_to_baseset_index = [
            index_by_filename[name] for name in filenames_subset
        ]
Example #8
0
    def upload_dataset(self,
                       input: Union[str, LightlyDataset],
                       max_workers: int = 8,
                       mode: str = 'thumbnails',
                       verbose: bool = True):
        """Uploads a dataset to the Lightly cloud solution.

        If the dataset on the server already has at least one tag, a warning
        is issued and nothing is uploaded. Otherwise all samples are uploaded
        concurrently and an initial tag is created afterwards. Failed uploads
        of single images do not abort the run; they are reported as warnings.

        Args:
            input:
                one of the following:
                    - the path to the dataset, e.g. "path/to/dataset"
                    - the dataset in form of a LightlyDataset
            max_workers:
                Maximum number of workers uploading images in parallel.
                Clamped to the range [1, len(dataset)].
            mode:
                One of [full, thumbnails, metadata]. Whether to upload thumbnails,
                full images, or metadata only.
            verbose:
                Whether to print the number of workers before the upload
                starts.

        Raises:
            ValueError if dataset is too large or input has the wrong type
            RuntimeError if the connection to the server failed.

        """
        no_tags_on_server = len(self._get_all_tags())
        if no_tags_on_server > 0:
            warnings.warn(
                f"Dataset with id {self.dataset_id} has already been completely uploaded to the platform. Skipping upload."
            )
            return

        # Check input variable 'input'
        if isinstance(input, str):
            dataset = LightlyDataset(input_dir=input)
        elif isinstance(input, LightlyDataset):
            dataset = input
        else:
            raise ValueError(
                f"input must either be a LightlyDataset or the path to the dataset as str, "
                f"but is of type {type(input)}")

        # check the allowed dataset size
        max_dataset_size_str = self.quota_api.get_quota_maximum_dataset_size()
        max_dataset_size = int(max_dataset_size_str)
        if len(dataset) > max_dataset_size:
            msg = f'Your dataset has {len(dataset)} samples which'
            msg += f' is more than the allowed maximum of {max_dataset_size}'
            raise ValueError(msg)

        # handle the case where len(dataset) < max_workers
        max_workers = min(len(dataset), max_workers)
        max_workers = max(max_workers, 1)

        # upload the samples
        if verbose:
            print(f'Uploading images (with {max_workers} workers).',
                  flush=True)

        pbar = tqdm.tqdm(unit='imgs', total=len(dataset))
        tqdm_lock = tqdm.tqdm.get_lock()

        # define lambda function for concurrent upload
        def lambda_(i):
            # load image
            image, label, filename = dataset[i]
            # try to upload image
            try:
                self._upload_single_image(
                    image=image,
                    label=label,
                    filename=filename,
                    mode=mode,
                )
                success = True
            except Exception as e:
                # best effort: one failed image must not abort the others
                warnings.warn(
                    f"Upload of image {filename} failed with error {e}")
                success = False

            # update the progress bar; the context manager guarantees the
            # lock is released even if the update itself raises
            with tqdm_lock:
                pbar.update(1)
            # return whether the upload was successful
            return success

        try:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                results = list(
                    executor.map(lambda_, range(len(dataset)), chunksize=1))
        finally:
            # close the bar deterministically instead of relying on garbage
            # collection
            pbar.close()

        if not all(results):
            msg = 'Warning: Unsuccessful upload(s)! '
            msg += 'This could cause problems when uploading embeddings.'
            # index of the first sample whose upload failed
            msg += ' Failed at image: {}'.format(results.index(False))
            warnings.warn(msg)

        # set image type of data and create initial tag
        if mode == 'full':
            img_type = 'full'
        elif mode == 'thumbnails':
            img_type = 'thumbnail'
        else:
            img_type = 'meta'

        initial_tag_create_request = InitialTagCreateRequest(
            img_type=img_type, creator=TagCreator.USER_PIP)
        self.tags_api.create_initial_tag_by_dataset_id(
            body=initial_tag_create_request, dataset_id=self.dataset_id)
Example #9
0
 def test_upload_dataset_from_dataset(self):
     """Upload succeeds when given a LightlyDataset instead of a path."""
     # wrap the fixture's torch dataset before handing it to the client
     wrapped_dataset = LightlyDataset.from_torch_dataset(self.dataset)
     self.api_workflow_client.upload_dataset(input=wrapped_dataset)
    def upload_dataset(self,
                       input: Union[str, LightlyDataset],
                       max_workers: int = 8,
                       mode: str = 'thumbnails',
                       verbose: bool = True,
                       custom_metadata: Union[Dict, None] = None):
        """Uploads a dataset to the Lightly cloud solution.

        Samples whose filename is already present on the server are skipped.
        After the upload, an initial tag is created if the dataset had no
        tags; otherwise the existing tags are upsized. Failed uploads of
        single images do not abort the run; they are reported as warnings.

        Args:
            input:
                Either the path to the dataset, e.g. "path/to/dataset",
                or the dataset in form of a LightlyDataset
            max_workers:
                Maximum number of workers uploading images in parallel.
                Clamped to the range [1, len(dataset)].
            mode:
                One of [full, thumbnails, metadata]. Whether to upload
                thumbnails, full images, or metadata only.
            verbose:
                Whether to print the number of workers before the upload
                starts.
            custom_metadata:
                Optional custom metadata. If given, it is checked with
                verify_custom_metadata_format and indexed by filename so
                that each uploaded image is sent with its own entry.

        Raises:
            ValueError:
                If dataset is too large or input has the wrong type
            RuntimeError:
                If the connection to the server failed.

        """

        # get all tags of the dataset
        tags = self.get_all_tags()
        if len(tags) > 0:
            print(f'Dataset with id {self.dataset_id} has {len(tags)} tags.',
                  flush=True)

        # parse "input" variable
        if isinstance(input, str):
            dataset = LightlyDataset(input_dir=input)
        elif isinstance(input, LightlyDataset):
            dataset = input
        else:
            raise ValueError(
                f'input must either be a LightlyDataset or the path to the '
                f'dataset as str, but has type {type(input)}')

        # handle the case where len(dataset) < max_workers
        max_workers = min(len(dataset), max_workers)
        max_workers = max(max_workers, 1)

        # upload the samples
        if verbose:
            print(f'Uploading images (with {max_workers} workers).',
                  flush=True)

        # TODO: remove _size_in_bytes from image_processing
        image_processing.metadata._size_in_bytes = \
            lambda img: 0 # pylint: disable=protected-access

        # get the filenames of the samples already on the server
        samples = self._samples_api.get_samples_by_dataset_id(
            dataset_id=self.dataset_id)
        filenames_on_server = [sample.file_name for sample in samples]
        filenames_on_server_set = set(filenames_on_server)
        if len(filenames_on_server) > 0:
            print(
                f'Found {len(filenames_on_server)} images already on the server'
                ', they are skipped during the upload.')

        # check the maximum allowed dataset size
        total_filenames = set(
            dataset.get_filenames()).union(filenames_on_server_set)
        max_dataset_size = \
            int(self._quota_api.get_quota_maximum_dataset_size())
        if len(total_filenames) > max_dataset_size:
            # report the number that was actually compared against the quota
            # (local samples plus samples already on the server)
            msg = f'Your dataset has {len(total_filenames)} samples which'
            msg += f' is more than the allowed maximum of {max_dataset_size}'
            raise ValueError(msg)

        # index custom metadata by filename (only if it exists)
        filename_to_metadata = {}
        if custom_metadata is not None:
            self.verify_custom_metadata_format(custom_metadata)
            filename_to_metadata = self.index_custom_metadata_by_filename(
                dataset.get_filenames(),
                custom_metadata,
            )

        # get the datasource
        try:
            datasource_config: DatasourceConfigBase = self.get_datasource()
            datasource_type = datasource_config['type']
        except ApiException:
            datasource_type = 'LIGHTLY'  # default to lightly datasource

        # register dataset upload
        job_status_meta = JobStatusMeta(
            total=len(total_filenames),
            processed=len(filenames_on_server),
            is_registered=True,
            upload_method=JobStatusUploadMethod.USER_PIP,
        )
        self._datasets_api.register_dataset_upload_by_id(
            job_status_meta, self.dataset_id)

        # the bar only counts samples that still have to be uploaded
        pbar = tqdm.tqdm(
            unit='imgs',
            total=len(total_filenames) - len(filenames_on_server),
        )
        tqdm_lock = tqdm.tqdm.get_lock()

        # define lambda function for concurrent upload
        def lambda_(i):
            # load image
            image, _, filename = dataset[i]
            if filename in filenames_on_server_set:
                # the sample was already uploaded
                return True

            filepath = dataset.get_filepath_from_filename(filename, image)

            # get custom metadata (evaluates to None if there is none)
            custom_metadata_item = filename_to_metadata.get(filename, None)

            # try to upload image
            try:
                self._upload_single_image(
                    image=image,
                    filename=filename,
                    filepath=filepath,
                    mode=mode,
                    custom_metadata=custom_metadata_item,
                    datasource_type=datasource_type,
                )
                success = True
            except Exception as e:  # pylint: disable=broad-except
                # best effort: one failed image must not abort the others
                warnings.warn(
                    f'Upload of image {filename} failed with error {e}')
                success = False

            # update the progress bar; the context manager guarantees the
            # lock is released even if the update itself raises
            with tqdm_lock:
                pbar.update(1)
            # return whether the upload was successful
            return success

        try:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                results = list(
                    executor.map(lambda_, range(len(dataset)), chunksize=1))
        finally:
            # close the bar deterministically instead of relying on garbage
            # collection
            pbar.close()

        if not all(results):
            msg = 'Warning: Unsuccessful upload(s)! '
            msg += 'This could cause problems when uploading embeddings.'
            # index of the first sample whose upload failed
            msg += ' Failed at image: {}'.format(results.index(False))
            warnings.warn(msg)

        # set image type of data and create initial tag
        if mode == 'full':
            img_type = 'full'
        elif mode == 'thumbnails':
            img_type = 'thumbnail'
        else:
            img_type = 'meta'

        if len(tags) == 0:
            # create initial tag
            initial_tag_create_request = InitialTagCreateRequest(
                img_type=img_type, creator=TagCreator.USER_PIP)
            self._tags_api.create_initial_tag_by_dataset_id(
                body=initial_tag_create_request,
                dataset_id=self.dataset_id,
            )
        else:
            # upsize existing tags
            upsize_tags_request = TagUpsizeRequest(
                upsize_tag_name=datetime.now().strftime('%Y%m%d_%Hh%Mm%Ss'),
                upsize_tag_creator=TagCreator.USER_PIP,
            )
            self._tags_api.upsize_tags_by_dataset_id(
                body=upsize_tags_request,
                dataset_id=self.dataset_id,
            )