Example #1
    def convert(self,
                split_column: Union[DatasetSplitKeys, str] = None,
                image_format: str = 'png',
                parallel: bool = True,
                rewrite: bool = False) -> None:
        r"""
        Converts the dataset to the tfrecord binary format.

        Parameters
        ----------
        split_column
            The column containing the dataset split.
        image_format
            The format in which the images will be encoded.
        parallel
            Whether to use multiple threads to generate the tfrecord
            binary file or not.
        rewrite
            Whether to overwrite an existing tfrecord file or not.
        """
        if split_column is None:
            split_column = DatasetSplitKeys.RANDOM
        else:
            split_column = DatasetSplitKeys(split_column)

        if self.df.get(split_column.value) is None:
            raise ValueError(
                f'Column {split_column.value!r} not found in the dataset.')

        if parallel:

            def _convert_partition(part):
                return self.convert_partition(partition=part,
                                              split_column=split_column,
                                              image_format=image_format,
                                              parallel=parallel,
                                              rewrite=rewrite)

            with ThreadPoolExecutor() as executor:
                list(
                    progress_bar(executor.map(_convert_partition,
                                              DatasetPartitionKeys),
                                 total=len(DatasetPartitionKeys),
                                 desc='Converting'))
        else:
            for partition in progress_bar(DatasetPartitionKeys,
                                          total=len(DatasetPartitionKeys),
                                          desc='Converting'):
                self.convert_partition(partition=partition,
                                       split_column=split_column,
                                       image_format=image_format,
                                       parallel=parallel,
                                       rewrite=rewrite)
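
A brief usage sketch; the dataset variable is hypothetical, and convert expects the split column to exist (for instance, the random split created in Example #2):

    # Hypothetical usage: create the random split, then convert it.
    dataset.create_random_split()
    dataset.convert(image_format='png', rewrite=True)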
Example #2
    def create_random_split(self,
                            val_prop: float = 0.1,
                            test_prop: float = 0.1) -> None:
        r"""

        Parameters
        ----------
        val_prop
        test_prop

        Returns
        -------

        """
        self.df[
            DatasetSplitKeys.RANDOM.value] = DatasetPartitionKeys.TRAIN.value

        for i, row in progress_bar(self.df.iterrows(),
                                   total=len(self),
                                   desc='Randomizing'):
            sample = tf.random.uniform((), minval=0, maxval=1)
            if sample <= val_prop:
                self.df.at[i, DatasetSplitKeys.RANDOM.value] = (
                    DatasetPartitionKeys.VAL.value)
            elif val_prop < sample <= test_prop + val_prop:
                self.df.at[i, DatasetSplitKeys.RANDOM.value] = (
                    DatasetPartitionKeys.TEST.value)
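
For illustration, the resulting split proportions can be inspected with pandas (dataset is a hypothetical instance; the code above implies self.df is a DataFrame):

    # Expect roughly 80% train, 10% val, 10% test with the defaults.
    dataset.create_random_split(val_prop=0.1, test_prop=0.1)
    split_counts = dataset.df[DatasetSplitKeys.RANDOM.value].value_counts(
        normalize=True)
    print(split_counts)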
Example #3
    def upload_images_to_s3(
        self,
        bucket: str,
        key: str,
        parallel: bool = True,
        image_format: str = 'png',
        update_df: bool = True,
    ) -> None:
        r"""

        Parameters
        ----------
        bucket
        key
        parallel
        image_format
        update_df

        Returns
        -------

        """
        if parallel:

            def _upload_image_to_s3(index):
                return self._upload_image_from_row_index(
                    index=index,
                    bucket=bucket,
                    key=key,
                    image_format=image_format,
                    update_df=update_df)

            with ThreadPoolExecutor() as executor:
                list(
                    progress_bar(executor.map(_upload_image_to_s3,
                                              range(len(self))),
                                 total=len(self)))
        else:
            for i in progress_bar(range(len(self)), total=len(self)):
                self._upload_image_from_row_index(index=i,
                                                  bucket=bucket,
                                                  key=key,
                                                  image_format=image_format,
                                                  update_df=update_df)
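
A usage sketch; the bucket name and key prefix are placeholders:

    # Hypothetical usage: upload every image under a placeholder prefix.
    dataset.upload_images_to_s3(bucket='my-bucket',
                                key='datasets/my-dataset/images',
                                image_format='png')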
Example #4
    def save(
        self,
        parallel: bool = True,
        compress: bool = False,
        image_format: str = 'png',
        rewrite: bool = False,
    ) -> None:
        r"""

        Parameters
        ----------
        parallel
        compress
        image_format
        rewrite

        Returns
        -------

        """
        if parallel:

            def _save_row(index):
                return self._save_row_from_index(index=index,
                                                 compress=compress,
                                                 image_format=image_format,
                                                 rewrite=rewrite)

            with ThreadPoolExecutor() as executor:
                list(
                    progress_bar(executor.map(_save_row,
                                              range(len(self))),
                                 total=len(self)))
        else:
            for i in progress_bar(range(len(self)), total=len(self)):
                self._save_row_from_index(index=i,
                                          compress=compress,
                                          image_format=image_format,
                                          rewrite=rewrite)

        self._save(parsed=self.serialize(),
                   path=self.cache_path / f'{self.name}.json',
                   compress=compress)
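
A brief usage sketch (dataset is a hypothetical instance of the class above):

    # Hypothetical usage: persist the rows and metadata, compressing both.
    dataset.save(compress=True, rewrite=True)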
Example #5
def _get_jobs_sequential(bucket, keys, show_progress):
    r"""

    Parameters
    ----------
    bucket
    keys
    show_progress

    Returns
    -------

    """
    if show_progress:
        keys = progress_bar(keys)
    for key in keys:
        yield json.load(read_from_s3(bucket, key))
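
A usage sketch; the bucket name, job_keys, and process are placeholders:

    # Hypothetical usage: lazily iterate over job files in a placeholder bucket.
    for job in _get_jobs_sequential('my-bucket', job_keys, show_progress=True):
        process(job)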
Example #6
def _get_jobs_parallel(bucket, keys, show_progress):
    r"""

    Parameters
    ----------
    bucket
    keys
    show_progress

    Returns
    -------

    """
    def _load(key):
        return json.load(read_from_s3(bucket, key))

    with ThreadPoolExecutor() as executor:
        results = executor.map(_load, keys)
        if show_progress:
            results = progress_bar(results, total=len(keys))
        return list(results)
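
For illustration, a minimal dispatcher between the two helpers might look like this; the real get_jobs used in Example #7 also handles key listing via its pattern and n_jobs arguments, which is omitted here:

    # A sketch only: dispatch to the parallel or sequential loader.
    def _get_jobs(bucket, keys, parallel=True, show_progress=False):
        if parallel:
            return _get_jobs_parallel(bucket, keys, show_progress)
        return list(_get_jobs_sequential(bucket, keys, show_progress))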
Example #7
def create_df_from_s3(
        bucket: str,
        key: Union[List[str], str],
        pattern: str,
        n_jobs: Optional[int] = None,
        parallel: bool = True,
        show_progress: bool = False
) -> pd.DataFrame:
    r"""

    Parameters
    ----------
    bucket
    key
    pattern
    n_jobs
    parallel
    show_progress

    Returns
    -------

    """
    keys = [key] if isinstance(key, str) else key

    all_jobs = []
    for key in progress_bar(keys):
        jobs = get_jobs(
            bucket=bucket,
            key=key,
            pattern=pattern,
            n_jobs=n_jobs,
            parallel=parallel,
            show_progress=show_progress
        )
        all_jobs += jobs

    examples = create_examples_from_jobs(all_jobs)
    return create_df_from_examples(examples)
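
A usage sketch with placeholder values for the bucket, key prefixes, and pattern:

    # Hypothetical usage: build a dataframe from jobs under two prefixes.
    df = create_df_from_s3(bucket='my-bucket',
                           key=['jobs/batch-1', 'jobs/batch-2'],
                           pattern='*.json',
                           show_progress=True)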
Example #8
    def convert_partition(self,
                          partition: Union[DatasetPartitionKeys, str],
                          split_column: Union[DatasetSplitKeys, str] = None,
                          image_format: str = 'png',
                          parallel: bool = True,
                          rewrite: bool = False) -> None:
        r"""

        Parameters
        ----------
        partition
            The partition key to be converted.
        split_column
            The column containing the dataset split.
        image_format
            The format in which the images will be encoded.
        parallel
            Whether to use multiple threads to generate the tfrecord
            binary file or not.
        rewrite
            Whether to overwrite an existing tfrecord file or not.
        """
        if split_column is None:
            split_column = DatasetSplitKeys.RANDOM
        else:
            split_column = DatasetSplitKeys(split_column)

        partition = DatasetPartitionKeys(partition)

        indices = self.df[self.df[split_column.value] == partition.value].index

        file_name = f'{partition.value}.tfrecord'
        file_path = self.cache_path / split_column.value / file_name
        if file_path.exists() and not rewrite:
            warn(
                f'{file_path} already exists. Set the rewrite argument to True '
                f'in order to rewrite it.')
            return

        file_path.parent.mkdir(parents=True, exist_ok=True)

        with tf.io.TFRecordWriter(str(file_path)) as writer:
            if parallel:

                def _write_image(img):
                    return self._write_example(writer=writer,
                                               image=img,
                                               image_format=image_format)

                with ThreadPoolExecutor() as executor:
                    list(
                        progress_bar(executor.map(_write_image,
                                                  self.images[indices]),
                                     total=len(indices),
                                     desc=partition.name))
            else:
                for image in progress_bar(self.images[indices],
                                          desc=partition.name):
                    self._write_example(
                        writer=writer,
                        image=image,
                        image_format=image_format,
                    )
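
A brief usage sketch; dataset is a hypothetical instance, and passing the DatasetPartitionKeys member directly is equivalent to passing its string value:

    # Hypothetical usage: convert only the validation partition.
    dataset.convert_partition(partition=DatasetPartitionKeys.VAL,
                              rewrite=True)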