def convert(self,
            split_column: Optional[Union[DatasetSplitKeys, str]] = None,
            image_format: str = 'png',
            parallel: bool = True,
            rewrite: bool = False) -> None:
    r"""Converts the dataset to the tfrecord binary format.

    Parameters
    ----------
    split_column
        The column containing the dataset split.
    image_format
        The format in which the images will be encoded.
    parallel
        Whether to use multiple threads to generate the tfrecord binary
        files or not.
    rewrite
        Whether to overwrite existing tfrecord files or not.
    """
    if split_column is None:
        split_column = DatasetSplitKeys.RANDOM
    else:
        split_column = DatasetSplitKeys(split_column)

    if self.df.get(split_column.value) is None:
        raise ValueError(
            f'The dataframe does not contain a {split_column.value!r} '
            f'column. Create a split first, e.g. with create_random_split.')

    if parallel:
        def _convert_partition(part):
            return self.convert_partition(partition=part,
                                          split_column=split_column,
                                          image_format=image_format,
                                          parallel=parallel,
                                          rewrite=rewrite)

        with ThreadPoolExecutor() as executor:
            list(
                progress_bar(executor.map(_convert_partition,
                                          DatasetPartitionKeys),
                             total=len(DatasetPartitionKeys),
                             desc='Converting'))
    else:
        for partition in progress_bar(DatasetPartitionKeys,
                                      total=len(DatasetPartitionKeys),
                                      desc='Converting'):
            self.convert_partition(partition=partition,
                                   split_column=split_column,
                                   image_format=image_format,
                                   parallel=parallel,
                                   rewrite=rewrite)
def create_random_split(self, val_prop: float = 0.1, test_prop: float = 0.1) -> None:
    r"""Assigns each sample to a random train/val/test partition.

    Parameters
    ----------
    val_prop
        Proportion of samples assigned to the validation partition.
    test_prop
        Proportion of samples assigned to the test partition.
    """
    self.df[DatasetSplitKeys.RANDOM.value] = DatasetPartitionKeys.TRAIN.value
    for i, _ in progress_bar(self.df.iterrows(),
                             total=len(self),
                             desc='Randomizing'):
        sample = tf.random.uniform((), minval=0, maxval=1)
        # Samples in [0, val_prop] go to val, (val_prop, val_prop + test_prop]
        # go to test, and the remaining rows stay in train.
        if sample <= val_prop:
            self.df.at[i, DatasetSplitKeys.RANDOM.value] = (
                DatasetPartitionKeys.VAL.value)
        elif sample <= test_prop + val_prop:
            self.df.at[i, DatasetSplitKeys.RANDOM.value] = (
                DatasetPartitionKeys.TEST.value)
def upload_images_to_s3(
    self,
    bucket: str,
    key: str,
    parallel: bool = True,
    image_format: str = 'png',
    update_df: bool = True,
) -> None:
    r"""Uploads the dataset images to an S3 bucket.

    Parameters
    ----------
    bucket
        Name of the destination S3 bucket.
    key
        Key prefix under which the images will be uploaded.
    parallel
        Whether to upload the images using multiple threads or not.
    image_format
        The format in which the images will be encoded.
    update_df
        Whether to update the dataset dataframe after each upload or not.
    """
    if parallel:
        def _upload_image_to_s3(index):
            return self._upload_image_from_row_index(index=index,
                                                     bucket=bucket,
                                                     key=key,
                                                     image_format=image_format,
                                                     update_df=update_df)

        with ThreadPoolExecutor() as executor:
            list(
                progress_bar(executor.map(_upload_image_to_s3,
                                          range(len(self))),
                             total=len(self)))
    else:
        for i in progress_bar(range(len(self)), total=len(self)):
            self._upload_image_from_row_index(index=i,
                                              bucket=bucket,
                                              key=key,
                                              image_format=image_format,
                                              update_df=update_df)
def save(
    self,
    parallel: bool = True,
    compress: bool = False,
    image_format: str = 'png',
    rewrite: bool = False,
) -> None:
    r"""Saves the dataset rows and its serialized metadata to the cache path.

    Parameters
    ----------
    parallel
        Whether to save the rows using multiple threads or not.
    compress
        Whether to compress the saved files or not.
    image_format
        The format in which the images will be encoded.
    rewrite
        Whether to overwrite files that already exist or not.
    """
    if parallel:
        def _save_row_from_index(index):
            return self._save_row_from_index(index=index,
                                             compress=compress,
                                             image_format=image_format,
                                             rewrite=rewrite)

        with ThreadPoolExecutor() as executor:
            list(
                progress_bar(executor.map(_save_row_from_index,
                                          range(len(self))),
                             total=len(self)))
    else:
        for i in progress_bar(range(len(self)), total=len(self)):
            self._save_row_from_index(index=i,
                                      compress=compress,
                                      image_format=image_format,
                                      rewrite=rewrite)

    self._save(parsed=self.serialize(),
               path=self.cache_path / f'{self.name}.json',
               compress=compress)
def _get_jobs_sequential(bucket, keys, show_progress):
    r"""Loads job JSON objects from S3, one key at a time.

    Parameters
    ----------
    bucket
        Name of the S3 bucket to read from.
    keys
        S3 keys of the JSON job files to load.
    show_progress
        Whether to display a progress bar or not.

    Yields
    ------
    The parsed JSON content of each key.
    """
    if show_progress:
        keys = progress_bar(keys)
    for key in keys:
        yield json.load(read_from_s3(bucket, key))
def _get_jobs_parallel(bucket, keys, show_progress):
    r"""Loads job JSON objects from S3 using a thread pool.

    Parameters
    ----------
    bucket
        Name of the S3 bucket to read from.
    keys
        S3 keys of the JSON job files to load.
    show_progress
        Whether to display a progress bar or not.

    Returns
    -------
    A list with the parsed JSON content of each key.
    """
    def _load(key):
        return json.load(read_from_s3(bucket, key))

    with ThreadPoolExecutor() as executor:
        threads = executor.map(_load, keys)
        if show_progress:
            threads = progress_bar(threads, total=len(keys))
        return list(threads)
def create_df_from_s3(
    bucket: str,
    key: Union[List[str], str],
    pattern: str,
    n_jobs: Optional[int] = None,
    parallel: bool = True,
    show_progress: bool = False,
) -> pd.DataFrame:
    r"""Creates a dataframe from job JSON files stored in S3.

    Parameters
    ----------
    bucket
        Name of the S3 bucket to read from.
    key
        A single key prefix or a list of key prefixes to read the jobs from.
    pattern
        Pattern used to select the job files to load.
    n_jobs
        Forwarded to ``get_jobs``.
    parallel
        Whether to load the jobs using multiple threads or not.
    show_progress
        Whether to display a progress bar or not.

    Returns
    -------
    A dataframe built from the examples created from the loaded jobs.
    """
    keys = [key] if isinstance(key, str) else key
    all_jobs = []
    for k in progress_bar(keys):
        jobs = get_jobs(bucket=bucket,
                        key=k,
                        pattern=pattern,
                        n_jobs=n_jobs,
                        parallel=parallel,
                        show_progress=show_progress)
        all_jobs += jobs
    examples = create_examples_from_jobs(all_jobs)
    return create_df_from_examples(examples)
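# A minimal, hedged usage sketch for create_df_from_s3. The bucket name, key
# prefix, and pattern below are placeholders chosen for illustration, not
# values taken from this repository:
#
#     df = create_df_from_s3(bucket='my-bucket',
#                            key='jobs/',
#                            pattern=r'.*\.json$',
#                            parallel=True,
#                            show_progress=True)
#
# Each matching job JSON file is loaded from S3 (sequentially or via a thread
# pool), turned into examples, and collected into a single pandas DataFrame.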
def convert_partition(self,
                      partition: Union[DatasetPartitionKeys, str],
                      split_column: Optional[Union[DatasetSplitKeys, str]] = None,
                      image_format: str = 'png',
                      parallel: bool = True,
                      rewrite: bool = False) -> None:
    r"""Converts a single dataset partition to the tfrecord binary format.

    Parameters
    ----------
    partition
        The partition key to be converted.
    split_column
        The column containing the dataset split.
    image_format
        The format in which the images will be encoded.
    parallel
        Whether to use multiple threads to generate the tfrecord binary
        file or not.
    rewrite
        Whether to overwrite an existing tfrecord file or not.
    """
    if split_column is None:
        split_column = DatasetSplitKeys.RANDOM
    else:
        split_column = DatasetSplitKeys(split_column)
    partition = DatasetPartitionKeys(partition)

    indices = self.df[self.df[split_column.value] == partition.value].index
    file_name = f'{partition.value}.tfrecord'
    file_path = self.cache_path / split_column.value / file_name
    if file_path.exists() and not rewrite:
        warn(f'{file_path} already exists. Set the rewrite argument to True '
             f'in order to rewrite it.')
        return
    file_path.parent.mkdir(parents=True, exist_ok=True)

    with tf.io.TFRecordWriter(str(file_path)) as writer:
        if parallel:
            def _write_image(img):
                return self._write_example(writer=writer,
                                           image=img,
                                           image_format=image_format)

            with ThreadPoolExecutor() as executor:
                list(
                    progress_bar(executor.map(_write_image,
                                              self.images[indices]),
                                 total=len(indices),
                                 desc=partition.name))
        else:
            for image in progress_bar(self.images[indices],
                                      desc=partition.name):
                self._write_example(writer=writer,
                                    image=image,
                                    image_format=image_format)
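# A hedged end-to-end sketch of the conversion flow implemented above. How the
# dataset object is constructed is an assumption (its class is not shown in
# this excerpt); only create_random_split, save, and convert come from the
# methods defined here:
#
#     dataset = ...  # an instance of the dataset class these methods belong to
#     dataset.create_random_split(val_prop=0.1, test_prop=0.1)
#     dataset.save(parallel=True, image_format='png')
#     dataset.convert(image_format='png', parallel=True, rewrite=False)
#
# convert() delegates to convert_partition() once per DatasetPartitionKeys
# entry, writing one <partition>.tfrecord file under
# cache_path / <split_column value>.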