def test_mnist_single_threaded() -> None:
    """End-to-end check that an LFS-cached MNIST dataset matches the original.

    Builds the dataset through the storage ``cacheable`` decorator, streams it
    back out of the cache, and verifies sample-for-sample equality against a
    freshly constructed reference dataset.
    """
    lfs_config = storage.LFSConfigurations(storage_dir_path="/tmp/")
    cache = storage.LFSStorage(configurations=lfs_config)
    ds_id = "mnist"
    ds_version = "1"

    # Start from a clean slate so the decorator actually materializes the cache.
    util.cleanup_lfs_storage(
        configurations=lfs_config, dataset_id=ds_id, dataset_version=ds_version
    )

    @cache.cacheable(dataset_id=ds_id, dataset_version=ds_version)
    def make_dataset() -> dataref.LMDBDataRef:
        return util.make_mnist_test_dataset()  # type: ignore

    cached_stream = make_dataset().stream()
    streamed_dataset = tensorflow.make_tf_dataset(cached_stream)
    reference_dataset = util.make_mnist_test_dataset()

    matched_samples = util.compare_datasets(reference_dataset, streamed_dataset)
    assert matched_samples == 10000
    assert cached_stream.length == matched_samples

    util.cleanup_lfs_storage(
        configurations=lfs_config, dataset_id=ds_id, dataset_version=ds_version
    )
def _decorated_fn(*args: Any, **kwargs: Any) -> Any:
    """Materialize the dataset through the cache and return it as a TF dataset.

    ``*args``/``**kwargs`` are forwarded verbatim to ``make_dataset_fn``. The
    resulting DataRef is streamed with the sharding/shuffle configuration
    captured from the enclosing scope, the stream length is recorded on
    ``self._dataset_length``, and the stream is wrapped as a tf.data dataset.

    NOTE(review): relies on closure variables (``self``, ``dataset_id``,
    ``dataset_version``, ``make_dataset_fn``, ``shuffle``,
    ``skip_shuffle_at_epoch_end``) from the enclosing function.
    """

    @self._storage.cacheable(  # type: ignore
        dataset_id=dataset_id,
        dataset_version=dataset_version,
    )
    def make_dataset() -> yogadl.DataRef:
        return make_dataset_fn(*args, **kwargs)

    # Lazy %-style args: the message is only rendered if the level is enabled,
    # avoiding eager f-string formatting on every call. Output is unchanged.
    logging.info("Preparing dataset: %s:%s.", dataset_id, dataset_version)
    logging.debug(
        "Calling make dataset for: %s:%s "
        "with following start_offset: %s, "
        "shuffle: %s shuffle_seed: %s "
        "shard_rank: %s, world size: %s "
        "training: %s.",
        dataset_id,
        dataset_version,
        self._offset,
        shuffle,
        self._shuffle_seed,
        self._shard_rank,
        self._num_shards,
        self._training,
    )

    stream_from_cache = make_dataset().stream(
        start_offset=self._offset,
        shuffle=shuffle,
        skip_shuffle_at_epoch_end=skip_shuffle_at_epoch_end,
        shuffle_seed=self._shuffle_seed,
        shard_rank=self._shard_rank,
        num_shards=self._num_shards,
        # bool() normalizes any truthy value, identical to the old
        # `True if self._training else False` ternary.
        drop_shard_remainder=bool(self._training),
    )
    self._dataset_length = len(stream_from_cache)
    logging.info(
        "Dataset %s:%s preparation finished.", dataset_id, dataset_version
    )
    return tensorflow.make_tf_dataset(stream_from_cache)
def compare_performance_tf_record_dataset(data_dir: pathlib.Path) -> None:
    """Benchmark reading imagenet TFRecords through the LFS cache vs directly.

    Times cache creation, a full pass over the cached stream, and a full pass
    over the original dataset, printing each duration and asserting both paths
    yield the same number of items.
    """
    lfs_config = storage.LFSConfigurations(storage_dir_path="/tmp/")
    cache = storage.LFSStorage(configurations=lfs_config)
    ds_id = "imagenet-train"
    ds_version = "0"
    training = True

    # Remove any stale cache so the creation timing below is meaningful.
    cleanup_lfs_storage(
        configurations=lfs_config, dataset_id=ds_id, dataset_version=ds_version
    )

    @cache.cacheable(dataset_id=ds_id, dataset_version=ds_version)
    def make_dataset() -> dataref.LMDBDataRef:
        return make_dataset_from_tf_records(data_dir=data_dir, training=training)  # type: ignore

    creation_start = time.time()
    cached_stream = make_dataset().stream()
    cache_creation_time = time.time() - creation_start
    print(f"Cache creation took: {cache_creation_time} seconds.")

    cached_dataset = tensorflow.make_tf_dataset(cached_stream)
    cache_read_time, cache_data_items = read_dataset(dataset=cached_dataset)
    print(f"Cache read took: {cache_read_time} seconds.")

    original_read_time, original_data_items = read_dataset(
        dataset=make_dataset_from_tf_records(data_dir=data_dir, training=training)
    )
    print(f"Original read took: {original_read_time} seconds.")

    assert cache_data_items == original_data_items
def worker_using_cacheable(
    config: storage.S3Configurations, dataset_id: str, dataset_version: str
) -> None:
    """Worker body: cache MNIST into S3 storage and verify the streamed copy.

    Builds the dataset through the ``cacheable`` decorator, streams it back
    from the cache, and checks it matches a freshly constructed reference
    dataset sample-for-sample.
    """
    s3_storage = storage.S3Storage(configurations=config)

    @s3_storage.cacheable(dataset_id=dataset_id, dataset_version=dataset_version)
    def make_dataset() -> dataref.LMDBDataRef:
        return util.make_mnist_test_dataset()  # type: ignore

    cached_stream = make_dataset().stream()
    streamed_dataset = tensorflow.make_tf_dataset(cached_stream)
    reference_dataset = util.make_mnist_test_dataset()

    matched_samples = util.compare_datasets(reference_dataset, streamed_dataset)
    assert matched_samples == 10000
    assert cached_stream.length == matched_samples