def test_video(streaming, ctx):
    rng = np.random.RandomState(0)
    shape = (256, 64, 64, 3)
    video_data = rng.randint(0, 256, size=np.prod(shape), dtype=np.uint8).reshape(shape)
    with ctx() as path:
        with bf.BlobFile(path, mode="wb", streaming=streaming) as wf:
            with imageio.get_writer(
                wf,
                format="ffmpeg",
                quality=None,
                codec="libx264rgb",
                pixelformat="bgr24",
                output_params=["-f", "mp4", "-crf", "0"],
            ) as w:
                for frame in video_data:
                    w.append_data(frame)

        with bf.BlobFile(path, mode="rb", streaming=streaming) as rf:
            with imageio.get_reader(rf, format="ffmpeg", input_params=["-f", "mp4"]) as r:
                for idx, frame in enumerate(r):
                    assert np.array_equal(frame, video_data[idx])

        with bf.BlobFile(path, mode="rb", streaming=streaming) as rf:
            container = av.open(rf)
            stream = container.streams.video[0]
            for idx, frame in enumerate(container.decode(stream)):
                assert np.array_equal(frame.to_image(), video_data[idx])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", required=True)
    parser.add_argument("--no-streaming-read-request", action="store_true")
    parser.add_argument("--buffer-size", default=8192, type=int)
    parser.add_argument("--size", default=1_000_000_000, type=int)
    args = parser.parse_args()

    bf.configure(use_streaming_read_request=not args.no_streaming_read_request)

    path = bf.join(args.path, "large.bin")
    data = (b"meow" * 249 + b"mew\n") * (args.size // 1000)
    with timer("write_large_file"):
        with bf.BlobFile(path, "wb") as f:
            f.write(data)

    start = time.time()
    with timer("read_large_file"):
        with bf.BlobFile(path, "rb", buffer_size=args.buffer_size) as f:
            f.read()
    end = time.time()
    print(f"MB/s {len(data) / 1e6 / (end - start)}")

    with timer("read_large_file_lines"):
        with bf.BlobFile(path, "r", buffer_size=args.buffer_size) as f:
            for _ in f:
                pass

    with timer("seek_speed"):
        with bf.BlobFile(path, "rb", buffer_size=args.buffer_size) as f:
            for i in range(min(10_000, args.size)):
                f.seek(i)
                f.read(1)
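# The benchmark above relies on a `timer` helper that is not defined in this
# snippet. A minimal sketch (an assumption for illustration, not the original
# implementation) would be a context manager that prints how long its block took:
import contextlib
import time


@contextlib.contextmanager
def timer(name):
    # print the elapsed wall-clock time for the wrapped block, labeled by `name`
    start = time.time()
    yield
    print(f"{name} elapsed {time.time() - start}")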
def cache_folder(name, dirpath, options, build_fn):
    if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
        # we don't have any credentials to do the caching, always build in this case
        print(f"building without cache for {name}")
        start = time.time()
        build_fn()
        print(f"build elapsed {time.time() - start}")
        return

    options_hash = hashlib.md5("|".join(options).encode("utf8")).hexdigest()
    cache_path = bf.join(f"gs://{GCS_BUCKET}", "cache", f"{name}-{options_hash}.tar")
    if os.path.exists(dirpath):
        print(f"cache for {name} found locally")
    elif bf.exists(cache_path):
        print(f"downloading cache for {name}: {cache_path}")
        start = time.time()
        with bf.BlobFile(cache_path, "rb") as f:
            with tarfile.open(fileobj=f, mode="r") as tf:
                tf.extractall()
        print(f"download elapsed {time.time() - start}")
    else:
        print(f"building cache for {name}")
        start = time.time()
        build_fn()
        print(f"cache build elapsed {time.time() - start}")

        print(f"uploading cache for {name}")
        start = time.time()
        if not bf.exists(cache_path):
            with bf.BlobFile(cache_path, "wb") as f:
                with tarfile.open(fileobj=f, mode="w") as tf:
                    tf.add(dirpath)
        print(f"upload elapsed {time.time() - start}")
async def test_copy(any_dir, other_any_dir):
    MIN_CHUNK_SIZE = 256 * 1024
    with open("/dev/random", "rb") as f:
        contents_medium = f.read(16 * MIN_CHUNK_SIZE)
    helpers.create_file(any_dir / "original_medium", contents_medium)

    contents_known_small = b"abcdefgh"
    helpers.create_file(any_dir / "original_small", contents_known_small)

    async with bbb.BoostExecutor(100) as e:
        with bbb.globals.configure(chunk_size=MIN_CHUNK_SIZE):
            await bbb.copyfile(any_dir / "original_medium", other_any_dir / "copied_medium", e)
            with blobfile.BlobFile(str(other_any_dir / "copied_medium"), "rb") as f:
                assert f.read() == contents_medium

            await bbb.copyfile(
                any_dir / "original_small",
                other_any_dir / "copied_small",
                e,
                size=len(contents_known_small),
            )
            with blobfile.BlobFile(str(other_any_dir / "copied_small"), "rb") as f:
                assert f.read() == contents_known_small
def test_large_file(ctx):
    contents = b"0" * 2**32
    with ctx() as path:
        with bf.BlobFile(path, "wb", streaming=True) as f:
            f.write(contents)
        with bf.BlobFile(path, "rb", streaming=True) as f:
            assert contents == f.read()
def test_read_stats(buffer_size, ctx):
    with ctx() as path:
        contents = b"meow!"
        with bf.BlobFile(path, "wb") as w:
            w.write(contents)

        with bf.BlobFile(path, "rb", buffer_size=buffer_size) as r:
            r.read(1)
        if buffer_size == 1:
            assert r.raw.bytes_read == 1  # type: ignore
        else:
            assert r.raw.bytes_read == len(contents)  # type: ignore

        with bf.BlobFile(path, "rb", buffer_size=buffer_size) as r:
            r.read(1)
            r.seek(4)
            r.read(1)
            r.seek(1000000)
            assert r.read(1) == b""
        if buffer_size == 1:
            assert r.raw.requests == 2  # type: ignore
            assert r.raw.bytes_read == 2  # type: ignore
        else:
            assert r.raw.requests == 1  # type: ignore
            assert r.raw.bytes_read == len(contents)  # type: ignore
def test_glob(ctx, parallel):
    contents = b"meow!"
    with ctx() as path:
        dirpath = bf.dirname(path)
        a_path = bf.join(dirpath, "ab")
        with bf.BlobFile(a_path, "wb") as w:
            w.write(contents)
        b_path = bf.join(dirpath, "bb")
        with bf.BlobFile(b_path, "wb") as w:
            w.write(contents)

        def assert_listing_equal(path, desired):
            desired = sorted([bf.join(dirpath, p) for p in desired])
            actual = sorted(list(bf.glob(path, parallel=parallel)))
            assert actual == desired, f"{actual} != {desired}"

        assert_listing_equal(bf.join(dirpath, "*b"), ["ab", "bb"])
        assert_listing_equal(bf.join(dirpath, "a*"), ["ab"])
        assert_listing_equal(bf.join(dirpath, "ab*"), ["ab"])
        assert_listing_equal(bf.join(dirpath, "*"), ["ab", "bb"])
        assert_listing_equal(bf.join(dirpath, "bb"), ["bb"])

        path = bf.join(dirpath, "test.txt")
        with bf.BlobFile(path, "wb") as w:
            w.write(contents)
        path = bf.join(dirpath, "subdir", "test.txt")
        bf.makedirs(bf.dirname(path))
        with bf.BlobFile(path, "wb") as f:
            f.write(contents)
        path = bf.join(dirpath, "subdir", "subsubdir", "test.txt")
        if "://" not in path:
            # implicit directory
            bf.makedirs(bf.dirname(path))
        with bf.BlobFile(path, "wb") as f:
            f.write(contents)

        assert_listing_equal(bf.join(dirpath, "*/test.txt"), ["subdir/test.txt"])
        assert_listing_equal(bf.join(dirpath, "*/*.txt"), ["subdir/test.txt"])

        if "://" in path:
            # local glob doesn't handle ** the same way as remote glob
            assert_listing_equal(
                bf.join(dirpath, "**.txt"),
                ["test.txt", "subdir/test.txt", "subdir/subsubdir/test.txt"],
            )
        else:
            assert_listing_equal(bf.join(dirpath, "**.txt"), ["test.txt"])

        assert_listing_equal(bf.join(dirpath, "*/test"), [])
        assert_listing_equal(bf.join(dirpath, "subdir/test.txt"), ["subdir/test.txt"])

        # directories
        assert_listing_equal(bf.join(dirpath, "*"), ["ab", "bb", "subdir", "test.txt"])
        assert_listing_equal(bf.join(dirpath, "subdir"), ["subdir"])
        assert_listing_equal(bf.join(dirpath, "subdir/"), ["subdir"])
        assert_listing_equal(bf.join(dirpath, "*/"), ["subdir"])
        assert_listing_equal(bf.join(dirpath, "*dir"), ["subdir"])
        assert_listing_equal(bf.join(dirpath, "subdir/*dir"), ["subdir/subsubdir"])
        assert_listing_equal(bf.join(dirpath, "subdir/*dir/"), ["subdir/subsubdir"])
        assert_listing_equal(bf.join(dirpath, "su*ir/*dir/"), ["subdir/subsubdir"])
def test_append(ctx):
    contents = b"meow!\n"
    additional_contents = b"purr\n"
    with ctx() as path:
        with bf.BlobFile(path, "ab", streaming=False) as w:
            w.write(contents)
        with bf.BlobFile(path, "ab", streaming=False) as w:
            w.write(additional_contents)
        with bf.BlobFile(path, "rb") as r:
            assert r.read() == contents + additional_contents
def test_read_write(ctx, streaming):
    contents = b"meow!\npurr\n"
    with ctx() as path:
        path = bf.join(path, "a folder", "a.file")
        bf.makedirs(bf.dirname(path))
        with bf.BlobFile(path, "wb", streaming=streaming) as w:
            w.write(contents)
        with bf.BlobFile(path, "rb", streaming=streaming) as r:
            assert r.read() == contents
        with bf.BlobFile(path, "rb", streaming=streaming) as r:
            lines = list(r)
            assert b"".join(lines) == contents
def test_listdir(ctx):
    contents = b"meow!"
    with ctx() as path:
        dirpath = bf.dirname(path)
        a_path = bf.join(dirpath, "a")
        with bf.BlobFile(a_path, "wb") as w:
            w.write(contents)
        b_path = bf.join(dirpath, "b")
        with bf.BlobFile(b_path, "wb") as w:
            w.write(contents)
        bf.makedirs(bf.join(dirpath, "c"))
        assert sorted(list(bf.listdir(dirpath))) == ["a", "b", "c"]
def test_md5(ctx):
    contents = b"meow!"
    meow_hash = hashlib.md5(contents).hexdigest()
    with ctx() as path:
        _write_contents(path, contents)
        assert bf.md5(path) == meow_hash
        with bf.BlobFile(path, "wb") as f:
            f.write(contents)
        assert bf.md5(path) == meow_hash
        with bf.BlobFile(path, "wb") as f:
            f.write(contents)
        assert bf.md5(path) == meow_hash
def test_concurrent_write_gcs():
    with _get_temp_gcs_path() as path:
        outer_contents = b"miso" * (2**20 + 1)
        inner_contents = b"momo" * (2**20 + 1)
        with bf.BlobFile(path, "wb", streaming=True) as f:
            f.write(outer_contents)
            with bf.BlobFile(path, "wb", streaming=True) as f:
                f.write(inner_contents)

        # the outer write will finish last and overwrite the inner one
        # the last writer to finish wins with this setup
        with bf.BlobFile(path, "rb") as f:
            assert f.read() == outer_contents
def test_rmtree(ctx):
    contents = b"meow!"
    with ctx() as path:
        root = bf.dirname(path)
        destroy_path = bf.join(root, "destroy")
        bf.makedirs(destroy_path)
        save_path = bf.join(root, "save")
        bf.makedirs(save_path)

        # implicit dir
        if "://" not in path:
            bf.makedirs(bf.join(destroy_path, "adir"))
        with bf.BlobFile(bf.join(destroy_path, "adir/b"), "wb") as w:
            w.write(contents)
        # explicit dir
        bf.makedirs(bf.join(destroy_path, "bdir"))
        with bf.BlobFile(bf.join(destroy_path, "bdir/b"), "wb") as w:
            w.write(contents)

        bf.makedirs(bf.join(save_path, "somedir"))
        with bf.BlobFile(bf.join(save_path, "somefile"), "wb") as w:
            w.write(contents)

        def assert_listing_equal(path, desired):
            actual = list(bf.walk(path))
            # ordering of os walk is weird, only compare sorted order
            assert sorted(actual) == sorted(desired), f"{actual} != {desired}"

        assert_listing_equal(
            root,
            [
                (root, ["destroy", "save"], []),
                (destroy_path, ["adir", "bdir"], []),
                (bf.join(destroy_path, "adir"), [], ["b"]),
                (bf.join(destroy_path, "bdir"), [], ["b"]),
                (save_path, ["somedir"], ["somefile"]),
                (bf.join(save_path, "somedir"), [], []),
            ],
        )

        bf.rmtree(destroy_path)
        assert_listing_equal(
            root,
            [
                (root, ["save"], []),
                (save_path, ["somedir"], ["somefile"]),
                (bf.join(save_path, "somedir"), [], []),
            ],
        )
def test_az_path():
    contents = b"meow!\npurr\n"
    with _get_temp_as_path() as path:
        path = _convert_https_to_az(path)
        path = bf.join(path, "a folder", "a.file")
        path = _convert_https_to_az(path)
        bf.makedirs(_convert_https_to_az(bf.dirname(path)))
        with bf.BlobFile(path, "wb") as w:
            w.write(contents)
        with bf.BlobFile(path, "rb") as r:
            assert r.read() == contents
        with bf.BlobFile(path, "rb") as r:
            lines = list(r)
            assert b"".join(lines) == contents
def test_concurrent_write_as():
    with _get_temp_as_path() as path:
        outer_contents = b"miso" * (2**20 + 1)
        inner_contents = b"momo" * (2**20 + 1)
        # the inner write will invalidate the outer one, the last writer
        # to start wins with this setup
        with pytest.raises(bf.ConcurrentWriteFailure):
            with bf.BlobFile(path, "wb", streaming=True) as f:
                f.write(outer_contents)
                with bf.BlobFile(path, "wb", streaming=True) as f:
                    f.write(inner_contents)

        # since the outer write failed, the inner contents are what remain
        with bf.BlobFile(path, "rb") as f:
            assert f.read() == inner_contents
def save(self):
    def save_checkpoint(rate, params):
        state_dict = self.mp_trainer.master_params_to_state_dict(params)
        if dist.get_rank() == 0:
            logger.log(f"saving model {rate}...")
            if not rate:
                filename = f"model{(self.step+self.resume_step):06d}.pt"
            else:
                filename = f"ema_{rate}_{(self.step+self.resume_step):06d}.pt"
            with bf.BlobFile(bf.join(get_blob_logdir(), filename), "wb") as f:
                th.save(state_dict, f)

    save_checkpoint(0, self.mp_trainer.master_params)
    for rate, params in zip(self.ema_rate, self.ema_params):
        save_checkpoint(rate, params)

    if dist.get_rank() == 0:
        with bf.BlobFile(
            bf.join(get_blob_logdir(), f"opt{(self.step+self.resume_step):06d}.pt"),
            "wb",
        ) as f:
            th.save(self.opt.state_dict(), f)

    dist.barrier()
def test_overwrite_while_reading(ctx):
    chunk_size = 2**20
    contents = b"\x00" * chunk_size * 2
    alternative_contents = b"\xFF" * chunk_size * 4
    with ctx() as path:
        with bf.BlobFile(path, "wb") as f:
            f.write(contents)
        with bf.BlobFile(path, "rb") as f:
            read_contents = f.read(chunk_size)
            with bf.BlobFile(path, "wb") as f2:
                f2.write(alternative_contents)
            # close underlying connection
            f.raw._f = None  # type: ignore
            read_contents += f.read(chunk_size)
            assert (
                read_contents
                == contents[:chunk_size] + alternative_contents[chunk_size : chunk_size * 2]
            )
def __getitem__(self, idx):
    path = self.local_images[idx]
    with bf.BlobFile(path, "rb") as f:
        pil_image = Image.open(f)
        pil_image.load()

    # We are not on a new enough PIL to support the `reducing_gap`
    # argument, which uses BOX downsampling at powers of two first.
    # Thus, we do it by hand to improve downsample quality.
    while min(*pil_image.size) >= 2 * self.resolution:
        pil_image = pil_image.resize(
            tuple(x // 2 for x in pil_image.size), resample=Image.BOX
        )

    scale = self.resolution / min(*pil_image.size)
    pil_image = pil_image.resize(
        tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
    )

    arr = np.array(pil_image.convert("RGB"))
    crop_y = (arr.shape[0] - self.resolution) // 2
    crop_x = (arr.shape[1] - self.resolution) // 2
    arr = arr[crop_y : crop_y + self.resolution, crop_x : crop_x + self.resolution]
    arr = arr.astype(np.float32) / 127.5 - 1

    out_dict = {}
    if self.local_classes is not None:
        out_dict["y"] = np.array(self.local_classes[idx], dtype=np.int64)
    return np.transpose(arr, [2, 0, 1]), out_dict
def loadnpy(url):
    import blobfile
    from io import BytesIO

    # read the whole blob, then let numpy parse it from an in-memory buffer
    with blobfile.BlobFile(url, "rb") as fp:
        x = np.load(BytesIO(fp.read()))
    return x
def all_examples():
    for file_name in input_file_names:
        with bf.BlobFile(file_name, "r") as f:
            for line in f:
                encoded_example = json.loads(line)
                example = jsonl_encoding.decode_example(encoded_example)
                yield example
def test_composite_objects():
    with _get_temp_gcs_path() as remote_path:
        with _get_temp_local_path() as local_path:
            contents = b"0" * 2 * 2**20
            with open(local_path, "wb") as f:
                f.write(contents)

            sp.run(
                [
                    "gsutil",
                    "-o",
                    "GSUtil:parallel_composite_upload_threshold=1M",
                    "cp",
                    local_path,
                    remote_path,
                ],
                check=True,
            )

            assert hashlib.md5(contents).hexdigest() == bf.md5(remote_path)
            assert hashlib.md5(contents).hexdigest() == bf.md5(remote_path)
            with tempfile.TemporaryDirectory() as tmpdir:
                with bf.BlobFile(remote_path, "rb", cache_dir=tmpdir, streaming=False) as f:
                    assert f.read() == contents
def __init__(self, bpe_path=None):
    if bpe_path is None:
        bpe_path = blobfile.BlobFile(
            'https://openaipublic.blob.core.windows.net/clip/bpe_simple_vocab_16e6.txt', 'r')
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    merges = bpe_path.read().split('\n')
    merges = merges[1:49152 - 256 - 2 + 1]
    merges = [tuple(merge.split()) for merge in merges]
    vocab = list(bytes_to_unicode().values())
    vocab = vocab + [v + '</w>' for v in vocab]
    for merge in merges:
        vocab.append(''.join(merge))
    vocab.extend(['<|startoftext|>', '<|endoftext|>'])
    self.encoder = dict(zip(vocab, range(len(vocab))))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {
        '<|startoftext|>': '<|startoftext|>',
        '<|endoftext|>': '<|endoftext|>'
    }
    # note: the \p{...} character classes below require the third-party `regex`
    # module (typically imported as `re`); the stdlib `re` does not support them
    self.pat = re.compile(
        r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
        re.IGNORECASE)
# open_file_cached yields from inside a `with` block, so it is assumed to be used
# as a context manager; the contextlib.contextmanager decorator is added on that
# assumption.
@contextlib.contextmanager
def open_file_cached(path, mode="r"):
    """
    Given a GCS path url, caches the contents locally.

    WARNING: only use this function if contents under the path won't change!
    """
    with bf.BlobFile(path, mode=mode, cache_dir="/tmp/bf-file-cache", streaming=False) as f:
        yield f
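# Usage sketch for open_file_cached (hypothetical path, and assumes the
# contextmanager wrapper above):
#
#     with open_file_cached("gs://some-bucket/vocab.json") as f:
#         vocab = json.load(f)
#
# Reads go through /tmp/bf-file-cache, so repeated opens of the same path reuse
# the local copy instead of re-downloading, which is why the remote contents
# must not change.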
def verify_hash(ref_hash, path):
    with bf.BlobFile(path, "rb") as f:
        m = hashlib.md5()
        while True:
            block = f.read(CHUNK_SIZE)
            if block == b"":
                break
            m.update(block)
        assert m.hexdigest() == ref_hash
def test_cache_dir(ctx):
    cache_dir = tempfile.mkdtemp()
    contents = b"meow!"
    alternative_contents = b"purr!"
    with ctx() as path:
        with bf.BlobFile(path, mode="wb") as f:
            f.write(contents)
        with bf.BlobFile(path, mode="rb", streaming=False, cache_dir=cache_dir) as f:
            assert f.read() == contents
        content_hash = hashlib.md5(contents).hexdigest()
        cache_path = bf.join(cache_dir, content_hash, bf.basename(path))
        with open(cache_path, "rb") as f:
            assert f.read() == contents
        # alter the cached file to make sure we are not re-reading the remote file
        with open(cache_path, "wb") as f:
            f.write(alternative_contents)
        with bf.BlobFile(path, mode="rb", streaming=False, cache_dir=cache_dir) as f:
            assert f.read() == alternative_contents
def save_checkpoint(rate, params):
    state_dict = self._master_params_to_state_dict(params)
    if dist.get_rank() == 0:
        logger.log(f"saving model {rate}...")
        if not rate:
            filename = f"model{(self.step+self.resume_step):06d}.pt"
        else:
            filename = f"ema_{rate}_{(self.step+self.resume_step):06d}.pt"
        with bf.BlobFile(bf.join(get_blob_logdir(), filename), "wb") as f:
            th.save(state_dict, f)
def test_walk(ctx, topdown):
    contents = b"meow!"
    with ctx() as path:
        dirpath = bf.dirname(path)
        a_path = bf.join(dirpath, "a")
        with bf.BlobFile(a_path, "wb") as w:
            w.write(contents)
        bf.makedirs(bf.join(dirpath, "c/d"))
        b_path = bf.join(dirpath, "c/d/b")
        with bf.BlobFile(b_path, "wb") as w:
            w.write(contents)
        expected = [
            (dirpath, ["c"], ["a"]),
            (bf.join(dirpath, "c"), ["d"], []),
            (bf.join(dirpath, "c", "d"), [], ["b"]),
        ]
        if not topdown:
            expected = list(reversed(expected))
        assert list(bf.walk(dirpath, topdown=topdown)) == expected
def load_state_dict(path, **kwargs):
    """
    Load a PyTorch file without redundant fetches across MPI ranks.
    """
    if MPI.COMM_WORLD.Get_rank() == 0:
        with bf.BlobFile(path, "rb") as f:
            data = f.read()
    else:
        data = None
    data = MPI.COMM_WORLD.bcast(data)
    return th.load(io.BytesIO(data), **kwargs)
def test_scandir(ctx):
    contents = b"meow!"
    with ctx() as path:
        dirpath = bf.dirname(path)
        a_path = bf.join(dirpath, "a")
        with bf.BlobFile(a_path, "wb") as w:
            w.write(contents)
        b_path = bf.join(dirpath, "b")
        with bf.BlobFile(b_path, "wb") as w:
            w.write(contents)
        bf.makedirs(bf.join(dirpath, "c"))
        entries = sorted(list(bf.scandir(dirpath)))
        assert [e.name for e in entries] == ["a", "b", "c"]
        assert [e.path for e in entries] == [bf.join(dirpath, name) for name in ["a", "b", "c"]]
        assert [e.is_dir for e in entries] == [False, False, True]
        assert [e.is_file for e in entries] == [True, True, False]
        assert entries[0].stat.size == len(contents)
        assert entries[1].stat.size == len(contents)
        assert entries[2].stat is None
def test_azure_metadata(ctx):
    # make sure metadata is preserved when opening a file for writing
    # which clears uncommitted blocks
    contents = b"meow!"
    with ctx() as path:
        with bf.BlobFile(path, "wb") as f:
            f.write(contents)
        bf.set_mtime(path, 1)
        _isfile, orig_metadata = ops._azure_isfile(path)
        time.sleep(5)
        with bf.BlobFile(path, "wb", streaming=True) as f:
            _isfile, new_metadata = ops._azure_isfile(path)
            keys = set(orig_metadata.keys()).union(new_metadata.keys())
            for key in sorted(keys):
                orig_val = orig_metadata.get(key)
                new_val = new_metadata.get(key)
                if key not in ["Date", "ETag", "Last-Modified", "x-ms-request-id"]:
                    assert orig_val == new_val