def bench_local(size, rep, fs, output, read_size=-1, read_len=None):
    """Benchmark reading a local copy of the file: stage it from S3 with the
    AWS CLI into /dev/shm (or onto local disk when fs == "local"), then time
    open() and the chunked reads."""
    block_size = -1
    path = f"/dev/shm/rand{size}.out"

    if read_len is None:
        read_len = size
    if read_size == -1:
        read_size = size

    if fs == "local":
        path = f"/home/ec2-user/rand{size}.out"

    # get file from aws
    sp.run(["aws", "s3", "cp", f"s3://{s3_path}{size}.out", path])

    # clear caches
    helpers.drop_caches()

    # read file and store benchmark in variable
    start_open = perf_counter()
    with open(path, "rb") as f:
        end_open = perf_counter()
        end = read_chunks(f, read_size, read_len, fs, rep, size, block_size, output)

    write_benchmark(output, fs, rep, "total", size, end - start_open,
                    block_size, read_size, read_len)
    write_benchmark(output, fs, rep, "open", size, end_open - start_open,
                    block_size, read_size, read_len)

    # cleanup
    os.unlink(path)
def bench_aws(size, rep, output, block_size=None, read_size=-1, read_len=None):
    """Benchmark reading the file directly from S3 through s3fs, timing open()
    and the chunked reads."""
    fs = "aws"

    if read_len is None:
        read_len = size
    if read_size == -1:
        read_size = size
    if block_size is None:
        block_size = size

    # clear caches
    helpers.drop_caches()

    s3 = s3fs.S3FileSystem()
    s3.invalidate_cache()

    start_open = perf_counter()
    with s3.open(f"{s3_path}{size}.out", "rb", block_size=block_size) as f:
        end_open = perf_counter()
        end = read_chunks(f, read_size, read_len, fs, rep, size, block_size, output)

    write_benchmark(output, fs, rep, "total", size, end - start_open,
                    block_size, read_size, read_len)
    write_benchmark(output, fs, rep, "open", size, end_open - start_open,
                    block_size, read_size, read_len)
def read_prefetched(path, lazy, block_size, prefetch_storage,
                    bfile="read_file.bench", header_bytes=1000):
    helpers.drop_caches()

    fs = S3PrefetchFileSystem()
    fs.invalidate_cache()

    with fs.open(path, block_size=block_size, prefetch_storage=prefetch_storage,
                 header_bytes=header_bytes) as f:
        data = read_trk(f, lazy, bfile=bfile)
        print(data)
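# A minimal usage sketch for read_prefetched (not part of the original
# scripts): the single-file selection, the 64 MiB block size, and the
# /dev/shm prefetch budget below are illustrative assumptions.
def example_read_prefetched():
    fs = S3FileSystem()
    files = fs.glob("hydi-tractography/hydi_tracks.*.trk")[:1]
    read_prefetched(
        ["vhs-bucket/hydi-header.trk"] + files,
        lazy=True,
        block_size=64 * 2 ** 20,
        prefetch_storage=[("/dev/shm", 1024)],
    )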
def main(prefetch_storage, block_size, n_files, lazy, reps, types, bfile):
    """Compare .trk reads from in-memory copies (/dev/shm), s3fs, and the
    prefetch filesystem over the requested number of repetitions."""
    types = list(types)
    header = ["vhs-bucket/hydi-header.trk"]

    fs = S3FileSystem()
    files = fs.glob("hydi-tractography/hydi_tracks.*.trk")[:n_files]

    results_path = "../results/us-west-2-xlarge/"

    if bfile == "":
        bfile = os.path.join(
            results_path,
            f"readnib_{n_files}f_{'lazy' if lazy else 'nolazy'}_{reps}r_{block_size}b.out",
        )
    else:
        bfile = os.path.join(results_path, bfile)

    helpers.setup_bench(bfile)

    for _ in range(reps):
        random.shuffle(types)

        for t in types:
            print(t)
            helpers.drop_caches()

            if t == "mem":
                mem_files = [
                    os.path.join("/dev/shm", os.path.basename(p)) for p in files
                ]
                fs.get(files, mem_files)
                helpers.drop_caches()

                read_mem(mem_files, lazy, bfile=bfile)

                for p in mem_files:
                    os.unlink(p)
            elif t == "prefetch":
                read_prefetched(
                    header + files, lazy, block_size, prefetch_storage, bfile=bfile
                )
            else:
                read_s3fs(files, lazy, block_size, bfile=bfile)
def main():
    start = int(sys.argv[1])
    end = int(sys.argv[2])
    rep = int(sys.argv[3])
    nthreads = int(sys.argv[4])

    bs = 64 * 2 ** 20
    lazy = True
    header = ["vhs-bucket/hydi-header.trk"]

    helpers.drop_caches()

    fs = S3FileSystem()
    files = fs.glob("hydi-tractography/hydi_tracks.*.trk")[start:end]

    bfile = (
        f"../results/us-west-2-xlarge/"
        f"read_s3fs_{nthreads}parallel_{start}-{end}_{rep}.csv"
    )
    helpers.setup_bench(bfile)

    read_s3fs(files, lazy, bs, bfile=bfile)
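# Hypothetical entry point for the parallel s3fs read script above; the script
# name in the example invocation is an assumption. The arguments are the start
# and end file indices, the repetition id, and the thread count, e.g.:
#   python read_s3fs_parallel.py 0 10 1 4
if __name__ == "__main__":
    main()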
def bench_prefetch(size, rep, output, block_size=None,
                   prefetch_storage=[("/dev/shm", 5 * 1024 ** 2)],
                   read_size=-1, read_len=None):
    """Benchmark reading the file from S3 through the S3PrefetchFileSystem,
    timing open() and the chunked reads."""
    fs = "pf"

    if read_len is None:
        read_len = size
    if read_size == -1:
        read_size = size
    if block_size is None:
        block_size = size

    # clear caches
    helpers.drop_caches()

    s3 = S3PrefetchFileSystem()

    start_open = perf_counter()
    with s3.open(f"{s3_path}{size}.out", "rb", block_size=block_size,
                 prefetch_storage=prefetch_storage) as f:
        end_open = perf_counter()
        end = read_chunks(f, read_size, read_len, fs, rep, size, block_size, output)

    write_benchmark(output, fs, rep, "total", size, end - start_open,
                    block_size, read_size, read_len)
    write_benchmark(output, fs, rep, "open", size, end_open - start_open,
                    block_size, read_size, read_len)
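# A hypothetical driver for the three byte-range read benchmarks above
# (bench_local, bench_aws, bench_prefetch); the file sizes, repetition count,
# and output path are illustrative only.
def run_read_benchmarks(output="read_benchmarks.csv", reps=3):
    sizes = [2 ** 20, 2 ** 25, 2 ** 30]  # 1 MiB, 32 MiB, 1 GiB (assumed)
    for rep in range(reps):
        for size in sizes:
            bench_local(size, rep, "mem", output)    # file staged in /dev/shm
            bench_local(size, rep, "local", output)  # file staged on local disk
            bench_aws(size, rep, output)             # read directly via s3fs
            bench_prefetch(size, rep, output)        # read via S3PrefetchFileSystem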
def main():
    """Compare byte-level and nibabel-based (_nib) reads from memory, s3fs,
    and the prefetch filesystem for 1 to n_files files."""
    bfile = "../results/us-west-2-xlarge/readcmp-1-5f.out"
    reps = 5
    n_files = 5
    block_size = 64 * 2 ** 20
    prefetch_storage = [("/dev/shm", 1024)]
    lazy = False
    types = ["mem", "mem_nib", "s3fs", "s3fs_nib", "prefetch", "prefetch_nib"]

    fs = S3FileSystem()
    header = ["vhs-bucket/hydi-header.trk"]
    all_paths = fs.glob("hydi-tractography/hydi*")
    all_mem_paths = [
        os.path.join("/dev/shm", os.path.basename(p)) for p in all_paths
    ]

    helpers.setup_bench(bfile)

    for _ in range(reps):
        for i in range(1, n_files + 1):
            paths = all_paths[0:i]
            mem_paths = all_mem_paths[0:i]
            random.shuffle(types)

            for t in types:
                helpers.drop_caches()

                if "mem" in t:
                    fs.get(paths, mem_paths)
                    helpers.drop_caches()

                    if t == "mem":
                        read_mem_bytes(mem_paths, lazy, bfile=bfile)
                    else:
                        read_mem_nib(mem_paths, lazy, bfile=bfile)

                    for p in mem_paths:
                        os.unlink(p)
                else:
                    if "s3fs_nib" in t:
                        read_s3fs_nib(paths, lazy, block_size, bfile=bfile)
                    elif "s3fs" in t:
                        read_s3fs_bytes(paths, lazy, block_size, bfile=bfile)
                    elif "prefetch_nib" in t:
                        read_prefetch_nib(
                            header + paths,
                            lazy,
                            block_size,
                            prefetch_storage,
                            bfile=bfile,
                        )
                    else:
                        read_prefetch_bytes(
                            header + paths,
                            lazy,
                            block_size,
                            prefetch_storage,
                            bfile=bfile,
                        )
def main(
    file_type,
    prefetch_storage,
    block_size,
    n_files,
    reps,
    types,
    output_dir,
    nbins,
    dask,
    nworkers,
    instance,
    lazy,
):
    """Run the histogram benchmark over s3fs and the prefetch filesystem,
    either sequentially or distributed over a Dask LocalCluster."""
    types = list(types)
    fs = S3FileSystem()

    if file_type == "orig":
        header = ["vhs-bucket/hydi-header.trk"]
        files = fs.glob("hydi-tractography/hydi_tracks.*.trk")[:n_files]
    else:
        header = ["vhs-bucket/shards/hydi_shard_header.trk"]
        files = fs.glob("vhs-bucket/shards/hydi_tracks.*.trk")[:n_files]

    results_path = op.join("../results/", instance)
    makedirs(results_path, exist_ok=True)

    bfile = op.join(
        results_path,
        f"histogram_{file_type}_{n_files}f_{reps}r_{block_size}b_{nbins}bins_"
        f"{str(nworkers) + 'dask' if dask else 'seq'}_{'lazy' if lazy else 'nolazy'}.out",
    )

    helpers.setup_bench(bfile)

    if dask:
        cluster = LocalCluster(n_workers=nworkers)
        client = Client(cluster)

    for r in range(reps):
        # random.shuffle(types)
        for t in types:
            print(t)
            helpers.drop_caches()

            if dask:
                results = []

                if t == "s3fs":
                    print(t)
                    for i in range(nworkers):
                        f_per_w = n_files // nworkers
                        print(files[i * f_per_w:(i + 1) * f_per_w])
                        seg = client.submit(
                            histogram_s3fs,
                            files[i * f_per_w:(i + 1) * f_per_w],
                            lazy,
                            block_size,
                            nbins=nbins,
                            output_dir=output_dir,
                            bfile=bfile,
                        )
                        results.append(seg)
                else:
                    print(t)
                    for i in range(nworkers):
                        f_per_w = n_files // nworkers
                        print(files[i * f_per_w:(i + 1) * f_per_w])
                        seg = client.submit(
                            histogram_prefetch,
                            header + files[i * f_per_w:(i + 1) * f_per_w],
                            lazy,
                            block_size,
                            prefetch_storage,
                            nbins=nbins,
                            output_dir=output_dir,
                            bfile=bfile,
                        )
                        results.append(seg)

                print(client.gather(results))
            else:
                if t == "s3fs":
                    histogram_s3fs(
                        files,
                        lazy,
                        block_size,
                        nbins=nbins,
                        output_dir=output_dir,
                        bfile=bfile,
                    )
                else:
                    histogram_prefetch(
                        header + files,
                        lazy,
                        block_size,
                        prefetch_storage,
                        nbins=nbins,
                        output_dir=output_dir,
                        bfile=bfile,
                    )
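# Worked example of the per-worker file split used above (values illustrative):
# with n_files = 10 and nworkers = 4, f_per_w = 10 // 4 = 2, so the workers
# receive files[0:2], files[2:4], files[4:6], and files[6:8]; when nworkers
# does not divide n_files evenly, the trailing files (here files[8:10]) are
# never submitted.
def split_files_example():
    n_files, nworkers = 10, 4
    files = [f"file{i}.trk" for i in range(n_files)]
    f_per_w = n_files // nworkers
    return [files[i * f_per_w:(i + 1) * f_per_w] for i in range(nworkers)]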
def main(prefetch_storage, block_size, n_files, reps, types, nworkers):
    """Run the RecoBundles segmentation benchmark on a Dask LocalCluster,
    comparing s3fs and the prefetch filesystem."""
    types = list(types)
    header = ["vhs-bucket/hydi-header.trk"]

    fs = S3FileSystem()
    files = fs.glob("hydi-tractography/hydi_tracks.*.trk")[:n_files]
    print(files)

    results_path = "../results/"
    bfile = op.join(
        results_path,
        f"real_{n_files}f_{reps}r_{block_size}b_{nworkers}w-recobundles.out",
    )

    helpers.setup_bench(bfile)

    cluster = LocalCluster(n_workers=nworkers, resources={"CPU": 3})
    client = Client(cluster)

    for r in range(reps):
        # random.shuffle(types)
        for t in types:
            print("***", t, "***")
            helpers.drop_caches()
            print(client)

            data = {}
            results = []

            if t == "s3fs":
                print(t)
                for i in range(nworkers):
                    f_per_w = n_files // nworkers
                    print(files[i * f_per_w:(i + 1) * f_per_w])
                    seg = client.submit(
                        segmentation_s3fs,
                        files[i * f_per_w:(i + 1) * f_per_w],
                        False,
                        block_size,
                        **data,
                        bfile=bfile,
                    )
                    results.append(seg)
            else:
                print(t)
                for i in range(nworkers):
                    f_per_w = n_files // nworkers
                    print(files[i * f_per_w:(i + 1) * f_per_w])
                    seg = client.submit(
                        segmentation_prefetch,
                        header + files[i * f_per_w:(i + 1) * f_per_w],
                        False,
                        block_size,
                        prefetch_storage,
                        **data,
                        bfile=bfile,
                    )
                    results.append(seg)

            print(client.gather(results))
            system("pkill -f joblib")