Example no. 1
def bench_local(size, rep, fs, output, read_size=-1, read_len=None):

    block_size = -1
    path = f"/dev/shm/rand{size}.out"

    if read_len is None:
        read_len = size

    if read_size == -1:
        read_size = size

    if "local" == fs:
        path = f"/home/ec2-user/rand{size}.out"

    # get file from aws
    sp.run(["aws", "s3", "cp", f"s3://{s3_path}{size}.out", path])

    # clear caches
    helpers.drop_caches()

    # read file and store benchmark in variable
    start_open = perf_counter()
    with open(path, "rb") as f:
        end_open = perf_counter()

        end = read_chunks(f, read_size, read_len, fs, rep, size, block_size, output)

    write_benchmark(output, fs, rep, "total", size, end - start_open, block_size, read_size, read_len)
    write_benchmark(output, fs, rep, "open", size, end_open - start_open, block_size, read_size, read_len)

    # cleanup
    os.unlink(path)
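
All of the bench_* functions delegate the actual I/O to a read_chunks helper and record measurements with write_benchmark, neither of which is shown in these examples. The following is a minimal sketch of what they might look like, assuming write_benchmark appends one CSV row per measurement; the field order, column names, and the per-chunk logging are assumptions, not the original implementation.

from time import perf_counter


def read_chunks(f, read_size, read_len, fs, rep, size, block_size, output):
    # Hypothetical sketch: read `read_len` bytes in `read_size` chunks,
    # timing and logging each read, then return the final timestamp so the
    # caller can compute the total elapsed time.
    total = 0
    while total < read_len:
        start = perf_counter()
        data = f.read(read_size)
        end = perf_counter()
        if not data:
            break
        total += len(data)
        write_benchmark(output, fs, rep, "read", size, end - start, block_size, read_size, read_len)
    return perf_counter()


def write_benchmark(output, fs, rep, operation, size, elapsed, block_size, read_size, read_len):
    # Hypothetical sketch: append one CSV row per measurement.
    with open(output, "a") as out:
        out.write(f"{fs},{rep},{operation},{size},{elapsed},{block_size},{read_size},{read_len}\n")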
Example no. 2
def bench_aws(size, rep, output, block_size=None, read_size=-1, read_len=None):
    fs = "aws"

    if read_len is None:
        read_len = size

    if read_size == -1:
        read_size = size

    if block_size is None:
        block_size = size

    # clear caches
    helpers.drop_caches()

    s3 = s3fs.S3FileSystem()
    s3.invalidate_cache()

    start_open = perf_counter()

    with s3.open(f"{s3_path}{size}.out", "rb", block_size=block_size) as f:
        end_open = perf_counter()
        end = read_chunks(f, read_size, read_len, fs, rep, size, block_size, output)

    write_benchmark(output, fs, rep, "total", size, end - start_open, block_size, read_size, read_len)
    write_benchmark(output, fs, rep, "open", size, end_open - start_open, block_size, read_size, read_len)
Example no. 3
def read_prefetched(path, lazy, block_size, prefetch_storage, bfile="read_file.bench", header_bytes=1000):
    helpers.drop_caches()
    fs = S3PrefetchFileSystem()
    fs.invalidate_cache()

    with fs.open(path, block_size=block_size, prefetch_storage=prefetch_storage, header_bytes=header_bytes) as f:
        data = read_trk(f, lazy, bfile=bfile)

    print(data)
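
read_trk is not shown either; since the inputs are TrackVis .trk streams, it plausibly wraps nibabel's streamlines loader. A minimal sketch under that assumption follows; the function name, timing fields, and output format are guesses, not the original code.

from time import perf_counter

import nibabel as nib


def read_trk(fileobj, lazy, bfile="read_file.bench"):
    # Hypothetical sketch: load the tractogram with nibabel (optionally
    # lazily), force the streamlines to be read, and log the elapsed time.
    start = perf_counter()
    trk = nib.streamlines.load(fileobj, lazy_load=lazy)
    streamlines = list(trk.streamlines)
    elapsed = perf_counter() - start
    with open(bfile, "a") as out:
        out.write(f"read_trk,{lazy},{len(streamlines)},{elapsed}\n")
    return streamlines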
Example no. 4
def main(prefetch_storage, block_size, n_files, lazy, reps, types, bfile):

    types = list(types)
    header = ["vhs-bucket/hydi-header.trk"]

    fs = S3FileSystem()
    files = fs.glob("hydi-tractography/hydi_tracks.*.trk")[:n_files]

    results_path = "../results/us-west-2-xlarge/"

    if bfile == "":
        bfile = os.path.join(
            results_path,
            f"readnib_{n_files}f_{'lazy' if lazy else 'nolazy'}_{reps}r_{block_size}b.out",
        )
    else:
        bfile = os.path.join(results_path, bfile)

    helpers.setup_bench(bfile)
    for _ in range(reps):

        random.shuffle(types)

        for t in types:

            print(t)
            helpers.drop_caches()

            if t == "mem":
                mem_files = [
                    os.path.join("/dev/shm", os.path.basename(p)) for p in files
                ]
                fs.get(files, mem_files)
                helpers.drop_caches()
                read_mem(mem_files, lazy, bfile=bfile)

                for p in mem_files:
                    os.unlink(p)

            elif t == "prefetch":
                read_prefetched(
                    header + files, lazy, block_size, prefetch_storage, bfile=bfile
                )

            else:
                read_s3fs(files, lazy, block_size, bfile=bfile)
Example no. 5
def main():
    
    start = int(sys.argv[1])
    end = int(sys.argv[2])
    rep = int(sys.argv[3])
    nthreads = int(sys.argv[4])

    bs = 64 * 2 ** 20
    lazy = True
    header = ["vhs-bucket/hydi-header.trk"]

    helpers.drop_caches()
    fs = S3FileSystem()
    files = fs.glob("hydi-tractography/hydi_tracks.*.trk")[start:end]
    bfile=f"../results/us-west-2-xlarge/read_s3fs_{nthreads}parallel_{start}-{end}_{rep}.csv"

    helpers.setup_bench(bfile)
    read_s3fs(files, lazy, bs, bfile=bfile)
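
This script reads its parameters from sys.argv, so it presumably runs as a standalone module. The entry-point guard and an example invocation (the script name here is assumed, not taken from the source) would look like:

# Assumed entry point for the script above.
if __name__ == "__main__":
    main()

# Example invocation: glob indices 0-5, repetition 1, nthreads=4
# (nthreads is only used in the output file name in this example):
#   python read_s3fs_parallel.py 0 5 1 4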
Example no. 6
def bench_prefetch(size, rep, output, block_size=None,
                   prefetch_storage=[("/dev/shm", 5 * 1024**2)],
                   read_size=-1, read_len=None):
    fs = "pf"

    if read_len is None:
        read_len = size

    if read_size == -1:
        read_size = size

    if block_size is None:
        block_size = size

    # clear caches
    helpers.drop_caches()

    s3 = S3PrefetchFileSystem()

    start_open = perf_counter()
    with s3.open(f"{s3_path}{size}.out", "rb", block_size=block_size, prefetch_storage=prefetch_storage) as f:
        end_open = perf_counter()
        end = read_chunks(f, read_size, read_len, fs, rep, size, block_size, output)

    write_benchmark(output, fs, rep, "total", size, end - start_open, block_size, read_size, read_len)
    write_benchmark(output, fs, rep, "open", size, end_open - start_open, block_size, read_size, read_len)
Example no. 7
def main():

    bfile = "../results/us-west-2-xlarge/readcmp-1-5f.out"

    reps = 5
    n_files = 5
    block_size = 64 * 2**20
    prefetch_storage = [("/dev/shm", 1024)]
    lazy = False

    types = ["mem", "mem_nib", "s3fs", "s3fs_nib", "prefetch", "prefetch_nib"]
    fs = S3FileSystem()

    header = ["vhs-bucket/hydi-header.trk"]

    all_paths = fs.glob("hydi-tractography/hydi*")
    all_mem_paths = [
        os.path.join("/dev/shm", os.path.basename(p)) for p in all_paths
    ]

    helpers.setup_bench(bfile)
    for _ in range(reps):

        for i in range(1, n_files + 1):
            paths = all_paths[0:i]
            mem_paths = all_mem_paths[0:i]

            random.shuffle(types)

            for t in types:

                helpers.drop_caches()

                if "mem" in t:
                    fs.get(paths, mem_paths)
                    helpers.drop_caches()

                    if t == "mem":
                        read_mem_bytes(mem_paths, lazy, bfile=bfile)

                    else:
                        read_mem_nib(mem_paths, lazy, bfile=bfile)

                    for p in mem_paths:
                        os.unlink(p)

                else:
                    if "s3fs_nib" in t:
                        read_s3fs_nib(paths, lazy, block_size, bfile=bfile)
                    elif "s3fs" in t:
                        read_s3fs_bytes(paths, lazy, block_size, bfile=bfile)
                    elif "prefetch_nib" in t:
                        read_prefetch_nib(
                            header + paths,
                            lazy,
                            block_size,
                            prefetch_storage,
                            bfile=bfile,
                        )
                    else:
                        read_prefetch_bytes(
                            header + paths,
                            lazy,
                            block_size,
                            prefetch_storage,
                            bfile=bfile,
                        )
Example no. 8
def main(
    file_type,
    prefetch_storage,
    block_size,
    n_files,
    reps,
    types,
    output_dir,
    nbins,
    dask,
    nworkers,
    instance,
    lazy,
):

    types = list(types)

    fs = S3FileSystem()
    if file_type == "orig":
        header = ["vhs-bucket/hydi-header.trk"]
        files = fs.glob("hydi-tractography/hydi_tracks.*.trk")[:n_files]
    else:
        header = ["vhs-bucket/shards/hydi_shard_header.trk"]
        files = fs.glob("vhs-bucket/shards/hydi_tracks.*.trk")[:n_files]

    results_path = op.join("../results/", instance)

    makedirs(results_path, exist_ok=True)

    bfile = op.join(
        results_path,
        f"histogram_{file_type}_{n_files}f_{reps}r_{block_size}b_{nbins}bins_{str(nworkers) + 'dask' if dask else 'seq'}_{'lazy' if lazy else 'nolazy'}.out",
    )

    helpers.setup_bench(bfile)

    if dask:
        cluster = LocalCluster(n_workers=nworkers)
        client = Client(cluster)

    for r in range(reps):
        # random.shuffle(types)
        for t in types:
            print(t)
            helpers.drop_caches()

            if dask:

                results = []

                if t == "s3fs":
                    print(t)

                    for i in range(nworkers):
                        f_per_w = n_files // nworkers
                        print(files[i * f_per_w:(i + 1) * f_per_w])
                        seg = client.submit(
                            histogram_s3fs,
                            files[i * f_per_w:(i + 1) * f_per_w],
                            lazy,
                            block_size,
                            nbins=nbins,
                            output_dir=output_dir,
                            bfile=bfile,
                        )
                        results.append(seg)
                else:
                    print(t)

                    for i in range(nworkers):
                        f_per_w = n_files // nworkers
                        print(files[i * f_per_w:(i + 1) * f_per_w])
                        seg = client.submit(
                            histogram_prefetch,
                            header + files[i * f_per_w:(i + 1) * f_per_w],
                            lazy,
                            block_size,
                            prefetch_storage,
                            nbins=nbins,
                            output_dir=output_dir,
                            bfile=bfile,
                        )
                        results.append(seg)

                print(client.gather(results))
            else:
                if t == "s3fs":
                    histogram_s3fs(
                        files,
                        lazy,
                        block_size,
                        nbins=nbins,
                        output_dir=output_dir,
                        bfile=bfile,
                    )
                else:
                    histogram_prefetch(
                        header + files,
                        lazy,
                        block_size,
                        prefetch_storage,
                        nbins=nbins,
                        output_dir=output_dir,
                        bfile=bfile,
                    )
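
The Dask path above partitions the file list evenly across workers, submits one task per worker, and gathers the results. Stripped of the benchmarking details, the pattern reduces to the following standalone illustration with a placeholder task; this is not the benchmark code itself.

from dask.distributed import Client, LocalCluster


def count_bytes(paths):
    # Placeholder task standing in for histogram_s3fs / histogram_prefetch.
    return sum(len(p) for p in paths)


if __name__ == "__main__":
    cluster = LocalCluster(n_workers=2)
    client = Client(cluster)

    files = [f"file_{i}.trk" for i in range(8)]
    n_workers = 2
    per_worker = len(files) // n_workers

    # One future per worker, each receiving an even slice of the file list.
    futures = [
        client.submit(count_bytes, files[i * per_worker:(i + 1) * per_worker])
        for i in range(n_workers)
    ]
    print(client.gather(futures))

    client.close()
    cluster.close()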
Example no. 9
def main(prefetch_storage, block_size, n_files, reps, types, nworkers):

    types = list(types)
    header = ["vhs-bucket/hydi-header.trk"]

    fs = S3FileSystem()
    files = fs.glob("hydi-tractography/hydi_tracks.*.trk")[:n_files]
    print(files)

    results_path = "../results/"

    bfile = op.join(
        results_path,
        f"real_{n_files}f_{reps}r_{block_size}b_{nworkers}w-recobundles.out",
    )

    helpers.setup_bench(bfile)

    cluster = LocalCluster(n_workers=nworkers, resources={"CPU": 3})

    client = Client(cluster)

    for r in range(reps):
        # random.shuffle(types)
        for t in types:
            print("***", t, "***")
            helpers.drop_caches()

            print(client)

            data = {}
            results = []

            if t == "s3fs":
                print(t)

                for i in range(nworkers):
                    f_per_w = n_files // nworkers
                    print(files[i * f_per_w:(i + 1) * f_per_w])
                    seg = client.submit(
                        segmentation_s3fs,
                        files[i * f_per_w:(i + 1) * f_per_w],
                        False,
                        block_size,
                        **data,
                        bfile=bfile,
                    )
                    results.append(seg)
            else:
                print(t)

                for i in range(nworkers):
                    f_per_w = n_files // nworkers
                    print(files[i * f_per_w:(i + 1) * f_per_w])
                    seg = client.submit(
                        segmentation_prefetch,
                        header + files[i * f_per_w:(i + 1) * f_per_w],
                        False,
                        block_size,
                        prefetch_storage,
                        **data,
                        bfile=bfile,
                    )
                    results.append(seg)

            print(client.gather(results))
            system("pkill -f joblib")