Example #1
def run_HPC():
        
    #################
    # Setup dask cluster
    #################
    
    config = utils.read_config()
    num_workers = config["num_hipergator_workers"]
    
    #job args
    extra_args=[
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]
    
    cluster = SLURMCluster(
        processes=2,
        queue='hpg2-compute',
        cores=3,
        memory='11GB',
        walltime='24:00:00',
        job_extra=extra_args,
        local_directory="/home/b.weinstein/logs/",
        death_timeout=150)
    
    print(cluster.job_script())
    cluster.adapt(minimum=num_workers, maximum=num_workers)
    
    dask_client = Client(cluster)
    
    #Start dask
    dask_client.run_on_scheduler(start_tunnel)  
    run(config, debug=False)
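
Several of these examples call a start_tunnel helper that is not shown in the snippet. A rough, hypothetical sketch of what such a function could look like, modelled on the start_jlab/ssh pattern in the later examples (the login-node address is a placeholder):

import socket


def start_tunnel(dask_scheduler=None):
    # Hypothetical helper: run_on_scheduler() injects the scheduler when the
    # function accepts a `dask_scheduler` keyword argument.
    host = socket.gethostname()
    # Print the ssh command needed to forward the Dask dashboard (port 8787)
    # from the scheduler node back to a local machine.
    print("ssh -N -L 8787:%s:8787 user@login.node.example.edu" % host)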
Example #2
def start_dask_cluster(number_of_workers, mem_size="10GB"):

    #################
    # Setup dask cluster
    #################

    #job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           cores=1,
                           memory=mem_size,
                           walltime='24:00:00',
                           job_extra=extra_args,
                           local_directory="/home/b.weinstein/logs/dask/",
                           death_timeout=300)

    print(cluster.job_script())
    cluster.adapt(minimum=number_of_workers, maximum=number_of_workers)

    dask_client = Client(cluster)

    #Start dask
    dask_client.run_on_scheduler(start_tunnel)

    return dask_client
Example #3
def cli(scheduler_file, jlab_port, dash_port, notebook_dir, hostname,
        log_level):
    logger = get_logger(log_level)

    logger.info('getting client with scheduler file: %s' % scheduler_file)
    client = Client(scheduler_file=scheduler_file, timeout=30)
    logger.debug('Client: %s' % client)

    logger.debug('Getting hostname where scheduler is running')
    host = client.run_on_scheduler(socket.gethostname)
    logger.info('host is %s' % host)

    logger.info('Starting jupyter lab on host')
    client.run_on_scheduler(start_jlab,
                            host=host,
                            port=jlab_port,
                            notebook_dir=notebook_dir)
    logger.debug('Done.')

    user = os.environ['USER']
    print('Run the following command from your local machine:')
    print('ssh -N -L {}:{}:{} -L {}:{}:8787 {}@{}'.format(
        jlab_port, host, jlab_port, dash_port, host, user, hostname))
    print('Then open the following URLs:')
    print('\tJupyter lab: http://localhost:{}'.format(jlab_port))
    print('\tDask dashboard: http://localhost:{}'.format(dash_port))
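
start_jlab is defined elsewhere in this project; a minimal sketch consistent with the keyword arguments used above (the jupyter lab flags are standard, but the implementation itself is an assumption):

def start_jlab(dask_scheduler=None, host=None, port="8888", notebook_dir=""):
    # Hypothetical sketch: launch JupyterLab on the scheduler node and keep a
    # handle to the process on the scheduler object so it stays alive.
    import subprocess
    proc = subprocess.Popen([
        "jupyter", "lab",
        "--ip", host,
        "--port", str(port),
        "--no-browser",
        "--notebook-dir", notebook_dir,
    ])
    dask_scheduler.jlab_proc = proc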
Example #4
def reload_modules_on_workers(url, modulelist=None):
    """Run reload(module) on the items in the modulelist"""
    client = Client(url)
    for mod in modulelist:
        print("reloading %s" % mod)
        client.run(importlib.reload, mod)
        client.run_on_scheduler(importlib.reload, mod)
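
A hedged usage sketch: modulelist holds imported module objects rather than names, and both the scheduler address and the module below are placeholders:

import my_analysis.utils  # hypothetical module, for illustration only

# Reload the module on every worker and on the scheduler.
reload_modules_on_workers("127.0.0.1:8786", modulelist=[my_analysis.utils])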
Example #5
def start(cpus=0, gpus=0, mem_size="10GB"):
    #################
    # Setup dask cluster
    #################

    if cpus > 0:
        #job args
        extra_args = [
            "--error=/orange/idtrees-collab/logs/dask-worker-%j.err",
            "--account=ewhite",
            "--output=/orange/idtrees-collab/logs/dask-worker-%j.out"
        ]

        cluster = SLURMCluster(
            processes=1,
            queue='hpg2-compute',
            cores=1,
            memory=mem_size,
            walltime='24:00:00',
            job_extra=extra_args,
            extra=['--resources cpu=1'],
            scheduler_options={"dashboard_address": ":8781"},
            local_directory="/orange/idtrees-collab/tmp/",
            death_timeout=300)

        print(cluster.job_script())
        cluster.scale(cpus)

    if gpus:
        #job args
        extra_args = [
            "--error=/orange/idtrees-collab/logs/dask-worker-%j.err",
            "--account=ewhite",
            "--output=/orange/idtrees-collab/logs/dask-worker-%j.out",
            "--partition=gpu", "--gpus=1"
        ]

        cluster = SLURMCluster(
            processes=1,
            cores=1,
            memory=mem_size,
            walltime='24:00:00',
            job_extra=extra_args,
            extra=['--resources gpu=1'],
            scheduler_options={"dashboard_address": ":8787"},
            local_directory="/orange/idtrees-collab/tmp/",
            death_timeout=300)

        cluster.scale(gpus)

    dask_client = Client(cluster)

    #Start dask
    dask_client.run_on_scheduler(start_tunnel)

    return dask_client
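
The --resources cpu=1 / --resources gpu=1 worker flags pair with the resources= keyword of Client.map/submit, so tasks only run on workers that advertise the matching resource. A small, hypothetical sketch of how the returned client might be used (predict_tile and the tile paths are placeholders):

# Assumes start() above was called with gpus > 0, so GPU workers exist.
client = start(gpus=2)


def predict_tile(path):
    # Placeholder for the real per-tile GPU work.
    return path


# Each task requests one "gpu" resource and therefore lands on a GPU worker.
futures = client.map(predict_tile, ["tile_1.tif", "tile_2.tif"], resources={"gpu": 1})
results = client.gather(futures)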
Example #6
def install_libraries_on_workers(url, runlist=None):
    """Install libraries if necessary on workers etc.
    e.g. if already on server...
    install_libraries_on_workers('127.0.0.1:8786')
    """

    client = Client(url)

    if runlist is None:
        runlist = [
            'sudo apt-get -y install build-essential', 'pip install -U pip',
            'sudo apt install libgl1-mesa-glx -y', 'conda update scipy -y',
            'pip install git+https://github.com/sods/paramz.git',
            'pip install git+https://github.com/SheffieldML/GPy.git',
            'pip install git+https://github.com/lionfish0/dp4gp.git',
            'conda install dask-searchcv -c conda-forge -y',
            'pip install git+https://github.com/lionfish0/dask_dp4gp.git',
            'pip install numpy', 'conda remove argcomplete -y',
            'pip install git+https://github.com/lionfish0/dialysis_analysis.git --upgrade'
        ]  #, 'conda install python=3.6 -y']

    for item in runlist:
        print("Installing '%s' on workers..." % item)
        res = client.run(os.system, item)
        print(res)
        print("Installing '%s' on scheduler..." % item)
        res = client.run_on_scheduler(os.system, item)
        print(res)
Example #7
def run_HPC(data_paths):

    #################
    # Setup dask cluster
    #################

    from dask_jobqueue import SLURMCluster
    from dask.distributed import Client, wait

    DeepForest_config = config.load_config()
    num_workers = DeepForest_config["num_hipergator_workers"]

    #job args
    extra_args = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out"
    ]

    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           cores=1,
                           memory='13GB',
                           walltime='24:00:00',
                           job_extra=extra_args,
                           local_directory="/home/b.weinstein/logs/",
                           death_timeout=300)

    print(cluster.job_script())
    cluster.adapt(minimum=num_workers, maximum=num_workers)

    dask_client = Client(cluster)

    #Start dask
    dask_client.run_on_scheduler(start_tunnel)

    for site in data_paths:
        futures = dask_client.map(Generate.run,
                                  data_paths[site],
                                  site=site,
                                  DeepForest_config=DeepForest_config)
        wait(futures)
        print("{} complete".format(site))

    print("All sites complete")
Example #8
def cli(scheduler_file, jlab_port, dash_port, notebook_dir,
        hostname, log_level):
    logger = get_logger(log_level)

    logger.info('getting client with scheduler file: %s' % scheduler_file)
    client = Client(scheduler_file=scheduler_file, timeout=30)
    logger.debug('Client: %s' % client)

    logger.debug('Getting hostname where scheduler is running')
    host = client.run_on_scheduler(socket.gethostname)
    logger.info('host is %s' % host)

    logger.info('Starting jupyter lab on host')
    client.run_on_scheduler(start_jlab, host=host, port=jlab_port,
                            notebook_dir=notebook_dir)
    logger.debug('Done.')

    user = os.environ['USER']
    print('Run the following command from your local machine:')
    #print('ssh -N -L {}:{}:{} -L {}:{}:8787 {}@{}'.format(jlab_port, host, jlab_port, dash_port, host, user, hostname))
    print('ssh -N -L {}:{}:{} -L {}:{}:8787 {}'.format(jlab_port, host, jlab_port, dash_port, host,  hostname)) #Modification for existing ssh key
    print('Then open the following URLs:')
    print('\tJupyter lab: http://localhost:{}'.format(jlab_port))
    print('\tDask dashboard: http://localhost:{}'.format(dash_port))
Example #9
    #print (Client(Scheduler_IP))
    c = Client(processes=False,
               threads_per_worker=4,
               n_workers=1,
               memory_limit='2GB')
    FramesBase = 4187

    with open('data3.txt', mode='w') as file:
        traj_size = [600]
        for k in traj_size:  # we have 3 trajectory sizes
            block_size = [144]
            for i in block_size:  # changing blocks
                for j in range(1, 40):  # changing files (5 files per block size)
                    c.run_on_scheduler(
                        submitCustomProfiler,
                        '/data/03170/tg824689/BecksteinLab/scripts-DCD/stragglers_test_%d_%d_%d.txt'
                        % (k, i, j))
                    # Provide the path to my file to all processes
                    total = com_parallel_dask_distributed(FramesBase * k, i)
                    total = delayed(total)
                    start = time.time()
                    output = total.compute(scheduler=c.get)
                    total.visualize(filename='transpose.svg')
                    tot_time = time.time() - start
                    c.run_on_scheduler(removeCustomProfiler)
                    file.write("DCD{} {} {} {} {} {} {} {}\n".format(
                        k, i, j, output[1], output[2], output[3], output[4],
                        tot_time))
                    file.flush()
Example #10
def main(args):
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    if args.sched_addr:
        client = Client(args.sched_addr)
    else:
        filterwarnings("ignore",
                       message=".*NVLink.*rmm_pool_size.*",
                       category=UserWarning)

        cluster = Cluster(*cluster_args, **cluster_kwargs)
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)

        client = Client(scheduler_addr if args.multi_node else cluster)

    if args.type == "gpu":
        client.run(
            setup_memory_pool,
            pool_size=args.rmm_pool_size,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )
        # Create an RMM pool on the scheduler due to occasional deserialization
        # of CUDA objects. May cause issues with InfiniBand otherwise.
        client.run_on_scheduler(
            setup_memory_pool,
            pool_size=1e9,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)
    client.wait_for_workers(n_workers)

    # Allow the number of chunks to vary between
    # the "base" and "other" DataFrames
    args.base_chunks = args.base_chunks or n_workers
    args.other_chunks = args.other_chunks or n_workers

    if args.all_to_all:
        all_to_all(client)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, n_workers, write_profile=None))
    took_list.append(
        run(client, args, n_workers,
            write_profile=args.profile))  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(
        lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    bandwidths = {
        (scheduler_workers[w1].name, scheduler_workers[w2].name): [
            "%s/s" % format_bytes(x) for x in numpy.quantile(v, [0.25, 0.50, 0.75])
        ]
        for (w1, w2), v in bandwidths.items()
    }
    total_nbytes = {
        (scheduler_workers[w1].name, scheduler_workers[w2].name): format_bytes(sum(nb))
        for (w1, w2), nb in total_nbytes.items()
    }

    broadcast = (False if args.shuffle_join else
                 (True if args.broadcast_join else "default"))

    t_runs = numpy.empty(len(took_list))
    if args.markdown:
        print("```")
    print("Merge benchmark")
    print("-------------------------------")
    print(f"backend        | {args.backend}")
    print(f"merge type     | {args.type}")
    print(f"rows-per-chunk | {args.chunk_size}")
    print(f"base-chunks    | {args.base_chunks}")
    print(f"other-chunks   | {args.other_chunks}")
    print(f"broadcast      | {broadcast}")
    print(f"protocol       | {args.protocol}")
    print(f"device(s)      | {args.devs}")
    print(f"rmm-pool       | {(not args.disable_rmm_pool)}")
    print(f"frac-match     | {args.frac_match}")
    if args.protocol == "ucx":
        print(f"tcp            | {args.enable_tcp_over_ucx}")
        print(f"ib             | {args.enable_infiniband}")
        print(f"nvlink         | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock     | Throughput")
    print("-------------------------------")
    for idx, (data_processed, took) in enumerate(took_list):
        throughput = int(data_processed / took)
        m = format_time(took)
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
        t_runs[idx] = float(format_bytes(throughput).split(" ")[0])
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.plot is not None:
        plot_benchmark(t_runs, args.plot, historical=True)

    if args.backend == "dask":
        if args.markdown:
            print(
                "<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```"
            )
        print("(w1,w2)     | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            fmt = ("(%s,%s)     | %s %s %s (%s)" if args.multi_node
                   or args.sched_addr else "(%02d,%02d)     | %s %s %s (%s)")
            print(fmt % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.multi_node:
        client.shutdown()
        client.close()
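
get_scheduler_workers and setup_memory_pool come from the surrounding benchmark module and are not shown. Judging only by how its return value is indexed above, the former is probably close to this sketch (an assumption, not the actual implementation):

def get_scheduler_workers(dask_scheduler=None):
    # run_on_scheduler injects the scheduler; its .workers mapping goes from
    # worker address to a WorkerState whose .name is used in the report above.
    return dask_scheduler.workers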
Example #11
def main(args):
    # Set up workers on the local machine
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
        )
    else:
        enable_infiniband = args.enable_infiniband
        enable_nvlink = args.enable_nvlink
        enable_tcp_over_ucx = args.enable_tcp_over_ucx
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            ucx_net_devices="auto",
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_infiniband=enable_infiniband,
            enable_nvlink=enable_nvlink,
        )
        initialize(
            create_cuda_context=True,
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_infiniband=enable_infiniband,
            enable_nvlink=enable_nvlink,
        )
    client = Client(cluster)

    def _worker_setup(initial_pool_size=None):
        import rmm

        rmm.reinitialize(
            pool_allocator=not args.no_rmm_pool,
            devices=0,
            initial_pool_size=initial_pool_size,
        )
        cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)

    client.run(_worker_setup)
    # Create an RMM pool on the scheduler due to occasional deserialization
    # of CUDA objects. May cause issues with InfiniBand otherwise.
    client.run_on_scheduler(_worker_setup, 1e9)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, write_profile=None))
    took_list.append(
        run(client, args, write_profile=args.profile)
    )  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    bandwidths = {
        (cluster.scheduler.workers[w1].name, cluster.scheduler.workers[w2].name): [
            "%s/s" % format_bytes(x) for x in numpy.quantile(v, [0.25, 0.50, 0.75])
        ]
        for (w1, w2), v in bandwidths.items()
    }
    total_nbytes = {
        (
            cluster.scheduler.workers[w1].name,
            cluster.scheduler.workers[w2].name,
        ): format_bytes(sum(nb))
        for (w1, w2), nb in total_nbytes.items()
    }

    if args.markdown:
        print("```")
    print("Merge benchmark")
    print("-------------------------------")
    print(f"backend        | {args.backend}")
    print(f"rows-per-chunk | {args.chunk_size}")
    print(f"protocol       | {args.protocol}")
    print(f"device(s)      | {args.devs}")
    print(f"rmm-pool       | {(not args.no_rmm_pool)}")
    print(f"frac-match     | {args.frac_match}")
    if args.protocol == "ucx":
        print(f"tcp            | {args.enable_tcp_over_ucx}")
        print(f"ib             | {args.enable_infiniband}")
        print(f"nvlink         | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock     | Throughput")
    print("-------------------------------")
    for data_processed, took in took_list:
        throughput = int(data_processed / took)
        m = format_time(took)
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.backend == "dask":
        if args.markdown:
            print("<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```")
        print("(w1,w2)     | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            print(
                "(%02d,%02d)     | %s %s %s (%s)"
                % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)])
            )
        if args.markdown:
            print("```\n</details>\n")
Example #12
with open('data.txt', mode='a') as file:
    traj_size = [600]
    for k in traj_size: # we have 3 trajectory sizes
        # Creating the universe for doing benchmark
        u1 = mda.Universe(PSF, DCD1)
    
        longXTC = os.path.abspath(os.path.normpath(os.path.join(os.getcwd(),'files/newtraj.xtc')))

        # Doing benchmarks
        ii = 1
        block_size = [1, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72]
        for i in block_size:      # changing blocks
            for j in range(1,6):    # changing files (5 files per block size)
                # Create a new file
                c.run_on_scheduler(submitCustomProfiler,os.path.abspath(os.path.normpath(os.path.join(os.getcwd(),'files/XTC_{}_{}_{}.txt'.format(k,i,j)))))
                longXTC1 = os.path.abspath(os.path.normpath(os.path.join(os.getcwd(),'files/newtraj{}.xtc'.format(ii))))
                copyfile(longXTC, longXTC1)
                # Provide the path to my file to all processes
                my_path = os.path.normpath(os.path.join(os.getcwd(), longXTC1))
                longXTC1 = os.path.abspath(my_path)
                # Define a new universe with the new trajectory
                u = mda.Universe(PSF, longXTC1)
                print(u)
                print("frames in trajectory ", u.trajectory.n_frames)
                print (len(u.trajectory))
                mobile = u.select_atoms("(resid 1:29 or resid 60:121 or resid 160:214) and name CA")
                index = mobile.indices
                total = com_parallel_dask_distributed(mobile, index, i)
                total = delayed(total)
                start = time.time()
Example #13
if __name__ == '__main__':
    Scheduler_IP = sys.argv[1]
    #SLURM_JOBID = sys.argv[2]
    print(Scheduler_IP)
    #print (Client(Scheduler_IP))
    c = Client(Scheduler_IP)

    with open('data3.txt', mode='w') as file:
        traj_size = [600]
        for k in traj_size:  # we have 3 trajectory sizes
            block_size = [int(sys.argv[2])]
            for i in block_size:  # changing blocks
                for j in range(10):  # changing files (5 files per block size)
                    c.run_on_scheduler(
                        submitCustomProfiler, sys.argv[3] +
                        '/stragglers_test_%d_%d_%d.txt' % (k, i, j))
                    # Provide the path to my file to all processes
                    total = com_parallel_dask_distributed(104675 * i, i)
                    total = delayed(total)
                    start = time.time()
                    output = total.compute(scheduler=c.get)
                    tot_time = time.time() - start
                    c.run_on_scheduler(removeCustomProfiler)
                    file.write(
                        'size,blocks,iter,t_comp_avg,t_comp_max,t_all_frame_avg,t_all_frame_max,tot_time\n'
                    )
                    file.write("{0},{1},{2},{3},{4},{5},{6},{7}\n".format(
                        k, i, j, output[1], output[2], output[3], output[4],
                        tot_time))
                    file.flush()
Example #14
from datetime import datetime
from time import sleep

from dask_jobqueue import SLURMCluster

cluster = SLURMCluster(project='ewhite', death_timeout=100)
cluster.start_workers(1)

print(cluster.job_script())

from dask.distributed import Client
client = Client(cluster)

client

counter = 0
while counter < 10:
    print(datetime.now().strftime("%a, %d %B %Y %I:%M:%S"))
    print(client)
    sleep(20)
    counter += 1

import socket
host = client.run_on_scheduler(socket.gethostname)


def start_jlab(dask_scheduler):
    import subprocess
    proc = subprocess.Popen(['jupyter', 'lab', '--ip', host, '--no-browser'])
    dask_scheduler.jlab_proc = proc


client.run_on_scheduler(start_jlab)

print("ssh -N -L 8787:%s:8787 -L 8888:%s:8888 -l b.weinstein hpg2.rc.ufl.edu" %
      (host, host))
Example #15
from dask_mpi import initialize

initialize()

import socket

from distributed.scheduler import logger

import dask.array as da
from dask.distributed import Client

client = Client()  # Connect this local process to remote workers

host = client.run_on_scheduler(socket.gethostname)
port = client.scheduler_info()['services']['dashboard']
login_node_address = (
    'supercomputer.university.edu'  # Change this to the address/domain of your login node
)

logger.info(f'ssh -N -L {port}:{host}:{port} {login_node_address}')

logger.info('HELLO' * 10)
print('WORLD' * 10)

x = da.random.random((200, 10_000, 5_000), chunks=(20, 1_000, 1_000))
y = x.std(axis=0)
y = y.compute()
print(y)
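
A script like this is normally launched under an MPI launcher rather than with python alone (for example mpirun -np 4 python script.py, or the scheduler's srun equivalent); dask_mpi.initialize() then turns rank 0 into the scheduler, runs this client code on one rank, and makes the remaining ranks workers.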
Example #16
from dask.distributed import Client
import socket

client = Client(scheduler_file='scheduler.json')
print(client)

host = client.run_on_scheduler(socket.gethostname)


def start_jlab(dask_scheduler):
    import subprocess
    proc = subprocess.Popen(['jupyter', 'notebook', '--ip', host])
    dask_scheduler.jlab_proc = proc


client.run_on_scheduler(start_jlab)

print("HOST : %s" % host)
Example #17
def main(args):
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    cluster = Cluster(*cluster_args, **cluster_kwargs)
    if args.multi_node:
        import time

        # Allow some time for workers to start and connect to scheduler
        # TODO: make this a command-line argument?
        time.sleep(15)

    client = Client(scheduler_addr if args.multi_node else cluster)

    client.run(setup_memory_pool, disable_pool=args.no_rmm_pool)
    # Create an RMM pool on the scheduler due to occasional deserialization
    # of CUDA objects. May cause issues with InfiniBand otherwise.
    client.run_on_scheduler(setup_memory_pool, 1e9, disable_pool=args.no_rmm_pool)

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, n_workers, write_profile=None))
    took_list.append(
        run(client, args, n_workers, write_profile=args.profile)
    )  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    bandwidths = {
        (scheduler_workers[w1].name, scheduler_workers[w2].name): [
            "%s/s" % format_bytes(x) for x in numpy.quantile(v, [0.25, 0.50, 0.75])
        ]
        for (w1, w2), v in bandwidths.items()
    }
    total_nbytes = {
        (scheduler_workers[w1].name, scheduler_workers[w2].name,): format_bytes(sum(nb))
        for (w1, w2), nb in total_nbytes.items()
    }

    if args.markdown:
        print("```")
    print("Merge benchmark")
    print("-------------------------------")
    print(f"backend        | {args.backend}")
    print(f"rows-per-chunk | {args.chunk_size}")
    print(f"protocol       | {args.protocol}")
    print(f"device(s)      | {args.devs}")
    print(f"rmm-pool       | {(not args.no_rmm_pool)}")
    print(f"frac-match     | {args.frac_match}")
    if args.protocol == "ucx":
        print(f"tcp            | {args.enable_tcp_over_ucx}")
        print(f"ib             | {args.enable_infiniband}")
        print(f"nvlink         | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock     | Throughput")
    print("-------------------------------")
    for data_processed, took in took_list:
        throughput = int(data_processed / took)
        m = format_time(took)
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.backend == "dask":
        if args.markdown:
            print("<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```")
        print("(w1,w2)     | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            fmt = (
                "(%s,%s)     | %s %s %s (%s)"
                if args.multi_node
                else "(%02d,%02d)     | %s %s %s (%s)"
            )
            print(fmt % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.multi_node:
        client.shutdown()
        client.close()
Example #18
def main(args):
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    if args.sched_addr:
        client = Client(args.sched_addr)
    else:
        filterwarnings("ignore",
                       message=".*NVLink.*rmm_pool_size.*",
                       category=UserWarning)

        cluster = Cluster(*cluster_args, **cluster_kwargs)
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)

        client = Client(scheduler_addr if args.multi_node else cluster)

    if args.type == "gpu":
        client.run(
            setup_memory_pool,
            pool_size=args.rmm_pool_size,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )
        # Create an RMM pool on the scheduler due to occasional deserialization
        # of CUDA objects. May cause issues with InfiniBand otherwise.
        client.run_on_scheduler(
            setup_memory_pool,
            pool_size=1e9,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)
    client.wait_for_workers(n_workers)

    if args.all_to_all:
        all_to_all(client)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, n_workers, write_profile=None))
    took_list.append(
        run(client, args, n_workers,
            write_profile=args.profile))  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(
        lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    bandwidths = {
        (scheduler_workers[w1].name, scheduler_workers[w2].name): [
            "%s/s" % format_bytes(x) for x in numpy.quantile(v, [0.25, 0.50, 0.75])
        ]
        for (w1, w2), v in bandwidths.items()
    }
    total_nbytes = {
        (scheduler_workers[w1].name, scheduler_workers[w2].name): format_bytes(sum(nb))
        for (w1, w2), nb in total_nbytes.items()
    }

    t_runs = numpy.empty(len(took_list))
    if args.markdown:
        print("```")
    print("Shuffle benchmark")
    print("-------------------------------")
    print(f"backend        | {args.backend}")
    print(f"partition-size | {format_bytes(args.partition_size)}")
    print(f"in-parts       | {args.in_parts}")
    print(f"protocol       | {args.protocol}")
    print(f"device(s)      | {args.devs}")
    if args.device_memory_limit:
        print(f"memory-limit   | {format_bytes(args.device_memory_limit)}")
    print(f"rmm-pool       | {(not args.disable_rmm_pool)}")
    if args.protocol == "ucx":
        print(f"tcp            | {args.enable_tcp_over_ucx}")
        print(f"ib             | {args.enable_infiniband}")
        print(f"nvlink         | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock     | Throughput")
    print("-------------------------------")
    for idx, (data_processed, took) in enumerate(took_list):
        throughput = int(data_processed / took)
        m = format_time(took)
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
        t_runs[idx] = float(format_bytes(throughput).split(" ")[0])
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.plot is not None:
        plot_benchmark(t_runs, args.plot, historical=True)

    if args.backend == "dask":
        if args.markdown:
            print(
                "<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```"
            )
        print("(w1,w2)        | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            fmt = ("(%s,%s)        | %s %s %s (%s)" if args.multi_node or
                   args.sched_addr else "(%02d,%02d)        | %s %s %s (%s)")
            print(fmt % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.benchmark_json:
        bandwidths_json = {
            f"bandwidth_({d1},{d2})_{i}" if args.multi_node or args.sched_addr
            else "(%02d,%02d)_%s" % (d1, d2, i): parse_bytes(v.rstrip("/s"))
            for (d1, d2), bw in sorted(bandwidths.items()) for i, v in zip(
                ["25%", "50%", "75%", "total_nbytes"],
                [bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]],
            )
        }

        with open(args.benchmark_json, "a") as fp:
            for data_processed, took in took_list:
                fp.write(
                    dumps(
                        dict(
                            {
                                "backend": args.backend,
                                "partition_size": args.partition_size,
                                "in_parts": args.in_parts,
                                "protocol": args.protocol,
                                "devs": args.devs,
                                "device_memory_limit":
                                args.device_memory_limit,
                                "rmm_pool": not args.disable_rmm_pool,
                                "tcp": args.enable_tcp_over_ucx,
                                "ib": args.enable_infiniband,
                                "nvlink": args.enable_nvlink,
                                "data_processed": data_processed,
                                "wall_clock": took,
                                "throughput": data_processed / took,
                            },
                            **bandwidths_json,
                        )) + "\n")

    if args.multi_node:
        client.shutdown()
        client.close()
Example #19
class Client(elfi.client.ClientBase):
    """A multiprocessing client using dask."""
    def __init__(self):
        """Initialize a dask client."""
        self.dask_client = DaskClient()
        self.tasks = {}
        self._id_counter = itertools.count()

    def apply(self, kallable, *args, **kwargs):
        """Add `kallable(*args, **kwargs)` to the queue of tasks. Returns immediately.

        Parameters
        ----------
        kallable: callable

        Returns
        -------
        task_id: int

        """
        task_id = self._id_counter.__next__()
        async_result = self.dask_client.submit(kallable, *args, **kwargs)
        self.tasks[task_id] = async_result
        return task_id

    def apply_sync(self, kallable, *args, **kwargs):
        """Call and returns the result of `kallable(*args, **kwargs)`.

        Parameters
        ----------
        kallable: callable

        """
        return self.dask_client.run_on_scheduler(kallable, *args, **kwargs)

    def get_result(self, task_id):
        """Return the result from task identified by `task_id` when it arrives.

        Parameters
        ----------
        task_id: int

        Returns
        -------
        dict

        """
        async_result = self.tasks.pop(task_id)
        return async_result.result()

    def is_ready(self, task_id):
        """Return whether task with identifier `task_id` is ready.

        Parameters
        ----------
        task_id: int

        Returns
        -------
        bool

        """
        return self.tasks[task_id].done()

    def remove_task(self, task_id):
        """Remove task with identifier `task_id` from scheduler.

        Parameters
        ----------
        task_id: int

        """
        async_result = self.tasks.pop(task_id)
        if not async_result.done():
            async_result.cancel()

    def reset(self):
        """Stop all worker processes immediately and clear pending tasks."""
        self.dask_client.shutdown()
        self.tasks.clear()

    @property
    def num_cores(self):
        """Return the number of processes.

        Returns
        -------
        int

        """
        return os.cpu_count()
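
A hedged usage sketch of this wrapper, assuming elfi is installed and a default local Dask cluster can be started:

# Hypothetical usage of the wrapper defined above.
client = Client()

task_id = client.apply(pow, 2, 10)  # queue pow(2, 10) on a Dask worker
print(client.get_result(task_id))   # blocks until the result arrives -> 1024
print(client.num_cores)             # local CPU count, per the property above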