def run_launcher():
    """Bring up a Ray cluster through ncluster and start this script as driver.

    The first task of the job hosts the Ray head node; every remaining task
    joins it while advertising one ``worker`` resource. Settings are read
    from the module-level ``args`` namespace (``aws``, ``nightly``,
    ``macos``, ``name``, ``image``, ``instance``, ``num_workers``, ``xray``).
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    script = os.path.basename(__file__)

    # Ray comes either from PyPI or from a wheel in the ray-wheels S3 bucket
    # (separate wheels for macOS and Linux).
    if not args.nightly:
        install_script = 'pip install ray'
    elif args.macos:
        install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
    else:
        install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'

    # One extra task on top of the workers: it hosts the head node and driver.
    job = ncluster.make_job(
        name=args.name,
        install_script=install_script,
        image_name=args.image,
        instance_type=args.instance,
        num_tasks=args.num_workers + 1,
    )
    job.upload(script)
    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    # Custom resources:
    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    worker_resource = """--resources='{"worker": 1}'"""

    head, *workers = job.tasks
    head.run("ray start --head --redis-port=6379")
    for node in workers:
        node.run(f"ray start --redis-address={head.ip}:6379 {worker_resource}")

    head.run(f'./{script} --role=driver --ip={head.ip}:6379')
def run_launcher():
    """Run the sender/receiver benchmark on a two-task ncluster job.

    Uploads this script and ``util.py``, starts the receiver role in the
    background on the second task, runs the sender role on the first task
    and prints what the sender wrote to ``out``. Options are read from the
    module-level ``args`` namespace.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
    for path in (__file__, 'util.py'):
        job.upload(path)

    sender, receiver = job.tasks

    # When a tmux session is reused, python from a previous run may still be
    # alive on the remote machines; kill it, ignoring failures.
    if not ncluster.running_locally():
        for task in (sender, receiver):
            task._run_raw('killall python', ignore_errors=True)

    # On AWS we are probably inside the conda DLAMI: switch to the TF env.
    if ncluster.get_backend() == 'aws':
        job.run('source activate tensorflow_p36')

    ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'

    receiver_cmd = f'python {__file__} --role=receiver {ip_config}'
    receiver.run(receiver_cmd, non_blocking=True)

    sender_cmd = (
        f'python {__file__} --role=sender {ip_config} '
        f'--iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}'
    )
    sender.run(sender_cmd)
    print(sender.read('out'))
def run_launcher():
    """Start a two-task Ray cluster (parameter server + worker) and run the driver.

    Installs Ray on both tasks, clears leftover processes, starts the Ray
    head on the ``ps`` task and joins the ``worker`` task to it, then runs
    this script with ``--role=driver`` on the worker and prints its ``out``.

    All options come from the module-level ``args`` namespace; every parsed
    option is also forwarded to ``ncluster.make_job`` as a keyword argument.
    """
    import ncluster

    if args.nightly:
        install_script = 'pip install --no-cache-dir -U ray --find-links ' \
                         'https://s3-us-west-2.amazonaws.com/ray-wheels/latest/'
    else:
        install_script = 'pip install -U ray'

    if args.local:
        ncluster.set_backend('local')

    job = ncluster.make_job(**vars(args))
    job.run(install_script)

    ps, worker = job.tasks
    if not ncluster.running_locally():
        # Best-effort cleanup of python processes from a previous run; the
        # `|| echo` keeps the command from failing when nothing is running.
        ps.run('killall python || echo no python found')
        # killall needs a process name: without it the command only prints
        # usage and the stale python on the worker is never killed.
        worker.run('killall python || echo no python found')
    job.run('ray stop')

    job.upload(__file__)
    job.upload('util.py')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""
    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
    worker.run(f'python {__file__} --role=driver --ip={ps.ip}:6379 '
               f'--hidden_size={args.hidden_size} --num_layers={args.num_layers} '
               f'--iters={args.iters}')
    print(worker.read('out'))
def launcher():
    """Launch the two-rank worker benchmark on a two-task ncluster job.

    Both tasks run this script with ``--role=worker`` in the background
    (rank 0 on the first task, rank 1 on the second); the launcher then
    waits on the first task and prints its ``out``. Options are read from
    the module-level ``args`` namespace.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
    for path in (__file__, 'util.py'):
        job.upload(path)

    # Select the conda environment: DLAMI PyTorch env on AWS, otherwise
    # leave the current env and enter the local test env.
    if args.aws:
        env_setup = ['source activate pytorch_p36']
    else:
        env_setup = ['source deactivate', 'source activate ncluster-test3']
    for command in env_setup:
        job.run(command)

    script_name = os.path.basename(__file__)
    common_args = f'--size=2 --master-addr={job.tasks[0].ip} --iters={args.iters} --size-mb={args.size_mb}'

    for rank in (0, 1):
        job.tasks[rank].run(
            f'python {script_name} --role=worker --rank={rank} ' + common_args,
            non_blocking=True)

    job.tasks[0].join()
    print(job.tasks[0].read('out'))
def run_launcher():
    """Run sender and receiver roles side by side on a single ncluster task.

    The receiver is started in the background in the first tmux window, the
    sender runs in window 1, and the sender's ``out`` is printed. Both roles
    use the task's own IP. Options are read from the module-level ``args``.
    """
    import ncluster

    ncluster.util.assert_script_in_current_directory()

    if args.aws:
        ncluster.set_backend('aws')

    # t3.medium: a 4GB instance is used because 0.5GB was not enough.
    worker = ncluster.make_task(args.name,
                                image_name=args.image,
                                instance_type='t3.medium')
    for path in (__file__, 'util.py'):
        worker.upload(path)

    if not ncluster.running_locally():
        # Kill python left over from a reused tmux session, then switch into
        # the TF-enabled conda env (on AWS this is probably the conda DLAMI).
        worker._run_raw('killall python', ignore_errors=True)
        worker.run('source activate tensorflow_p36')

    ip_config = f'--sender-ip={worker.ip} --receiver-ip={worker.ip}'

    receiver_cmd = f'python {__file__} --role=receiver {ip_config}'
    worker.run(receiver_cmd, non_blocking=True)

    # The sender gets its own tmux window, which needs the env activated again.
    worker.switch_window(1)
    if not ncluster.running_locally():
        worker.run('source activate tensorflow_p36')

    sender_cmd = (
        f'python {__file__} --role=sender {ip_config} '
        f'--iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}'
    )
    worker.run(sender_cmd)
    print(worker.read('out'))
def run_launcher():
    """Start a ps/worker Ray pair on two c5.large tasks and run the driver.

    The first task hosts the Ray head with a ``ps`` resource; the second
    joins with a ``worker`` resource and then executes this script with
    ``--role=driver``. Options are read from the module-level ``args``.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    script = os.path.basename(__file__)
    # The script is uploaded by bare name, so it must be in the working dir.
    assert script in os.listdir('.')

    job = ncluster.make_job(
        install_script='pip install ray',
        image_name=args.image,
        instance_type='c5.large',
        num_tasks=2,
    )
    job.upload(script)
    for command in ('export RAY_USE_XRAY=1', 'ray stop'):
        job.run(command)

    # Custom resources:
    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""

    ps, worker = job.tasks
    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")

    driver_cmd = (
        f'./{script} --role=driver --ip={ps.ip}:6379 '
        f'--size-mb={args.size_mb} --iters={args.iters}'
    )
    worker.run(driver_cmd)
def run_launcher():
    """Run the Ray benchmark on a single ncluster task.

    One Ray head node is started advertising both the ``ps`` and ``worker``
    resources, then this script runs on it with ``--role=driver`` and its
    ``out`` is printed. Options are read from the module-level ``args``.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    if not args.nightly:
        install_script = 'pip install ray'
    elif 'Darwin' in util.ossystem('uname') and not args.aws:
        # running locally on MacOS
        install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
    else:
        install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'

    worker = ncluster.make_task(name=args.name,
                                install_script=install_script,
                                image_name=args.image)
    if not ncluster.running_locally():
        worker._run_raw('killall python', ignore_errors=True)

    for path in (__file__, 'util.py'):
        worker.upload(path)

    if args.xray:
        worker.run('export RAY_USE_XRAY=1')
    worker.run('ray stop')

    # The lone head node carries both resources, so no separate
    # `ray start --redis-address=...` is issued for a worker node.
    resources = """--resources='{"ps": 1, "worker": 1}'"""
    worker.run(f"ray start --head {resources} --redis-port=6379")

    driver_cmd = (
        f'./{__file__} --role=driver --ip={worker.ip}:6379 '
        f'--size-mb={args.size_mb} --iters={args.iters}'
    )
    worker.run(driver_cmd)
    print(worker.read('out'))
def main():
    """Time how long ``sleep 1`` takes on a two-task job using the local backend."""
    ncluster.set_backend('local')
    job = ncluster.make_job(num_tasks=2)

    # The clock starts after job creation, so only the command is measured.
    began = time.time()
    job.run('sleep 1')
    elapsed = time.time() - began
    print(f"waited for {elapsed} seconds")
def main():
    """Time creation of a 16-task AWS job and a ``sleep 10`` run on it."""
    ncluster.set_backend('aws')

    # Phase 1: job creation.
    began = time.time()
    job = ncluster.make_job(num_tasks=16)
    startup_elapsed = time.time() - began
    print(f"waited for startup for {startup_elapsed} seconds")

    # Phase 2: running one command across the job.
    began = time.time()
    job.run('sleep 10')
    exec_elapsed = time.time() - began
    print(f"waited for exec for {exec_elapsed} seconds")
def run_launcher():
    """Launch the ``tf_adder`` sender/receiver pair on a two-task job.

    The receiver role is started in the background on the second task and
    the sender role then runs on the first. Options are read from the
    module-level ``args`` namespace.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job('tf_adder', num_tasks=2, image_name=args.image)
    job.upload(__file__)

    sender, receiver = job.tasks

    # On AWS we are probably inside the conda DLAMI: switch to the TF env.
    if ncluster.get_backend() == 'aws':
        job.run('source activate tensorflow_p36')

    ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'

    receiver_cmd = f'python tf_adder.py --role=receiver {ip_config}'
    receiver.run(receiver_cmd, non_blocking=True)

    sender_cmd = f'python tf_adder.py --role=sender {ip_config} --iters={args.iters}'
    sender.run(sender_cmd)
def run_launcher():
    """Start a Ray cluster sized ``num_workers + num_ps`` and run the driver.

    The first task becomes the Ray head; the remaining tasks join it. Every
    node, head included, advertises one ``worker`` resource. The driver role
    of this script then runs on the head and its ``out`` is printed.
    Options are read from the module-level ``args`` namespace.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    if not args.nightly:
        install_script = 'pip install ray'
    elif 'Darwin' in util.ossystem('uname') and not args.aws:
        # running locally on MacOS
        install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
    else:
        install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'

    job = ncluster.make_job(
        name=args.name,
        install_script=install_script,
        image_name=args.image,
        num_tasks=args.num_workers + args.num_ps,
    )
    if not ncluster.running_locally():
        job._run_raw('killall python', ignore_errors=True)

    for path in (__file__, 'util.py'):
        job.upload(path)

    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    head = job.tasks[0]

    # Custom resources:
    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    worker_resource = """--resources='{"worker": 1}'"""
    head.run(f"ray start --head {worker_resource} --redis-port=6379")
    for node in job.tasks[1:]:
        node.run(f"ray start --redis-address={head.ip}:6379 {worker_resource}")

    driver_cmd = (
        f'python {__file__} --role=driver --ip={head.ip}:6379 '
        f'--size-mb={args.size_mb} --iters={args.iters} '
        f'--num-workers={args.num_workers} --num-ps={args.num_ps}'
    )
    head.run(driver_cmd)
    print(head.read('out'))
def run_launcher():
    """Create a two-task job and invoke ``mpirun`` across it from this machine.

    ``mpirun`` is executed through ``os.system`` on the launching host,
    targeting the public IPs of the job's tasks; afterwards ``/tmp/out`` is
    read from the first task and printed. Options are read from the
    module-level ``args`` namespace.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
    for path in (__file__, 'util.py'):
        job.upload(path)

    # When a tmux session is reused, python from a previous run may linger.
    if not ncluster.running_locally():
        job._run_raw('killall python', ignore_errors=True)

    # On AWS we are probably inside the conda DLAMI: switch to the TF env.
    if ncluster.get_backend() == 'aws':
        job.run('source activate tensorflow_p36')

    host_str = ','.join([task.public_ip for task in job.tasks])
    mpi_cmd = f'mpirun -np 2 --host {host_str} python {__file__} --role=worker'
    os.system(mpi_cmd)

    print(job.tasks[0].read('/tmp/out'))
def run_launcher():
    """Run the ``tf_adder_tb`` benchmark and then serve TensorBoard.

    The receiver role starts in the background on the second task, the
    sender role runs on the first task logging to ``job.logdir``, and
    TensorBoard is then started in the background on the first task over
    the parent of that directory. Options are read from the module-level
    ``args`` namespace.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job('tf_adder_tb', num_tasks=2, image_name=args.image)
    job.upload(__file__)
    this_file = os.path.basename(__file__)

    sender, receiver = job.tasks

    # On AWS we are probably running in the DLAMI: switch to the TF env.
    if ncluster.get_backend() == 'aws':
        job.run('source activate tensorflow_p36')

    ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'

    receiver_cmd = f'python {this_file} --role=receiver {ip_config}'
    job.tasks[1].run(receiver_cmd, non_blocking=True)

    sender_cmd = f'python {this_file} --role=sender --logdir={job.logdir} {ip_config}'
    job.tasks[0].run(sender_cmd)

    tensorboard_cmd = f'tensorboard --logdir={job.logdir}/..'
    job.tasks[0].run(tensorboard_cmd, non_blocking=True)

    tensorboard_url = f"http://{job.tasks[0].public_ip}:6006"
    print(f"Benchmark done, tensorboard at {tensorboard_url}")
#!/usr/bin/env python import argparse import ncluster import os import time from ncluster import ncluster_globals # setting parameters INSTANCE_TYPE = 'ecs.gn6v-c10g1.20xlarge' NUM_GPUS = 8 ncluster.set_backend('aliyun') parser = argparse.ArgumentParser() parser.add_argument( '--name', type=str, default='fastgpu-perseus-bert', help= "name of the current run, used for machine naming and tensorboard visualization" ) parser.add_argument('--machines', type=int, default=1, help="how many machines to use") args = parser.parse_args() def main(): start_time = time.time() # 1. Create infrastructure
#!/usr/bin/env python import argparse import ncluster import os IMAGE_NAME = 'pytorch.imagenet.source.v7' INSTANCE_TYPE = 'p3.16xlarge' NUM_GPUS = 8 ncluster.set_backend('aws') parser = argparse.ArgumentParser() parser.add_argument( '--name', type=str, default='imagenet', help= "name of the current run, used for machine naming and tensorboard visualization" ) parser.add_argument('--machines', type=int, default=16, help="how many machines to use") args = parser.parse_args() # 109:12 to 93.00 # events: https://s3.amazonaws.com/yaroslavvb/logs/imagenet-1 # logs: https://s3.amazonaws.com/yaroslavvb/logs/imagenet1.tar lr = 1.0 scale_224 = 224 / 512 scale_288 = 128 / 512
def main():
    """Launch distributed training on AWS through ncluster.

    Steps, in order:
      1. Resolve the run configuration, either from a named module-level
         config (``--config``) or from ``--instance_type`` / ``--machines``.
      2. Create the job, rsync the working directory to it and install
         ``requirements.txt`` inside the chosen conda env.
      3. Scale the learning rate linearly with the global batch size.
      4. Assemble the worker parameters and start
         ``torch.distributed.launch`` on every task in the background.

    Reads the module-level ``args`` namespace and the module-level constants
    ``IMAGE_NAME``, ``CONDA_ENV``, ``BASE_LR_BATCHSIZE``, ``LARGE_ARGS`` and
    ``SMALL_ARGS``.

    Raises:
        AssertionError: if the config and machine flags are combined
            inconsistently, or the named config does not exist.
    """
    ncluster.set_backend('aws')

    if args.config:
        assert not args.instance_type, "specify instance_type as part of config"
        assert not args.machines, "specify number of machines as part of config"
        # The whole name must be a plain identifier; an unanchored match
        # would accept anything that merely starts with a word character.
        assert re.fullmatch(r'\w+', args.config), \
            f'invalid config name {args.config}'
        assert args.config in globals(), f'no config called {args.config}'
        # Look the config up by name instead of eval()-ing a CLI string.
        config = globals()[args.config]
    else:  # setting config vars through command-line flags
        assert args.instance_type
        assert args.machines
        config = {'base_lr': 0.000125 * 5 / 3,
                  'local_batch_size': 96,
                  'instance_type': args.instance_type,
                  'machines': args.machines}

    # easier access to dictionary entries
    # NOTE(review): optional keys read below (large, warmup_tokens,
    # checkpoint, extra_worker_params) presumably fall back to a falsy
    # default from the `str` factory when absent -- confirm in AttrDefault.
    config = AttrDefault(str, config)
    config.image_name = IMAGE_NAME
    config.conda_env = CONDA_ENV

    # Command-line overrides for the environment and image.
    if args.conda_env:
        config.conda_env = args.conda_env
        print("Using non-standard conda env ", config.conda_env)
    if args.image_name:
        config.image_name = args.image_name
        print("Using non-standard image ", config.image_name)

    instance_info = ncluster.aws_backend.INSTANCE_INFO[config.instance_type]
    num_gpus_per_machine = instance_info['gpus']

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}",
                            num_tasks=config.machines,
                            image_name=config.image_name,
                            instance_type=config.instance_type,
                            spot=not args.nospot,
                            skip_setup=args.skip_setup)

    job.rsync('.')
    job.run('killall python || echo failed && '  # kill previous run
            f'source activate {config.conda_env} && '
            'pip install -r requirements.txt')

    local_batch_size = config.local_batch_size
    base_lr = config.base_lr

    num_workers = num_gpus_per_machine * config.machines
    global_batch_size = local_batch_size * num_workers
    print("using global batch ", global_batch_size)  # 512=8*32*2*1

    # linear LR scaling (https://arxiv.org/abs/1706.02677)
    lr = base_lr * (global_batch_size / BASE_LR_BATCHSIZE)

    # worker parameters with training setup
    worker_params = {
        'seed': 1111,
        'data': 'data/wikitext-103',
        'dataset': 'wt103',
        'adaptive': True,
        'log_interval': 100,
        'eval_interval': 500,
        'max_tokens': int(1.5e9),
        'logdir': job.logdir,
        'lr': lr,
        'batch_size': local_batch_size,
        'eta_min': lr / 10,
    }

    worker_params.update(LARGE_ARGS if config.large else SMALL_ARGS)

    # pass through some user-provided settings that were arguments to the
    # launcher script
    user_params = {}
    if args.checkpoint_each_epoch:
        user_params['checkpoint_each_epoch'] = args.checkpoint_each_epoch
    if config.warmup_tokens:
        user_params['warmup_tokens'] = config.warmup_tokens

    if args.checkpoint or config.checkpoint:
        user_params['checkpoint'] = util.one_of([args.checkpoint,
                                                 config.checkpoint])

    if args.wiki:
        worker_params.update({
            'data': 'data/wikiextracted',
            'dataset': 'wiki',
            'dropatt': 0.1,
            'dropout': 0.1,
        })

    if args.bpe:
        worker_params.update({
            'div_val': 1,
            'bpe': True,
            'adaptive': False,
        })

    # User settings are applied after the dataset/BPE presets, and the
    # config's extra parameters last, so later updates win on key clashes.
    worker_params.update(user_params)

    if config.extra_worker_params:
        worker_params.update(config.extra_worker_params)

    nccl_params = _get_nccl_params()

    for i, task in enumerate(job.tasks):
        dist_params = (
            f'--nproc_per_node={num_gpus_per_machine} '
            f'--nnodes={config.machines} --node_rank={i} '
            f'--master_addr={job.tasks[0].ip} --master_port=6016'
        )
        cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} train.py {dict_to_args(worker_params)}'
        task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")
#!/usr/bin/env python
# Run crashing TensorFlow SVD example
import argparse

import ncluster

# Backend selection and argument parsing both happen at import time.
ncluster.set_backend('aws')

parser = argparse.ArgumentParser(description='launch')
parser.add_argument('--instance', default='c5.9xlarge')
parser.add_argument('--image',
                    default="Deep Learning AMI (Amazon Linux) Version 13.0")
args = parser.parse_args()


def main():
    """Run ``tensorflow_svd_crash.py`` on a fresh task and print its output."""
    task = ncluster.make_task(instance_type=args.instance,
                              image_name=args.image)

    # Enter the TensorFlow conda env before uploading and running the script.
    task.run('source activate tensorflow_p36')
    task.upload('tensorflow_svd_crash.py')

    out, err = task.run_with_output('python tensorflow_svd_crash.py')
    print(out, err)


if __name__ == '__main__':
    main()
'--name', type=str, default='txl', help= "name of the current run, used for machine naming and tensorboard visualization" ) parser.add_argument('--machines', type=int, default=1, help="how many machines to use") parser.add_argument("--local", action="store_true", help="enable to run on AWS") args = parser.parse_args() if not args.local: ncluster.set_backend('aws') # routines to build NCCL ring orders def get_nccl_params(num_tasks, num_gpus): if num_tasks <= 1: return 'NCCL_DEBUG=VERSION' return 'NCCL_MIN_NRINGS=4 NCCL_SINGLE_RING_THRESHOLD=10 NCCL_DEBUG=VERSION' def format_params(arg): if isinstance(arg, list) or isinstance(arg, dict): return '\"' + str(arg) + '\"' else: return str(arg)