def main():
    """Briefly allocate each p3 instance type, then shut it down immediately.

    Used as a billing sanity check: the instances exist just long enough
    to register on the bill.
    """
    for task_name, machine_type in (('p3-billing-test', 'p3.16xlarge'),
                                    ('p3-dn-billing-test', 'p3dn.24xlarge')):
        task = ncluster.make_task(name=task_name,
                                  instance_type=machine_type,
                                  disk_size=1000,
                                  image_name=args.image_name)
        # fire-and-forget: don't wait for the shutdown to complete
        task.run('sudo shutdown now', non_blocking=True)
def test_multiple_logdirs():
    """Distinct run_names get distinct logdirs; a pre-existing dir forces a '.01' suffix."""
    first_logdir = ncluster.get_logdir_root() + '/test1'
    helper = ncluster.make_task()
    helper.run(f'rm -Rf {first_logdir}')  # start from a clean slate
    task1 = ncluster.make_task(run_name='test1')
    assert task1.logdir == first_logdir

    second_logdir = ncluster.get_logdir_root() + '/test2'
    task2 = ncluster.make_task(run_name='test2')
    helper.run(f'rm -Rf {second_logdir}*')
    helper.run(f'mkdir {second_logdir}')
    # directory already exists, so the task's logdir gets a numeric suffix
    assert task2.logdir == second_logdir + '.01'
def run_launcher():
    """Allocate one worker, then run this script's receiver and sender roles on it.

    The receiver runs in the background in one tmux window; the sender runs in
    a second window and its output is read back from the 'out' file.
    """
    import ncluster
    ncluster.util.assert_script_in_current_directory()

    if args.aws:
        ncluster.set_backend('aws')

    # use 4GB instance, 0.5GB not enough
    worker = ncluster.make_task(args.name, image_name=args.image,
                                instance_type='t3.medium')
    worker.upload(__file__)
    worker.upload('util.py')

    # kill python just for when tmux session reuse is on
    if not ncluster.running_locally():
        # on AWS probably running in conda DLAMI, switch into TF-enabled env
        worker._run_raw('killall python', ignore_errors=True)
        worker.run('source activate tensorflow_p36')

    endpoint_flags = f'--sender-ip={worker.ip} --receiver-ip={worker.ip}'
    worker.run(f'python {__file__} --role=receiver {endpoint_flags}',
               non_blocking=True)

    worker.switch_window(1)  # run in new tmux window
    if not ncluster.running_locally():
        worker.run('source activate tensorflow_p36')
    worker.run(
        f'python {__file__} --role=sender {endpoint_flags} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}')
    print(worker.read('out'))
def main():
    """Provision a box, configure Jupyter with the given password, and serve a sample notebook."""
    task = ncluster.make_task(name=args.name,
                              instance_type=args.instance_type,
                              disk_size=1000,
                              image_name=args.image_name)

    # upload notebook config with provided password
    local_config = _create_jupyter_config(args.password)
    remote_config = '~/.jupyter/jupyter_notebook_config.py'
    task.run(f'source activate {args.conda_env}')
    task.upload(local_config, remote_config)
    task.run('conda install -c conda-forge jupyter_nbextensions_configurator jupyter_contrib_nbextensions -y ')
    task.run('jupyter nbextension enable toc2/main')

    # upload sample notebook and start Jupyter server
    task.run('mkdir -p /ncluster/notebooks')
    task.upload(f'{module_path}/gpubox_sample.ipynb',
                '/ncluster/notebooks/gpubox_sample.ipynb',
                dont_overwrite=True)
    task.run('cd /ncluster/notebooks')
    task.run('jupyter notebook', non_blocking=True)
    print(f'Jupyter notebook will be at http://{task.public_ip}:8888')
def main():
    """Reproduce the TensorFlow SVD crash on a fresh instance and print both output streams."""
    task = ncluster.make_task(instance_type=args.instance, image_name=args.image)
    task.run('source activate tensorflow_p36')
    task.upload('tensorflow_svd_crash.py')
    out, err = task.run_with_output('python tensorflow_svd_crash.py')
    print(out, err)
def test_multiple_logdir_tasks():
    """All tasks of one job must resolve to the same shared logdir, even under concurrent queries."""
    num_tasks = 10
    helper = ncluster.make_task()
    logdir = ncluster.get_logdir_root() + '/test1'
    helper.run(f'rm -Rf {logdir}')
    job = ncluster.make_job(run_name='test1', num_tasks=num_tasks)

    collected = []
    import wrapt

    @wrapt.synchronized  # serialize appends so the list itself is safe
    def record(idx):
        collected.append(job.tasks[idx].logdir)

    workers = [threading.Thread(target=record, args=(i,))
               for i in range(num_tasks)]
    # start in reversed order and join in shuffled order to shake out races
    for worker in reversed(workers):
        worker.start()
    random.shuffle(workers)
    for worker in workers:
        worker.join()

    assert len(set(collected)) == 1
    assert collected[0] == logdir
def run_launcher():
    """Install ray (nightly wheel or release), start a head node, and run the driver role."""
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    if args.nightly:
        # running locally MacOS
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    head = ncluster.make_task(name=args.name,
                              install_script=install_script,
                              image_name=args.image)
    if not ncluster.running_locally():
        head._run_raw('killall python', ignore_errors=True)
    head.upload(__file__)
    head.upload('util.py')
    if args.xray:
        head.run('export RAY_USE_XRAY=1')
    head.run('ray stop')

    # declare one ps and one worker resource on the single head node
    resources = """--resources='{"ps": 1, "worker": 1}'"""
    head.run(f"ray start --head {resources} --redis-port=6379")
    # head.run(f"ray start --redis-address={head.ip}:6379 {resources}")
    head.run(
        f'./{__file__} --role=driver --ip={head.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}')
    print(head.read('out'))
def launcher():
    """Launch multi-GPU fastai Transformer NMT training on AWS, then download results.

    Uploads the training script plus helpers, installs fastai, fetches the
    europarl fr-en corpus, runs distributed training via torch.distributed.launch,
    and downloads the resulting metrics file and model weights.
    """
    import ncluster
    task = ncluster.make_task(
        name=f'fastai_NMT_multi_{args.base}_{args.targ}',
        image_name='Deep Learning AMI (Ubuntu) Version 23.0',
        disk_size=500,  # 500 GB disk space
        instance_type='p3.16xlarge'
    )  # 'c5.large': CPU, p3.2xlarge: one GPU, 8x=4 GPU, 16x=8GPU
    task.upload('fastai_TransformerNMT_distributed_logging.py')  # send over the file.
    task.upload('transformer.py')  # helper files
    task.upload('seq2seq_metrics.py')
    task.upload('tbc.py')
    task.run('source activate pytorch_p36')
    task.run('conda install -y -c fastai fastai')
    task.run('pip install tb-nightly')
    task.run('pip install future')
    # task.run('wget https://s3.amazonaws.com/fast-ai-nlp/giga-fren.tgz && tar -xvf giga-fren.tgz')  ## for Qs dataset
    # fetch the europarl fr-en corpus into ~/europarl (cd persists across
    # run() calls because commands execute in the same tmux session)
    task.run('mkdir europarl && cd europarl')
    task.run(
        'wget http://www.statmt.org/europarl/v7/fr-en.tgz && tar -xvf fr-en.tgz && cd ~/'
    )  ## for Qs dataset
    task.run(
        f'python -m torch.distributed.launch --nproc_per_node={args.proc_per_node} '
        f'./fastai_TransformerNMT_distributed_logging.py --mode=worker '
        f'--epochs={args.epochs} --proc_per_node={args.proc_per_node} '
        f'--base={args.base} --targ={args.targ} --save-model',
        stream_output=True)
    # retrieve metrics log and trained weights named after the language pair
    name = f'seq2seq_tfrm_{args.base}_{args.targ}'
    task.download(f'{name}.txt')
    task.download(f'{name}.pth')
def launcher():
    """Kick off the MNIST example on a small CPU instance; training runs in the background."""
    task = ncluster.make_task(name='mnist',
                              instance_type='c5.large',
                              image_name='Deep Learning AMI (Ubuntu) Version 23.0')
    task.upload('mnist.py')
    task.run('source activate pytorch_p36')
    # leave training running; this launcher returns immediately
    task.run('python mnist.py', non_blocking=True)
def test():
    """A failing remote command raises RuntimeError on join() unless errors are ignored."""
    task = ncluster.make_task(image_name=ncluster.aws_backend.GENERIC_SMALL_IMAGE)

    # first failure is tolerated: join just prints the error message
    task.run("mkdir /illegal", non_blocking=True)
    task.join(ignore_errors=True)  # this succeed/print error message

    # same failing command again, but this time join must raise
    task.run("mkdir /illegal", non_blocking=True)
    with pytest.raises(RuntimeError):
        task.join()  # this should fail
def main():
    """Start a TensorBoard server on its own instance, pointed at args.logdir_root."""
    task = ncluster.make_task(name=args.name,
                              instance_type=args.instance_type,
                              disk_size=100,
                              image_name=args.image_name)
    task.run('source activate tensorflow_p36')
    # leave TensorBoard serving in the background
    task.run(f'tensorboard --logdir={args.logdir_root} --port=6006',
             non_blocking=True)
    print(f'TensorBoard at http://{task.public_ip}:6006')
def __createTask(self):
    """Provision a p3.16xlarge AWS task named after the run's timestamp; returns the task."""
    # moved import here, otherwise it asks to install ncluster on AWS machine
    from ncluster import use_aws, make_task
    use_aws()
    name = self.args.time  # NOTE(review): task name comes from args.time — presumably a timestamp string
    self.logger.addRow([['Task name', name]])
    return make_task(instance_type='p3.16xlarge',
                     name=name,
                     image_name='Deep Learning AMI (Ubuntu) Version 16.0')
def launcher():
    """Train fastai MNIST on a single-GPU instance and fetch the resulting weights."""
    import ncluster
    task = ncluster.make_task(name='fastai_mnist',
                              instance_type='p3.2xlarge',  # 'c5.large'
                              image_name='Deep Learning AMI (Ubuntu) Version 23.0')
    task.upload('fastai_mnist.py')  # send over the file.
    task.run('source activate pytorch_p36')
    task.run('conda install -y -c fastai fastai')  # install fastai
    task.run('python fastai_mnist.py --save-model', stream_output=True)
    task.download('mnist_example.pth')  # download the model weights
def launcher():
    """Distributed fastai wikitext-103 training on a single multi-GPU machine."""
    import ncluster
    task = ncluster.make_task(name='fastai_wk103_multi',
                              instance_type='p3.8xlarge',  # 'c5.large': CPU, p3.2xlarge: one GPU
                              image_name='Deep Learning AMI (Ubuntu) Version 23.0')
    task.upload('fastai_wk103_distributed.py')  # send over the file.
    task.run('source activate pytorch_p36')
    task.run('conda install -y -c fastai fastai')  # install fastai
    # get wiki103 and unzip
    task.run('wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip && unzip wikitext-103-v1.zip')
    # one process per GPU via torch.distributed.launch
    task.run(f'python -m torch.distributed.launch --nproc_per_node={args.proc_per_node} '
             f'./fastai_wk103_distributed.py --mode=worker --proc_per_node={args.proc_per_node} --save-model',
             stream_output=True)
def launcher():
    """Single-GPU fastai wikitext-103 training run."""
    import ncluster
    task = ncluster.make_task(name='fastai_wk103',
                              instance_type='p3.2xlarge',  # 'c5.large': CPU, p3.2xlarge: GPU
                              image_name='Deep Learning AMI (Ubuntu) Version 23.0')
    task.upload('fastai_wk103.py')  # send over the file.
    task.run('source activate pytorch_p36')
    task.run('conda install -y -c fastai fastai')  # install fastai
    # get wiki103 and unzip
    task.run('wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip && unzip wikitext-103-v1.zip')
    ## take params from `args` and use in this call here
    ## get this call from a text file? database query? Something to specify the tests we are interested in running.
    task.run('python fastai_wk103.py --save-model', stream_output=True)
def main():
    """Launch a box with Jupyter configured for the provided password plus a sample notebook."""
    task = ncluster.make_task(name=args.name,
                              instance_type=args.instance_type,
                              image_name=args.image_name)

    # upload notebook config with provided password
    local_config = _create_jupyter_config(args.password)
    task.upload(local_config, '~/.jupyter/jupyter_notebook_config.py')

    # upload sample notebook and start Jupyter server
    task.run('mkdir -p /ncluster/notebooks')
    task.upload(f'{module_path}/gpubox_sample.ipynb',
                '/ncluster/notebooks/gpubox_sample.ipynb',
                dont_overwrite=True)
    task.run('cd /ncluster/notebooks')
    task.run('jupyter notebook', non_blocking=True)
    print(f'Jupyter notebook will be at http://{task.public_ip}:8888')
def launch(instance):
    """Run benchmark on given instance type.

    Args:
      instance: EC2 instance type string, e.g. 'p3.2xlarge'.

    Uploads benchmark.py, configures problem size / short-mode via
    environment variables, and prints the benchmark's stdout.
    """
    task = ncluster.make_task('benchmark-' + instance,
                              instance_type=instance,
                              image_name=args.image)
    task.upload('benchmark.py')
    task.run('source activate tensorflow_p36')
    task.run('pip install torch')
    task.run('export CUDA_VISIBLE_DEVICES=0')
    if args.N:
        task.run(f'export LINALG_BENCHMARK_N={args.N}')
    if args.short:
        # BUG FIX: this previously exported args.N (the problem size) under
        # the short-mode variable; the short-mode branch must use args.short.
        task.run(f'export LINALG_BENCHMARK_SHORT={args.short}')
    stdout, stderr = task.run_with_output('python benchmark.py')
    print('=' * 80)
    print(instance)
    print(stdout)
def main():
    """Launch TensorBoard over the shared logdir root on a fresh instance.

    The TensorBoard window title is set to the AWS region's human name so
    multiple dashboards are easy to tell apart.
    """
    task = ncluster.make_task(args.name,
                              instance_type=args.instance_type,
                              image_name=args.image)
    task.run('source activate tensorflow_p36')

    # map availability zone prefix to a readable region label
    zone = ncluster.get_zone()
    window_title = 'Tensorboard'
    if zone.startswith('us-west-2'):
        window_title = 'Oregon'
    elif zone.startswith('us-east-1'):
        window_title = 'Virginia'
    elif zone.startswith('us-east-2'):
        window_title = 'Ohio'

    task.run(f'cd {ncluster.get_logdir_root()}')
    # BUG FIX: `async=True` is a SyntaxError on Python 3.7+ (`async` became a
    # reserved keyword); ncluster's keyword for background execution is
    # `non_blocking`, as used throughout this codebase.
    task.run(f'tensorboard --logdir=. --port=6006 --window_title={window_title}',
             non_blocking=True)
    print(f'Tensorboard will be at http://{task.public_ip}:6006')
def main():
    """Launch a gpubox and start either a Jupyter server or a TF benchmark on it.

    In 'jupyter' mode, writes the user's password hash into the notebook
    config before uploading it; in 'tf-benchmark' mode, re-runs this script
    remotely in the worker role.
    """
    task = ncluster.make_task('gpubox',
                              instance_type=args.instance_type,
                              use_spot=args.spot)
    ncluster.join(task)

    print("Task ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(task.connect_instructions)
    print()
    print()
    print()
    print()

    if args.mode == 'jupyter':
        # upload notebook config with provided password
        from notebook.auth import passwd
        sha = passwd(args.password)

        local_config_fn = f'{module_path}/jupyter_notebook_config.py'
        temp_config_fn = '/tmp/' + os.path.basename(local_config_fn)
        # TODO: remove /home/ubuntu
        remote_config_fn = f'/home/ubuntu/.jupyter/{os.path.basename(local_config_fn)}'
        os.system(f'cp {local_config_fn} {temp_config_fn}')
        # BUG FIX: the computed hash `sha` was never used — a literal
        # '******' placeholder was uploaded instead, locking users out.
        _replace_lines(temp_config_fn, 'c.NotebookApp.password',
                       f"c.NotebookApp.password = '{sha}'")
        task.upload(temp_config_fn, remote_config_fn)

        # upload sample notebook and start server
        task.switch_tmux('jupyter')
        task.run('mkdir -p /efs/notebooks')
        task.upload(f'{module_path}/sample.ipynb', '/efs/notebooks/sample.ipynb',
                    dont_overwrite=True)
        task.run('cd /efs/notebooks')
        task.run('jupyter notebook')
        # BUG FIX: `job` was undefined here (NameError); the task holds the IP.
        print(f'Jupyter notebook will be at http://{task.public_ip}:8888')
    elif args.mode == 'tf-benchmark':
        task.run('source activate tensorflow_p36')
        task.upload(__file__)
        task.run('python launch.py --internal-role=worker')
    else:
        assert False, "Unknown --mode, must be jupyter or tf-benchmark."
#!/usr/bin/env python
# Usage:
# ./launch_tensorboard.py
#
# This will launch r5.large machine on AWS with tensoboard, and print URL
# in the console
import ncluster

ncluster.use_aws()

task = ncluster.make_task('tensorboard',
                          instance_type='r5.large',
                          image_name='Deep Learning AMI (Ubuntu) Version 13.0')
task.run('source activate tensorflow_p36')
# BUG FIX: `async=True` is a SyntaxError on Python 3.7+ (`async` is a reserved
# keyword); ncluster's background-execution keyword is `non_blocking`.
# Serve the parent of this run's logdir so sibling runs are visible too.
task.run(f'tensorboard --logdir={task.logdir}/..', non_blocking=True)
print(f"Tensorboard at http://{task.public_ip}:6006")
#!/bin/env python
# Self-replicating smoke test: invoked with no arguments it provisions a tiny
# instance, copies itself over, and re-runs itself there in the 'worker' role.
import sys

if not sys.argv[1:]:
    # launcher role: allocate a machine and dispatch the worker role to it
    import ncluster
    task = ncluster.make_task(instance_type='t3.micro')
    task.upload(__file__)
    task.run('pip install tensorflow')
    task.run(f'python {__file__} worker')
elif sys.argv[1] == 'worker':
    # worker role: multiply two 1000x1000 matrices of ones and report the sum
    import tensorflow as tf
    session = tf.Session()
    ones_matrix = tf.ones((1000, 1000))
    product = session.run(tf.matmul(ones_matrix, ones_matrix))
    print(f"matmul gave {product.sum()}")
import os
import ncluster
import argparse

# Launch a standalone TensorBoard server on an r5.large pointed at the
# shared logdir root, and print its URL.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--name', type=str, default='tensorboard',
    help="name of the current run, used for machine naming and rundir name")
parser.add_argument('--image_name', type=str, default='cybertronai01',
                    help="use custom AMI ")
args = parser.parse_args()

task = ncluster.make_task(args.name,
                          instance_type='r5.large',
                          image_name=args.image_name)
task.run('source activate tensorflow_p36')
# FIX: removed unused local `logdir_root = os.path.dirname(task.logdir)` —
# TensorBoard is pointed at ncluster.get_logdir_root() directly.
task.run(f'tensorboard --logdir={ncluster.get_logdir_root()} --port=6006',
         non_blocking=True)
print(f'TensorBoard at http://{task.public_ip}:6006')
def test():
    """Run `ls` twenty times on one task to exercise repeated command streaming."""
    task = ncluster.make_task('test2')
    for _ in range(20):
        task.run('ls', stream_output=True)
def main():
    """Launch single-machine BERT pretraining on bookcorpus tfrecords.

    Provisions a task, installs dependencies, assembles the pretrain_bert.py
    command line (learning rate scaled down from the 256-global-batch
    reference), saves the command to the logdir, and starts training in the
    background.
    """
    task = ncluster.make_task(name=args.name,
                              run_name=f"{args.name}",
                              image_name=args.image_name,
                              instance_type=args.instance_type)
    task.upload('*')
    task.run('killall python || echo failed')  # kill previous run
    task.run('source activate pytorch_p36')
    task.run('pip install -r requirements.txt')
    # workaround for https://github.com/tensorflow/models/issues/3995
    task.run('pip install -U protobuf')

    # training shard list; validation/test use two fixed held-out shards
    train = open('bookcorpus.filelist.train').read().strip()
    validate = "/ncluster/data/bookcorpus.tfrecords/final_tfrecords_sharded/tf_examples.tfrecord000163"
    test = "/ncluster/data/bookcorpus.tfrecords/final_tfrecords_sharded/tf_examples.tfrecord000164"

    lr = 0.0001  # original learning rate for 256 global batch size/64 GPUs
    # scale down linearly for the smaller effective batch used here
    lr = lr / (256 / 15)

    cmd = (f"python pretrain_bert.py "
           f"--batch-size 5 "
           f"--tokenizer-type BertWordPieceTokenizer "
           f"--cache-dir cache_dir "
           f"--tokenizer-model-type bert-large-uncased "
           f"--vocab-size 30522 "
           f"--use-tfrecords "
           f"--train-data {train} "
           f"--valid-data {validate} "
           f"--test-data {test} "
           f"--max-preds-per-seq 80 "
           f"--seq-length 512 "
           f"--max-position-embeddings 512 "
           f"--num-layers 16 "
           f"--hidden-size 410 "
           f"--intermediate-size 4096 "
           f"--num-attention-heads 10 "
           f"--hidden-dropout 0.1 "
           f"--attention-dropout 0.1 "
           f"--train-iters 1000000 "
           f"--lr {lr} "
           f"--lr-decay-style linear "
           f"--lr-decay-iters 990000 "
           f"--warmup .01 "
           f"--weight-decay 1e-2 "
           f"--clip-grad 1.0 "
           f"--fp32-layernorm "
           f"--fp32-embedding "
           f"--hysteresis 2 "
           f"--num-workers 2 ")

    # new params
    cmd += f"--logdir {task.logdir} "
    if args.fp16:
        cmd += f"--fp16 "

    task.run(f'echo {cmd} > {task.logdir}/task.cmd')  # save command-line
    task.run(cmd, non_blocking=True)
    print(f"Logging to {task.logdir}")
import os
import ncluster

# Launch TensorBoard over the shared logdir root on an r5.large and print
# its URL.
task = ncluster.make_task('tensorboard',
                          instance_type='r5.large',
                          image_name='cybertronai01')
task.run('source activate tensorflow_p36')
# FIX: removed unused local `logdir_root = os.path.dirname(task.logdir)` —
# TensorBoard is pointed at ncluster.get_logdir_root() directly.
task.run(f'tensorboard --logdir={ncluster.get_logdir_root()} --port=6006',
         non_blocking=True)
print(f'TensorBoard at http://{task.public_ip}:6006')
import ncluster

if __name__ == '__main__':
    # drop into pdb on unhandled exceptions / SIGQUIT for easier debugging
    ncluster.util.install_pdb_handler()

    # task = ncluster.make_task(image_name='amzn2-ami-hvm-2.0.20180622.1-x86_64-gp2')
    task = ncluster.make_task(
        image_name='amzn2-ami-hvm-2.0.20180622.1-x86_64-gp2',
        name='ncluster-1535154315336620')
    task.join()
    # dump the remote machine's CPU details
    print(task.file_read('/proc/cpuinfo'))
#!/usr/bin/env python import ncluster # allocate default machine type and default image task = ncluster.make_task() output = task.run('ifconfig') print(f"Task ifconfig returned {output}")
#!/usr/bin/env python
# Usage:
# ./launch_tensorboard.py
#
# This will launch r5.large machine on AWS with tensoboard, and print URL
# in the console
import ncluster

task = ncluster.make_task(
    "tensorboard",
    run_name="tensorboard",
    image_name="Deep Learning AMI (Ubuntu) Version 23.0",
    instance_type="r5.large",
)
task.run("source activate tensorflow_p36")
# serve the parent of this run's logdir so sibling runs show up too
task.run(f"tensorboard --logdir={task.logdir}/..", non_blocking=True)
print(f"Tensorboard at http://{task.public_ip}:6006")
def main():
    """Launch multi-machine transformer-xl training described by args.config.

    Builds a job of config.machines instances, scales the learning rate
    linearly with the global batch size, then starts one
    torch.distributed.launch process group per machine, with task 0 acting
    as the master.
    """
    config = AttrDefault(lambda: None, config_defaults)
    assert args.config in globals(), f"unknown config {args.config}"
    # NOTE(review): eval of a config name — safe only because the name was
    # just checked against globals(); do not pass untrusted strings here.
    config.update(eval(args.config))

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}",
                            num_tasks=config.machines,
                            image_name=config.image_name,
                            instance_type=config.instance_type,
                            spot=not args.nospot,
                            skip_setup=args.skip_setup)
    job.rsync('.')
    job.run(f'killall python || echo failed && '  # kill previous run
            f'source activate {config.conda_env} && ' +
            f'pip install -r requirements.txt')

    instance_info = ncluster.aws_backend.INSTANCE_INFO[config.instance_type]
    num_gpus_per_machine = instance_info['gpus']
    total_gpus = num_gpus_per_machine * config.machines
    global_batch_size = config.batch_size * total_gpus

    # linear LR scaling (https://arxiv.org/abs/1706.02677)
    lr = config.base_lr * (global_batch_size / BASE_LR_BATCHSIZE)

    # TODO(y): change dataset location to /data/transformer-xl-data after
    # image is cut
    # worker parameters with training setup
    worker_params = {
        'seed': 1111,
        'data': 'data/wikitext-103',
        'dataset': 'wt103',
        'adaptive': True,
        'log_interval': 100,
        'eval_interval': 1000,
        'logdir': job.logdir,
        'lr': lr,
        'fp16': True,
        'dynamic_loss_scale': True,
        'batch_size': config.batch_size,
    }

    # overlay architecture-specific hyperparameters
    if config.architecture == 'wt103_large':
        worker_params.update(wt103_large)
    elif config.architecture == 'wt103_base':
        worker_params.update(wt103_base)
    else:
        assert False, f"Uknown architecture {config.architecture}"

    nccl_params = f'NCCL_DEBUG=VERSION NCCL_MIN_NRINGS={config.num_rings} '

    # one torch.distributed.launch invocation per machine; task 0 is master
    for i, task in enumerate(job.tasks):
        dist_params = \
            f'--nproc_per_node={num_gpus_per_machine} ' \
            f'--nnodes={config.machines} --node_rank={i} ' \
            f'--master_addr={job.tasks[0].ip} --master_port={6016} '
        cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} ' \
              f'train.py {util.dict_to_args(worker_params)}'
        task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")
if args.launch_tensorboard:
    # bring up a small CPU box serving TensorBoard over the shared logdir root
    task = ncluster.make_task('tensorboard',
                              image_name=args.image_name,
                              instance_type='r5.large')
    task.run('source activate tensorflow_p36')
    task.run(f'tensorboard --logdir={ncluster.get_logdir_root()} --port=6006',
             non_blocking=True)
    print(f'TensorBoard at http://{task.public_ip}:6006')
#!/usr/bin/env python
# Usage:
# ./launch_tensorboard.py
#
# This will launch r5.large machine on AWS with tensoboard, and print URL
# in the console
import ncluster

ncluster.use_aws()

task = ncluster.make_task('newton-tb',
                          image_name='Deep Learning AMI (Ubuntu) Version 16.0',
                          instance_type='r5.large')
task.run('source activate tensorflow_p36')
# serve the fixed newton runs directory; leave it running in the background
task.run('tensorboard --logdir=/ncluster/newton/runs', non_blocking=True)
print(f"Tensorboard at http://{task.public_ip}:6006")