Exemple #1
0
def main():
    """Billing smoke test: boot each p3 GPU instance type and power it off."""
    # Each (name, instance_type) pair is launched and immediately shut down;
    # non_blocking lets this script return without waiting on the halt.
    for task_name, instance in (('p3-billing-test', 'p3.16xlarge'),
                                ('p3-dn-billing-test', 'p3dn.24xlarge')):
        task = ncluster.make_task(name=task_name,
                                  instance_type=instance,
                                  disk_size=1000,
                                  image_name=args.image_name)
        task.run('sudo shutdown now', non_blocking=True)
Exemple #2
0
def test_multiple_logdirs():
  """Logdir naming: a fresh run gets the bare dir, a clash gets a '.01' suffix."""
  first_dir = ncluster.get_logdir_root() + '/test1'
  helper = ncluster.make_task()
  helper.run(f'rm -Rf {first_dir}')  # start from a clean slate
  first_task = ncluster.make_task(run_name='test1')
  assert first_task.logdir == first_dir

  second_dir = ncluster.get_logdir_root() + '/test2'
  second_task = ncluster.make_task(run_name='test2')
  helper.run(f'rm -Rf {second_dir}*')
  helper.run(f'mkdir {second_dir}')  # occupy the name to force a suffix
  assert second_task.logdir == second_dir + '.01'
Exemple #3
0
def run_launcher():
  """Provision a t3.medium worker and run the receiver/sender halves of this script."""
  import ncluster
  ncluster.util.assert_script_in_current_directory()

  if args.aws:
    ncluster.set_backend('aws')

  # use 4GB instance, 0.5GB not enough
  node = ncluster.make_task(args.name, image_name=args.image,
                            instance_type='t3.medium')
  for fn in (__file__, 'util.py'):
    node.upload(fn)

  # kill python just for when tmux session reuse is on
  if not ncluster.running_locally():
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    node._run_raw('killall python', ignore_errors=True)
    node.run('source activate tensorflow_p36')

  endpoints = f'--sender-ip={node.ip} --receiver-ip={node.ip}'
  node.run(f'python {__file__} --role=receiver {endpoints}',
           non_blocking=True)
  node.switch_window(1)  # run in new tmux window
  if not ncluster.running_locally():
    node.run('source activate tensorflow_p36')
  node.run(
    f'python {__file__} --role=sender {endpoints} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}')
  print(node.read('out'))
Exemple #4
0
def main():
    """Provision a GPU box, set up a password-protected Jupyter server, start it."""
    task = ncluster.make_task(name=args.name,
                              instance_type=args.instance_type,
                              disk_size=1000,
                              image_name=args.image_name)

    # upload notebook config with provided password
    notebook_cfg = _create_jupyter_config(args.password)
    task.run(f'source activate {args.conda_env}')
    task.upload(notebook_cfg, '~/.jupyter/jupyter_notebook_config.py')

    # notebook extensions (table of contents)
    task.run(
        'conda install -c conda-forge jupyter_nbextensions_configurator jupyter_contrib_nbextensions -y '
    )
    task.run('jupyter nbextension enable toc2/main')

    # upload sample notebook and start Jupyter server
    task.run('mkdir -p /ncluster/notebooks')
    task.upload(f'{module_path}/gpubox_sample.ipynb',
                '/ncluster/notebooks/gpubox_sample.ipynb',
                dont_overwrite=True)
    task.run('cd /ncluster/notebooks')
    task.run('jupyter notebook', non_blocking=True)
    print(f'Jupyter notebook will be at http://{task.public_ip}:8888')
def main():
  """Reproduce the TF SVD crash on a fresh instance and print its output."""
  repro = ncluster.make_task(instance_type=args.instance,
                             image_name=args.image)
  repro.run('source activate tensorflow_p36')
  repro.upload('tensorflow_svd_crash.py')
  out, err = repro.run_with_output('python tensorflow_svd_crash.py')
  print(out, err)
Exemple #6
0
def test_multiple_logdir_tasks():
  """All tasks of one job must resolve to a single shared logdir, even when queried concurrently."""
  n = 10
  dummy_task = ncluster.make_task()
  logdir1 = ncluster.get_logdir_root() + '/test1'
  dummy_task.run(f'rm -Rf {logdir1}')
  job = ncluster.make_job(run_name='test1', num_tasks=n)

  obtained_logdirs = []

  import wrapt

  # serialize list appends so the collection itself isn't a source of races
  @wrapt.synchronized
  def query(i):
    obtained_logdirs.append(job.tasks[i].logdir)

  workers = [threading.Thread(target=query, args=(i,)) for i in range(n)]
  # start in reverse order to shake out ordering assumptions
  for worker in reversed(workers):
    worker.start()

  random.shuffle(workers)
  for worker in workers:
    worker.join()

  assert len(set(obtained_logdirs)) == 1
  assert obtained_logdirs[0] == logdir1
Exemple #7
0
def run_launcher():
    """Start a Ray head node and drive the benchmark from this same script."""
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    # choose a Ray wheel: platform-specific nightly or the released package
    if args.nightly:
        # running locally MacOS
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    head = ncluster.make_task(name=args.name,
                              install_script=install_script,
                              image_name=args.image)
    if not ncluster.running_locally():
        head._run_raw('killall python', ignore_errors=True)
    for fn in (__file__, 'util.py'):
        head.upload(fn)
    if args.xray:
        head.run('export RAY_USE_XRAY=1')
    head.run('ray stop')

    # advertise custom ps/worker resources on the head node
    resources = """--resources='{"ps": 1, "worker": 1}'"""
    head.run(f"ray start --head {resources} --redis-port=6379")
    #  head.run(f"ray start --redis-address={head.ip}:6379 {resources}")
    head.run(
        f'./{__file__} --role=driver --ip={head.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}'
    )
    print(head.read('out'))
Exemple #8
0
def launcher():
    """Run distributed fastai Transformer NMT training and download the results."""
    import ncluster

    task = ncluster.make_task(
        name=f'fastai_NMT_multi_{args.base}_{args.targ}',
        image_name='Deep Learning AMI (Ubuntu) Version 23.0',
        disk_size=500,  # 500 GB disk space
        instance_type='p3.16xlarge'
    )  # 'c5.large': CPU, p3.2xlarge: one GPU, 8x=4 GPU, 16x=8GPU

    # training script plus its helper modules
    for fn in ('fastai_TransformerNMT_distributed_logging.py',
               'transformer.py',
               'seq2seq_metrics.py',
               'tbc.py'):
        task.upload(fn)

    task.run('source activate pytorch_p36')
    task.run('conda install -y -c fastai fastai')
    task.run('pip install tb-nightly')
    task.run('pip install future')
    # task.run('wget https://s3.amazonaws.com/fast-ai-nlp/giga-fren.tgz && tar -xvf giga-fren.tgz')  ## for Qs dataset
    task.run('mkdir europarl && cd europarl')
    task.run(
        'wget http://www.statmt.org/europarl/v7/fr-en.tgz && tar -xvf fr-en.tgz && cd ~/'
    )  ## for Qs dataset
    task.run(
        f'python -m torch.distributed.launch --nproc_per_node={args.proc_per_node} '
        f'./fastai_TransformerNMT_distributed_logging.py --mode=worker '
        f'--epochs={args.epochs} --proc_per_node={args.proc_per_node} '
        f'--base={args.base} --targ={args.targ} --save-model',
        stream_output=True)

    # pull the training log and model weights back to the local machine
    stem = f'seq2seq_tfrm_{args.base}_{args.targ}'
    task.download(f'{stem}.txt')
    task.download(f'{stem}.pth')
Exemple #9
0
def launcher():
    """Run the MNIST example on a small CPU instance without blocking."""
    box = ncluster.make_task(
        name='mnist',
        image_name='Deep Learning AMI (Ubuntu) Version 23.0',
        instance_type='c5.large')
    box.upload('mnist.py')
    box.run('source activate pytorch_p36')
    box.run('python mnist.py', non_blocking=True)
Exemple #10
0
def test():
  """Failed remote commands: join(ignore_errors=True) survives, plain join raises."""
  task = ncluster.make_task(image_name=ncluster.aws_backend.GENERIC_SMALL_IMAGE)
  task.run("mkdir /illegal", non_blocking=True)
  # tolerated failure: should succeed and just print the error message
  task.join(ignore_errors=True)

  task.run("mkdir /illegal", non_blocking=True)
  with pytest.raises(RuntimeError):
    task.join()  # this should fail
Exemple #11
0
def main():
    """Bring up a small instance serving TensorBoard over args.logdir_root."""
    tb = ncluster.make_task(name=args.name,
                            instance_type=args.instance_type,
                            disk_size=100,
                            image_name=args.image_name)

    tb.run('source activate tensorflow_p36')
    tb.run(f'tensorboard --logdir={args.logdir_root} --port=6006',
           non_blocking=True)
    print(f'TensorBoard at http://{tb.public_ip}:6006')
Exemple #12
0
    def __createTask(self):
        """Allocate a p3.16xlarge AWS task, log its name, and return the handle."""
        # moved import here, otherwise it asks to install ncluster on AWS machine
        from ncluster import use_aws, make_task
        use_aws()
        # NOTE(review): the task name comes from self.args.time — presumably a
        # timestamp used as a unique name; confirm against the CLI definition.
        taskName = self.args.time
        self.logger.addRow([['Task name', taskName]])
        return make_task(instance_type='p3.16xlarge',
                         name=taskName,
                         image_name='Deep Learning AMI (Ubuntu) Version 16.0')
Exemple #13
0
def launcher():
    """Train fastai MNIST on a single GPU and download the saved weights."""
    import ncluster

    gpu_box = ncluster.make_task(name='fastai_mnist',
                                 image_name='Deep Learning AMI (Ubuntu) Version 23.0',
                                 instance_type='p3.2xlarge')  # 'c5.large')
    gpu_box.upload('fastai_mnist.py')  # send over the file
    gpu_box.run('source activate pytorch_p36')
    gpu_box.run('conda install -y -c fastai fastai')  # install fastai
    gpu_box.run('python fastai_mnist.py --save-model', stream_output=True)
    gpu_box.download('mnist_example.pth')  # download the model weights
def launcher():
    """Distributed fastai wikitext-103 training on a multi-GPU machine."""
    import ncluster

    node = ncluster.make_task(name='fastai_wk103_multi',
                              image_name='Deep Learning AMI (Ubuntu) Version 23.0',
                              instance_type='p3.8xlarge')  # 'c5.large': CPU, p3.2xlarge: one GPU
    node.upload('fastai_wk103_distributed.py')  # send over the file
    node.run('source activate pytorch_p36')
    node.run('conda install -y -c fastai fastai')  # install fastai
    # fetch wikitext-103 and unzip it
    node.run('wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip && unzip wikitext-103-v1.zip')
    node.run(f'python -m torch.distributed.launch --nproc_per_node={args.proc_per_node} '
             f'./fastai_wk103_distributed.py --mode=worker --proc_per_node={args.proc_per_node} --save-model', stream_output=True)
Exemple #15
0
def launcher():
    """Single-GPU fastai wikitext-103 training run."""
    import ncluster

    node = ncluster.make_task(
        name='fastai_wk103',
        image_name='Deep Learning AMI (Ubuntu) Version 23.0',
        instance_type='p3.2xlarge')  # 'c5.large': CPU, p3.2xlarge: GPU
    node.upload('fastai_wk103.py')  # send over the file
    node.run('source activate pytorch_p36')
    node.run('conda install -y -c fastai fastai')  # install fastai
    # fetch wikitext-103 and unzip it
    node.run(
        'wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip && unzip wikitext-103-v1.zip'
    )

    # TODO: take params from `args` and use in this call here;
    # possibly read the test list from a text file or database query.
    node.run('python fastai_wk103.py --save-model', stream_output=True)
Exemple #16
0
def main():
    """Configure and start a password-protected Jupyter server on a new box."""
    box = ncluster.make_task(name=args.name,
                             instance_type=args.instance_type,
                             image_name=args.image_name)

    # upload notebook config with provided password
    local_cfg = _create_jupyter_config(args.password)
    box.upload(local_cfg, '~/.jupyter/jupyter_notebook_config.py')

    # upload sample notebook and start Jupyter server
    box.run('mkdir -p /ncluster/notebooks')
    box.upload(f'{module_path}/gpubox_sample.ipynb',
               '/ncluster/notebooks/gpubox_sample.ipynb',
               dont_overwrite=True)
    box.run('cd /ncluster/notebooks')
    box.run('jupyter notebook', non_blocking=True)
    print(f'Jupyter notebook will be at http://{box.public_ip}:8888')
Exemple #17
0
def launch(instance):
    """Run benchmark on given instance type.

    Provisions an instance named 'benchmark-<instance>', uploads benchmark.py,
    configures it via exported environment variables, runs it, and prints the
    instance name followed by the benchmark's stdout.

    Args:
      instance: EC2 instance type string (e.g. 'p3.2xlarge').
    """
    task = ncluster.make_task('benchmark-' + instance,
                              instance_type=instance,
                              image_name=args.image)
    task.upload('benchmark.py')
    task.run('source activate tensorflow_p36')
    task.run('pip install torch')
    # pin the benchmark to a single GPU
    task.run('export CUDA_VISIBLE_DEVICES=0')
    if args.N:
        task.run(f'export LINALG_BENCHMARK_N={args.N}')
    if args.short:
        # NOTE(review): exports args.N here, not args.short — looks like a
        # copy-paste slip; confirm what LINALG_BENCHMARK_SHORT should carry.
        task.run(f'export LINALG_BENCHMARK_SHORT={args.N}')

    stdout, stderr = task.run_with_output('python benchmark.py')
    print('=' * 80)
    print(instance)
    print(stdout)
Exemple #18
0
def main():
    """Start TensorBoard over the shared logdir root, titled by AWS region."""
    task = ncluster.make_task(args.name,
                              instance_type=args.instance_type,
                              image_name=args.image)
    task.run('source activate tensorflow_p36')

    # label the TensorBoard tab with a human-readable region name
    zone = ncluster.get_zone()
    window_title = 'Tensorboard'
    if zone.startswith('us-west-2'):
        window_title = 'Oregon'
    elif zone.startswith('us-east-1'):
        window_title = 'Virginia'
    elif zone.startswith('us-east-2'):
        window_title = 'Ohio'

    task.run(f'cd {ncluster.get_logdir_root()}')
    # BUG FIX: `async` has been a reserved keyword since Python 3.7, so the
    # original `async=True` was a SyntaxError; the rest of this codebase uses
    # `non_blocking=True` for the same fire-and-forget behavior.
    task.run(
        f'tensorboard --logdir=. --port=6006 --window_title={window_title}',
        non_blocking=True)
    print(f'Tensorboard will be at http://{task.public_ip}:6006')
def main():
  """Launch a gpubox and run either a Jupyter server or a TF benchmark on it.

  Mode is chosen by args.mode ('jupyter' or 'tf-benchmark'); anything else
  aborts with an assertion.
  """
  task = ncluster.make_task('gpubox', instance_type=args.instance_type,
                           use_spot=args.spot)
  ncluster.join(task)

  print("Task ready for connection, run the following:")
  print("../connect "+args.name)
  print("Alternatively run")
  print(task.connect_instructions)
  print()
  print()
  print()
  print()

  if args.mode == 'jupyter':
    # upload notebook config with provided password
    from notebook.auth import passwd
    # NOTE(review): `sha` is computed but unused below — the password line
    # looks scrubbed ('******'); it presumably should interpolate `sha`.
    sha = passwd(args.password)
    local_config_fn = f'{module_path}/jupyter_notebook_config.py'
    temp_config_fn = '/tmp/'+os.path.basename(local_config_fn)
    # TODO: remove /home/ubuntu
    remote_config_fn = f'/home/ubuntu/.jupyter/{os.path.basename(local_config_fn)}'
    os.system(f'cp {local_config_fn} {temp_config_fn}')
    _replace_lines(temp_config_fn, 'c.NotebookApp.password',
                   f"c.NotebookApp.password = '******'")
    task.upload(temp_config_fn, remote_config_fn)

    # upload sample notebook and start server
    task.switch_tmux('jupyter')
    task.run('mkdir -p /efs/notebooks')
    task.upload(f'{module_path}/sample.ipynb', '/efs/notebooks/sample.ipynb',
               dont_overwrite=True)
    task.run('cd /efs/notebooks')
    task.run('jupyter notebook')
    # BUG FIX: the original read `job.public_ip`, but no `job` exists in this
    # function — the handle allocated above is `task`.
    print(f'Jupyter notebook will be at http://{task.public_ip}:8888')
  elif args.mode == 'tf-benchmark':
    task.run('source activate tensorflow_p36')
    task.upload(__file__)
    task.run('python launch.py --internal-role=worker')
  else:
    assert False, "Unknown --mode, must be jupyter or tf-benchmark."
#!/usr/bin/env python
# Usage:
# ./launch_tensorboard.py
#
# This will launch an r5.large machine on AWS with TensorBoard, and print the
# URL in the console.
import ncluster
ncluster.use_aws()

task = ncluster.make_task('tensorboard',
                          instance_type='r5.large',
                          image_name='Deep Learning AMI (Ubuntu) Version 13.0')
task.run('source activate tensorflow_p36')
# BUG FIX: `async` has been a reserved keyword since Python 3.7, so the
# original `async=True` was a SyntaxError; the codebase's equivalent
# fire-and-forget flag is `non_blocking=True`.
task.run(f'tensorboard --logdir={task.logdir}/..', non_blocking=True)
print(f"Tensorboard at http://{task.public_ip}:6006")
Exemple #21
0
#!/bin/env python
# Self-uploading smoke test: with no CLI arguments, provision a t3.micro,
# copy this file over, install TensorFlow, and re-run the file remotely in
# 'worker' mode; in 'worker' mode, multiply two 1000x1000 ones-matrices and
# print the sum of the result.
import sys

if not sys.argv[1:]:
    # launcher path: no arguments given, so run remotely
    import ncluster
    task = ncluster.make_task(instance_type='t3.micro')
    task.upload(__file__)
    task.run('pip install tensorflow')
    task.run(f'python {__file__} worker')
elif sys.argv[1] == 'worker':
    # worker path: executes on the remote instance
    # NOTE(review): uses the TF1 session API (tf.Session / sess.run), which
    # requires tensorflow<2 or tf.compat.v1 — confirm the installed version.
    import tensorflow as tf
    sess = tf.Session()
    ones = tf.ones((1000, 1000))
    result = sess.run(tf.matmul(ones, ones))
    print(f"matmul gave {result.sum()}")
import os
import ncluster

import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
    '--name',
    type=str,
    default='tensorboard',
    help="name of the current run, used for machine naming and rundir name")
parser.add_argument('--image_name',
                    type=str,
                    default='cybertronai01',
                    help="use custom AMI ")
args = parser.parse_args()

# Launch a small instance that serves TensorBoard over the shared logdir root.
task = ncluster.make_task(args.name,
                          instance_type='r5.large',
                          image_name=args.image_name)

task.run('source activate tensorflow_p36')
# NOTE(review): logdir_root is never used below (the tensorboard command reads
# ncluster.get_logdir_root() directly); left in place because accessing
# task.logdir may have side effects in ncluster — confirm before removing.
logdir_root = os.path.dirname(task.logdir)
task.run(f'tensorboard --logdir={ncluster.get_logdir_root()} --port=6006',
         non_blocking=True)
print(f'TensorBoard at http://{task.public_ip}:6006')
def test():
    """Stress command dispatch by running `ls` twenty times in a row."""
    probe = ncluster.make_task('test2')
    for _ in range(20):
        probe.run('ls', stream_output=True)
Exemple #24
0
def main():
    """Launch single-node BERT pretraining on the bookcorpus tfrecords.

    Provisions the instance, syncs the working directory, installs
    dependencies, assembles the pretrain_bert.py command line, records the
    command under the logdir, and starts training without blocking.
    """
    task = ncluster.make_task(name=args.name,
                              run_name=f"{args.name}",
                              image_name=args.image_name,
                              instance_type=args.instance_type)

    task.upload('*')
    task.run('killall python || echo failed')  # kill previous run
    task.run('source activate pytorch_p36')
    task.run('pip install -r requirements.txt')
    # workaround for https://github.com/tensorflow/models/issues/3995
    task.run('pip install -U protobuf')

    # training shard list is read from a local file; fixed shards for valid/test
    train = open('bookcorpus.filelist.train').read().strip()
    validate = "/ncluster/data/bookcorpus.tfrecords/final_tfrecords_sharded/tf_examples.tfrecord000163"
    test = "/ncluster/data/bookcorpus.tfrecords/final_tfrecords_sharded/tf_examples.tfrecord000164"

    lr = 0.0001  # original learning rate for 256 global batch size/64 GPUs
    # linearly scale the LR down for this run's smaller effective batch
    lr = lr / (256 / 15)

    cmd = (f"python pretrain_bert.py "
           f"--batch-size 5 "
           f"--tokenizer-type BertWordPieceTokenizer "
           f"--cache-dir cache_dir "
           f"--tokenizer-model-type bert-large-uncased "
           f"--vocab-size 30522 "
           f"--use-tfrecords "
           f"--train-data {train} "
           f"--valid-data {validate} "
           f"--test-data {test} "
           f"--max-preds-per-seq 80 "
           f"--seq-length 512 "
           f"--max-position-embeddings 512 "
           f"--num-layers 16 "
           f"--hidden-size 410 "
           f"--intermediate-size 4096 "
           f"--num-attention-heads 10 "
           f"--hidden-dropout 0.1 "
           f"--attention-dropout 0.1 "
           f"--train-iters 1000000 "
           f"--lr {lr} "
           f"--lr-decay-style linear "
           f"--lr-decay-iters 990000 "
           f"--warmup .01 "
           f"--weight-decay 1e-2 "
           f"--clip-grad 1.0 "
           f"--fp32-layernorm "
           f"--fp32-embedding "
           f"--hysteresis 2 "
           f"--num-workers 2 ")

    # new params
    cmd += f"--logdir {task.logdir} "

    if args.fp16:
        cmd += f"--fp16 "

    task.run(f'echo {cmd} > {task.logdir}/task.cmd')  # save command-line
    task.run(cmd, non_blocking=True)

    print(f"Logging to {task.logdir}")
import os
import ncluster

# Launch a small instance that serves TensorBoard over the shared logdir root.
task = ncluster.make_task('tensorboard',
                          instance_type='r5.large',
                          image_name='cybertronai01')

task.run('source activate tensorflow_p36')
# NOTE(review): logdir_root is never used below (the tensorboard command reads
# ncluster.get_logdir_root() directly); left in place because accessing
# task.logdir may have side effects in ncluster — confirm before removing.
logdir_root = os.path.dirname(task.logdir)
task.run(f'tensorboard --logdir={ncluster.get_logdir_root()} --port=6006',
         non_blocking=True)
print(f'TensorBoard at http://{task.public_ip}:6006')
Exemple #26
0
import ncluster

if __name__ == '__main__':
    # install ncluster's pdb handler for interactive debugging on faults
    ncluster.util.install_pdb_handler()
    #  task = ncluster.make_task(image_name='amzn2-ami-hvm-2.0.20180622.1-x86_64-gp2')
    probe = ncluster.make_task(
        name='ncluster-1535154315336620',
        image_name='amzn2-ami-hvm-2.0.20180622.1-x86_64-gp2')

    # wait until the instance is up, then dump its CPU info
    probe.join()
    print(probe.file_read('/proc/cpuinfo'))
Exemple #27
0
#!/usr/bin/env python
import ncluster

# allocate default machine type and default image
box = ncluster.make_task()
ifconfig_output = box.run('ifconfig')
print(f"Task ifconfig returned {ifconfig_output}")
#!/usr/bin/env python
# Usage:
# ./launch_tensorboard.py
#
# Launches an r5.large machine on AWS running TensorBoard and prints the URL
# in the console.
import ncluster

tb_task = ncluster.make_task(
    "tensorboard",
    instance_type="r5.large",
    run_name="tensorboard",
    image_name="Deep Learning AMI (Ubuntu) Version 23.0",
)
tb_task.run("source activate tensorflow_p36")
# serve the parent of this task's logdir, in the background
tb_task.run(f"tensorboard --logdir={tb_task.logdir}/..", non_blocking=True)
print(f"Tensorboard at http://{tb_task.public_ip}:6006")
Exemple #29
0
def main():
    """Launch a (possibly multi-machine) transformer-xl training job.

    Builds the run configuration from defaults plus a named preset, allocates
    the job, computes a linearly scaled learning rate, starts one
    torch.distributed worker process group per machine, and optionally
    launches a separate TensorBoard instance.
    """
    config = AttrDefault(lambda: None, config_defaults)

    # NOTE(review): eval() on a CLI-provided name — guarded only by the assert
    # restricting it to names already defined in this module; do not relax.
    assert args.config in globals(), f"unknown config {args.config}"
    config.update(eval(args.config))

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}",
                            num_tasks=config.machines,
                            image_name=config.image_name,
                            instance_type=config.instance_type,
                            spot=not args.nospot,
                            skip_setup=args.skip_setup)

    job.rsync('.')
    job.run(f'killall python || echo failed && '  # kill previous run
            f'source activate {config.conda_env} && ' +
            f'pip install -r requirements.txt')

    instance_info = ncluster.aws_backend.INSTANCE_INFO[config.instance_type]
    num_gpus_per_machine = instance_info['gpus']

    total_gpus = num_gpus_per_machine * config.machines
    global_batch_size = config.batch_size * total_gpus

    # linear LR scaling (https://arxiv.org/abs/1706.02677)
    lr = config.base_lr * (global_batch_size / BASE_LR_BATCHSIZE)

    # TODO(y): change dataset location to /data/transformer-xl-data after
    # image is cut
    # worker parameters with training setup
    worker_params = {
        'seed': 1111,
        'data': 'data/wikitext-103',
        'dataset': 'wt103',
        'adaptive': True,
        'log_interval': 100,
        'eval_interval': 1000,
        'logdir': job.logdir,
        'lr': lr,
        'fp16': True,
        'dynamic_loss_scale': True,
        'batch_size': config.batch_size,
    }

    # merge in the architecture-specific hyperparameters
    if config.architecture == 'wt103_large':
        worker_params.update(wt103_large)
    elif config.architecture == 'wt103_base':
        worker_params.update(wt103_base)
    else:
        assert False, f"Uknown architecture {config.architecture}"

    nccl_params = f'NCCL_DEBUG=VERSION NCCL_MIN_NRINGS={config.num_rings} '

    # one torch.distributed launcher per machine; task 0 hosts the master
    for i, task in enumerate(job.tasks):
        dist_params = \
            f'--nproc_per_node={num_gpus_per_machine} ' \
            f'--nnodes={config.machines} --node_rank={i} ' \
            f'--master_addr={job.tasks[0].ip} --master_port={6016} '
        cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} ' \
            f'train.py {util.dict_to_args(worker_params)}'
        task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")

    if args.launch_tensorboard:
        # separate small instance serving TensorBoard over the logdir root
        task = ncluster.make_task('tensorboard',
                                  instance_type='r5.large',
                                  image_name=args.image_name)

        task.run('source activate tensorflow_p36')
        task.run(f'tensorboard --logdir={ncluster.get_logdir_root()} --port=6006',
                 non_blocking=True)
        print(f'TensorBoard at http://{task.public_ip}:6006')
#!/usr/bin/env python
# Usage:
# ./launch_tensorboard.py
#
# Launches an r5.large machine on AWS running TensorBoard and prints the URL
# in the console.
import ncluster

ncluster.use_aws()
tb = ncluster.make_task('newton-tb',
                        instance_type='r5.large',
                        image_name='Deep Learning AMI (Ubuntu) Version 16.0')
tb.run('source activate tensorflow_p36')
# serve the fixed newton runs directory, in the background
tb.run(f'tensorboard --logdir=/ncluster/newton/runs', non_blocking=True)
print(f"Tensorboard at http://{tb.public_ip}:6006")