Ejemplo n.º 1
0
def main():
    """Bring up a single AWS instance, reset TensorBoard, and start the
    resnet b512 baseline run with logs under /efs/runs/<group>/<name>.

    Relies on module-level `args` (parsed CLI flags).
    """
    import aws_backend

    run = aws_backend.make_run(args.name,
                               ami=args.ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('main', instance_type=args.instance_type)
    job.wait_until_ready()
    print(job.connect_instructions)

    logdir = '/efs/runs/%s/%s' % (args.group, args.name)

    # A live tensorboard keeps the efs logdir open and would prevent its
    # deletion, so stop it before wiping previous logs.
    job.run("tmux kill-session -t tb || echo ok")
    job.run('rm -Rf %s || echo failed' % (logdir, ))  # wipe previous logs

    # Restart the visualizer in its own detached tmux session.
    for tb_cmd in ("tmux new-session -s tb -n 0 -d",
                   "tmux send-keys -t tb:0 'source activate mxnet_p36' Enter",
                   "tmux send-keys -t tb:0 'tensorboard --logdir %s' Enter" %
                   (logdir, )):
        job.run(tb_cmd)

    job.run('source activate mxnet_p36')
    job.run('killall python || echo failed')  # stop any previous run
    job.run(
        'pip install -U https://s3.amazonaws.com/inferno-dlami/tensorflow/p3/tensorflow-1.5.0-cp36-cp36m-linux_x86_64.whl'
    )
    for script in ('imagenet_utils.py', 'resnet_model.py',
                   'resnet.b512.baseline.py'):
        job.upload(script)
    job.run_async('python resnet.b512.baseline.py --logdir=%s' % (logdir, ))
Ejemplo n.º 2
0
def main():
    """Create a run from CLI args and spawn the 'worker' job on it."""
    run_settings = dict(ami_name=args.ami_name,
                        availability_zone=args.zone,
                        linux_type=args.linux_type,
                        skip_efs_mount=args.skip_efs_mount)
    run = aws_backend.make_run(args.name, **run_settings)
    create_job(run, 'worker', args.num_tasks)
Ejemplo n.º 3
0
def main():
    """Launch an AWS instance that runs mnist-convnet.py, with TensorBoard
    visualizing /efs/runs/yuxin_numpy in a detached tmux session.

    Relies on module-level globals: `args` (parsed CLI flags) and
    `module_path` — neither is defined inside this function.
    """
    import aws_backend

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               ami=args.ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('main', instance_type=args.instance_type)
    job.wait_until_ready()
    # Show how to ssh/attach to the freshly-started instance.
    print(job.connect_instructions)

    # if tensorboard is running, kill it, it will prevent efs logdir from being
    # deleted
    job.run("tmux kill-session -t tb || echo ok")
    job.run('rm -Rf /efs/runs/yuxin_numpy/mnist-convnet || echo failed'
            )  # delete prev logs

    # Launch tensorboard visualizer in separate tmux session
    job.run("tmux new-session -s tb -n 0 -d")
    job.run("tmux send-keys -t tb:0 'source activate mxnet_p36' Enter")
    job.run(
        "tmux send-keys -t tb:0 'tensorboard --logdir /efs/runs/yuxin_numpy' Enter"
    )

    job.run('source activate mxnet_p36')
    # NOTE(review): `module_path` is not defined in this function — presumably
    # a module-level global holding this script's directory; confirm.
    job.upload(module_path + '/mnist-convnet.py')
    job.run('killall python || echo failed')  # kill previous run
    job.run_async('python mnist-convnet.py')
Ejemplo n.º 4
0
def main():
    """Create an AWS run named per CLI args and launch the requested job."""
    import aws_backend

    launch_settings = {
        'ami': args.ami,
        'availability_zone': args.zone,
        'linux_type': args.linux_type,
    }
    run = aws_backend.make_run(args.name, **launch_settings)
    create_job(run, args.job_name, args.num_tasks)
Ejemplo n.º 5
0
def launcher():
    """Provision a 'gpubox' instance and benchmark raw volume read
    throughput with fio.

    Relies on module-level `args` (parsed CLI flags).
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    # Make sure shared AWS resources exist before launching anything.
    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script='',
                               ami=args.ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('gpubox', instance_type=args.instance)
    job.wait_until_ready()

    # Sequential fio read over the raw root device: warms the EBS volume and
    # reports its read throughput.
    for cmd in (
            'source activate mxnet_p36',
            'sudo apt install -y fio',
            'volume=/dev/xvda1',
            'time sudo fio --filename=$volume --rw=read --bs=128k --iodepth=32 --ioengine=libaio --direct=1 --name=volume-initialize',
    ):
        job.run(cmd)
Ejemplo n.º 6
0
def main():
    """Launch a 2-task 'worker' job and a 2-task 'ps' job, then measure
    network bandwidth between the two worker tasks with iperf3.

    Relies on module-level globals: `args`, INSTALL_SCRIPT_UBUNTU/AMAZON and
    ami_dict_ubuntu/ami_dict_amazon (region -> AMI id mappings).
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    # Make sure shared AWS resources exist before launching anything.
    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    # Select install script and AMI table for the requested distro.
    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print(
            "Warning, using provided AMI, make sure that --linux-type argument "
            "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    worker_job = run.make_job('worker',
                              instance_type=args.instance_type,
                              num_tasks=2)
    ps_job = run.make_job('ps', instance_type=args.instance_type, num_tasks=2)
    worker_job.wait_until_ready()
    ps_job.wait_until_ready()

    # iperf3 server on worker 0, client on worker 1: 10 parallel streams,
    # 1-second reporting interval, 60-second run, port 6006.
    worker_job.tasks[0].run_async('sudo iperf3 -s -p 6006')
    worker_job.tasks[1].run('sudo iperf3 -c %s -P 10 -i 1 -t 60 -V -p 6006' %
                            (worker_job.tasks[0].ip, ))
    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    # BUG FIX: was `print(job.connect_instructions)` but no `job` variable
    # exists in this function (NameError at runtime); show the worker job's
    # connect instructions instead.
    print(worker_job.connect_instructions)
    print()
    print()
    print()
    print()
Ejemplo n.º 7
0
def main():
    """Launch a worker instance and, depending on --mode, either start a
    password-protected Jupyter notebook server or run a TF benchmark.

    Relies on module-level globals: `args` (parsed CLI flags) and
    `_replace_lines` (file line-replacement helper defined elsewhere).
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import util as u

    u.maybe_create_resources()

    run = aws_backend.make_run(args.name, ami_name=args.ami_name)
    job = run.make_job('worker',
                       instance_type=args.instance_type,
                       use_spot=args.spot)
    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    if args.mode == 'jupyter':
        # upload notebook config with provided password
        from notebook.auth import passwd
        sha = passwd(args.password)
        local_config_fn = f'{module_path}/jupyter_notebook_config.py'
        temp_config_fn = '/tmp/' + os.path.basename(local_config_fn)
        remote_config_fn = f'/home/ubuntu/.jupyter/{os.path.basename(local_config_fn)}'
        os.system(f'cp {local_config_fn} {temp_config_fn}')
        # BUG FIX: the computed password hash `sha` was never written into the
        # config (the replacement line contained a literal '******'), so the
        # requested password was never installed. Inject the real hash.
        _replace_lines(temp_config_fn, 'c.NotebookApp.password',
                       f"c.NotebookApp.password = '{sha}'")
        job.upload(temp_config_fn, remote_config_fn)

        # upload sample notebook and start server
        job.run('mkdir -p /efs/notebooks')
        job.upload(f'{module_path}/sample.ipynb',
                   '/efs/notebooks/sample.ipynb',
                   dont_overwrite=True)
        job.run('cd /efs/notebooks')
        job.run_async('jupyter notebook')
        print(f'Jupyter notebook will be at http://{job.public_ip}:8888')
    elif args.mode == 'tf-benchmark':
        job.run('source activate tensorflow_p36')
        job.upload(__file__)
        job.run('killall python || echo pass')  # kill previous run
        job.run_async('python launch.py --internal-role=worker')
    else:
        assert False, "Unknown --mode, must be jupyter or tf-benchmark."
Ejemplo n.º 8
0
def main():
    """Create a run and job from CLI args, then start training with the
    parameter preset given by --params."""
    run_settings = dict(ami=args.ami,
                        ami_name=args.ami_name,
                        availability_zone=args.zone,
                        linux_type=args.linux_type,
                        skip_efs_mount=not args.mount_efs)
    run = aws_backend.make_run(args.name, **run_settings)
    job = create_job(run, args.job_name, args.num_tasks)

    # Training params come from a preset expression on the command line.
    # NOTE(review): eval() of a CLI string executes arbitrary code —
    # acceptable only because this launcher is run by a trusted operator.
    params = eval(args.params)
    start_training(job, params, save_tag='testing_refactor')
Ejemplo n.º 9
0
def main():
    """Start training using a parameter preset supplied via --params.

    AMI name, task count and conda env name are all extracted from the
    preset; the legacy --num-tasks / --ami-name flags must be left at their
    deprecated sentinel values.
    """
    # NOTE(review): eval() of a command-line string executes arbitrary code —
    # acceptable only for a trusted launcher, never for untrusted input.
    params = eval(args.params)
    assert args.num_tasks == -1, "num-tasks is deprecated, it's now specified along with training parameters as --num-tasks."
    assert args.ami_name == '-1', "ami_name is deprecated, it's now specified along with training parameters as --ami-name."
    ami_name = _extract_ami_name(params)
    num_tasks = _extract_num_tasks(params)
    env_name = _extract_env_name(params)

    run = aws_backend.make_run(args.name,
                               ami_name=ami_name,
                               skip_efs_mount=args.skip_efs_mount)

    job = create_job(run, 'worker', num_tasks, env_name)
    run.setup_logdir()  # must happen after first job is created and ready

    # Define custom params for training or use a preset above
    # TODO: move "save_tag" into command-line parameter
    start_training(job, params, save_tag=args.name)
Ejemplo n.º 10
0
def launcher():
    """Provision a 'gpubox' instance whose cloud-init user-data pre-installs
    ray, install a nightly TF wheel, and relaunch this script remotely in
    worker mode.

    Relies on module-level globals: `args` (parsed CLI flags) and
    `ami_dict_ubuntu` (region -> AMI id mapping).
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    # Make sure shared AWS resources exist before launching anything.
    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    ami_dict = ami_dict_ubuntu

    if args.ami:
        print(
            "Warning, using provided AMI, make sure that --linux-type argument "
            "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # Cloud-init script executed by the instance at first boot: prepares /efs,
    # activates the pytorch_p36 conda env and installs ray. The test*.txt
    # files are breadcrumbs for verifying that user-data actually ran.
    user_data = """#!/bin/bash
sudo mkdir -p /efs
sudo chmod 777 /efs
echo 'Running user-data!'
echo 'test' > /home/ubuntu/test.txt
echo 'activating pytorch_p36'
source /home/ubuntu/anaconda3/bin/activate pytorch_p36
echo $PS1
echo $PS1 > /home/ubuntu/test2.txt
pip install ray
echo 'INSTALLED ray'
echo 'INSTALLED ray' > /home/ubuntu/test3.txt
"""

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script='',
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type,
                               user_data=user_data)

    job = run.make_job('gpubox', instance_type=args.instance)

    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    job.run('source activate mxnet_p36')
    # as of Jan 26, official version gives incompatible numpy error, so pin to nightly
    # job.run('pip install tensorflow-gpu')
    #  job.run('pip install -U https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.6.0.dev20180126-cp36-cp36m-manylinux1_x86_64.whl')
    job.run(
        'pip install -U http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl'
    )

    # Re-run this same script on the instance, in worker role.
    job.upload(__file__)
    job.run('killall python || echo failed')  # kill previous run
    job.run_async('python %s --role=worker' % (os.path.basename(__file__)))
Ejemplo n.º 11
0
def launcher():
    """Provision a single 'gpubox' GPU instance and relaunch this script on
    it in worker mode.

    Relies on module-level globals: `args`, INSTALL_SCRIPT_UBUNTU/AMAZON and
    ami_dict_ubuntu/ami_dict_amazon (region -> AMI id mappings).
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    # Make sure shared AWS resources exist before launching anything.
    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    # Select install script and AMI table for the requested distro.
    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print(
            "Warning, using provided AMI, make sure that --linux-type argument "
            "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('gpubox', instance_type=args.instance)

    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    job.run('source activate mxnet_p36')
    # as of Jan 26, official version gives incompatible numpy error, so pin to nightly
    # job.run('pip install tensorflow-gpu')
    #  job.run('pip install -U https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.6.0.dev20180126-cp36-cp36m-manylinux1_x86_64.whl')
    #  job.run('pip install --default-timeout=100 -U http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl')

    job.upload(__file__)
    job.run('killall python || echo failed')  # kill previous run
    # NOTE(review): uploads __file__ but runs a hard-coded 'launch.py' — these
    # match only if this script is actually named launch.py; confirm.
    job.run_async('python launch.py --role=worker')
Ejemplo n.º 12
0
def launcher():
    """Provision a 'worker' instance, install ray/tcmalloc prerequisites and
    run tf_numpy_benchmark.py on it.

    Relies on module-level globals: `args`, INSTALL_SCRIPT_UBUNTU/AMAZON and
    ami_dict_ubuntu/ami_dict_amazon (region -> AMI id mappings).
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    # Make sure shared AWS resources exist before launching anything.
    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    # Select install script and AMI table for the requested distro.
    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print(
            "Warning, using provided AMI, make sure that --linux-type argument "
            "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('worker', instance_type=args.instance_type)

    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    job.run('source activate tensorflow_p36')
    job.run('pip install cython')
    job.run('pip install ray')
    # below can fail on
    # E: Could not get lock /var/lib/dpkg/lock - open (11: Resource temporarily unavailable)
    job.run('sudo apt install htop')

    job.run('yes | sudo apt-get install google-perftools')
    # NOTE(review): whether this export persists into the later python run
    # depends on job.run reusing a single shell session — confirm in
    # aws_backend before relying on tcmalloc being preloaded.
    job.run('export LD_PRELOAD="/usr/lib/libtcmalloc.so.4"')

    job.upload(__file__)
    job.upload('tf_numpy_benchmark.py')
    job.run('killall python || echo failed')  # kill previous run
    job.run('python tf_numpy_benchmark.py')