Example #1
def run_launcher():
  import ncluster
  ncluster.util.assert_script_in_current_directory()
  
  if args.aws:
    ncluster.set_backend('aws')

  # use 4GB instance, 0.5GB not enough
  worker = ncluster.make_task(args.name, image_name=args.image,
                              instance_type='t3.medium')
  worker.upload(__file__)
  worker.upload('util.py')

  # kill python just for when tmux session reuse is on
  if not ncluster.running_locally():
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    worker._run_raw('killall python', ignore_errors=True)
    worker.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={worker.ip} --receiver-ip={worker.ip}'
  worker.run(f'python {__file__} --role=receiver {ip_config}',
               non_blocking=True)
  worker.switch_window(1)  # run in new tmux window
  if not ncluster.running_locally():
    worker.run('source activate tensorflow_p36')
  worker.run(
    f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}')
  print(worker.read('out'))
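All of these launchers assume a module-level `args` object parsed with argparse and a role dispatch at the bottom of the script. A minimal sketch of that scaffolding, with flag names taken from the calls above; the defaults and the run_sender/run_receiver entry-point names are assumptions, not from the source.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--role', type=str, default='launcher')
parser.add_argument('--name', type=str, default='test')
parser.add_argument('--image', type=str, default='')
parser.add_argument('--aws', action='store_true')
parser.add_argument('--iters', type=int, default=10)
parser.add_argument('--size-mb', type=int, default=100)
parser.add_argument('--shards', type=int, default=1)
parser.add_argument('--sender-ip', type=str, default='127.0.0.1')
parser.add_argument('--receiver-ip', type=str, default='127.0.0.1')
args = parser.parse_args()

if __name__ == '__main__':
    if args.role == 'launcher':
        run_launcher()
    elif args.role == 'sender':
        run_sender()      # assumed worker-side entry point
    elif args.role == 'receiver':
        run_receiver()    # assumed worker-side entry point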
Example #2
def run_launcher():
    import ncluster

    if args.nightly:
        install_script = 'pip install --no-cache-dir -U ray --find-links ' \
                         'https://s3-us-west-2.amazonaws.com/ray-wheels/latest/'
    else:
        install_script = 'pip install -U ray'

    if args.local:
        ncluster.set_backend('local')

    job = ncluster.make_job(**vars(args))
    job.run(install_script)

    ps, worker = job.tasks
    if not ncluster.running_locally():
        ps.run('killall python || echo no python found')
        worker.run('killall python || echo no python found')
        job.run('ray stop')

    job.upload(__file__)
    job.upload('util.py')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""

    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
    worker.run(f'python {__file__} --role=driver --ip={ps.ip}:6379 '
               f'--hidden_size={args.hidden_size} --num_layers={args.num_layers} '
               f'--iters={args.iters}')
    print(worker.read('out'))
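The --role=driver branch that this launcher invokes is not shown. A hedged sketch of what it could look like, assuming the Ray 0.5-era API that the launcher's flags imply (redis_address, custom "ps"/"worker" resources); the function and task names here are illustrative, not from the source.

import ray

def run_driver():
    # connect to the head node started above; args.ip is e.g. "10.0.0.5:6379"
    ray.init(redis_address=args.ip)

    @ray.remote(resources={'worker': 1})
    def ping():
        # scheduled only on the node registered with --resources='{"worker": 1}'
        return 'pong'

    print(ray.get(ping.remote()))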
Example #3
def run_launcher():
  import ncluster
  if args.aws:
    ncluster.set_backend('aws')

  job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')

  sender, receiver = job.tasks
  # kill python just for when tmux session reuse is on
  if not ncluster.running_locally():
    sender._run_raw('killall python', ignore_errors=True)
    receiver._run_raw('killall python', ignore_errors=True)

  if ncluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver.run(f'python {__file__} --role=receiver {ip_config}',
               non_blocking=True)
  sender.run(
    f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}')
  print(sender.read('out'))
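Each launcher collects results with task.read('out'), so the sender/driver side is expected to write its report to a file named 'out' in its working directory. A minimal sketch of that convention; the helper name is assumed.

def write_result(text, fn='out'):
    # written next to the running script so the launcher's task.read('out') finds it
    with open(fn, 'w') as f:
        f.write(text)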
Example #4
def run_launcher():
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    if args.nightly:
        # running locally on macOS
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    worker = ncluster.make_task(name=args.name,
                                install_script=install_script,
                                image_name=args.image)
    if not ncluster.running_locally():
        worker._run_raw('killall python', ignore_errors=True)
    worker.upload(__file__)
    worker.upload('util.py')
    if args.xray:
        worker.run('export RAY_USE_XRAY=1')
    worker.run('ray stop')

    resources = """--resources='{"ps": 1, "worker": 1}'"""
    worker.run(f"ray start --head {resources} --redis-port=6379")
    #  worker.run(f"ray start --redis-address={worker.ip}:6379 {resources}")
    worker.run(
        f'./{__file__} --role=driver --ip={worker.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}'
    )
    print(worker.read('out'))
Example #5
def run_launcher():
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    if args.nightly:
        # running locally on macOS
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    job = ncluster.make_job(name=args.name,
                            install_script=install_script,
                            image_name=args.image,
                            num_tasks=args.num_workers + args.num_ps)
    if not ncluster.running_locally():
        job._run_raw('killall python', ignore_errors=True)

    job.upload(__file__)
    job.upload('util.py')
    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    head = job.tasks[0]

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    worker_resource = """--resources='{"worker": 1}'"""
    head.run(f"ray start --head {worker_resource} --redis-port=6379")

    for task in job.tasks[1:]:
        task.run(f"ray start --redis-address={head.ip}:6379 {worker_resource}")

    head.run(
        f'python {__file__} --role=driver --ip={head.ip}:6379 --size-mb={args.size_mb} --iters={args.iters} --num-workers={args.num_workers} --num-ps={args.num_ps}'
    )

    print(head.read('out'))
Example #6
def run_launcher():
    import ncluster
    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
    job.upload(__file__)
    job.upload('util.py')

    # kill python just for when tmux session reuse is on
    if not ncluster.running_locally():
        job._run_raw('killall python', ignore_errors=True)

    if ncluster.get_backend() == 'aws':
        # on AWS probably running in conda DLAMI, switch into TF-enabled env
        job.run('source activate tensorflow_p36')

    hosts = [task.public_ip for task in job.tasks]
    host_str = ','.join(hosts)
    os.system(
        f'mpirun -np 2 --host {host_str} python {__file__} --role=worker')
    print(job.tasks[0].read('/tmp/out'))
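The --role=worker processes launched by mpirun are not shown. A minimal sketch of one possible worker, assuming mpi4py (the original may use a different MPI binding); each rank appends to /tmp/out, which the launcher then reads.

from mpi4py import MPI

def run_worker():
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    # one line per rank; the launcher prints the file from task 0
    with open('/tmp/out', 'a') as f:
        f.write(f'hello from rank {rank} of {comm.Get_size()}\n')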
Example #7
def run_launcher():
  import ncluster

  job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image,
                          instance_type=args.instance_type)
  job.upload(__file__)
  job.upload('util.py')

  # kill python just for when tmux session reuse is on
  if not ncluster.running_locally():
    job._run_raw('killall python', ignore_errors=True)

  if ncluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    # TODO(y) switch to PyTorch enabled
    job.run('source activate tensorflow_p36')

  # TODO(y): this should be private ip
  hosts = [task.ip for task in job.tasks]
  host_str = ','.join(hosts)
  os.system(f'/usr/local/mpi/bin/mpirun -np 2 --host {host_str} python {__file__} --role=worker')
  print(job.tasks[0].read('/tmp/out'))