コード例 #1
0
def run_launcher():
  """Launch a two-task sender/receiver job and print the sender's output.

  Reads the module-level ``args`` namespace (``aws``, ``name``, ``image``,
  ``iters``, ``size_mb`` and ``shards``). Uploads this script and ``util.py``
  through the job, starts the receiver role without blocking, runs the sender
  role, and finally prints whatever ``sender.read('out')`` returns.
  """
  import os

  import ncluster
  if args.aws:
    ncluster.set_backend('aws')

  job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')

  # __file__ may carry a local directory prefix (for the __main__ module it is
  # an absolute path on Python 3.9+), which does not exist on the task
  # machines. Refer to the uploaded copy by its bare file name instead.
  # NOTE(review): assumes upload() stores the file under its base name in the
  # directory where run() executes commands -- confirm against ncluster.
  script = os.path.basename(__file__)

  sender, receiver = job.tasks
  # kill python just for when tmux session reuse is on
  if not ncluster.running_locally():
    for task in (sender, receiver):
      task._run_raw('killall python', ignore_errors=True)

  if ncluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  # The receiver is started without blocking so that the sender can be
  # launched while the receiver is still running.
  receiver.run(f'python {script} --role=receiver {ip_config}',
               non_blocking=True)
  sender.run(
    f'python {script} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}')
  print(sender.read('out'))
コード例 #2
0
ファイル: tf_adder.py プロジェクト: cclauss/ncluster
def run_launcher():
  """Launch the two-task ``tf_adder`` job: one receiver and one sender.

  Creates a two-task job, uploads this script, and runs ``tf_adder.py`` in the
  receiver role on the second task (without blocking) and in the sender role
  on the first task. Both roles receive the IPs of the two tasks.
  """
  import ncluster

  job = ncluster.make_job('tf_adder', num_tasks=2)
  job.upload(__file__)
  sender, receiver = job.tasks
  if ncluster.get_backend() == 'aws':
    # on AWS probably are running in DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  # `async` is a reserved keyword from Python 3.7 on, so passing `async=True`
  # is a SyntaxError there; `non_blocking=True` is the keyword used for the
  # same purpose by the other launchers in this file.
  receiver.run(f'python tf_adder.py --role=receiver {ip_config}',
               non_blocking=True)
  sender.run(f'python tf_adder.py --role=sender {ip_config}')
コード例 #3
0
ファイル: tf_adder_tb.py プロジェクト: kmcgrath/ncluster
def run_launcher():
  """Run the ``tf_adder_tb`` sender/receiver pair and start TensorBoard.

  Uploads this script, starts the receiver role on the second task without
  blocking, runs the sender role on the first task with ``--logdir`` set to
  the job's logdir, then starts ``tensorboard`` on the first task (also
  without blocking) and prints the URL built from that task's public IP.
  """
  import ncluster

  job = ncluster.make_job('tf_adder_tb', num_tasks=2)
  job.upload(__file__)
  # Remote commands refer to the script by its bare file name.
  script_name = os.path.basename(__file__)

  sender, receiver = job.tasks
  on_aws = ncluster.get_backend() == 'aws'
  if on_aws:
    # on AWS we are probably running in a DLAMI; switch into the TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'

  receiver_cmd = f'python {script_name} --role=receiver {ip_config}'
  receiver.run(receiver_cmd, non_blocking=True)

  # job.logdir is read only after the receiver has been started, matching the
  # order in which the commands are issued.
  sender_cmd = f'python {script_name} --role=sender --logdir={job.logdir} {ip_config}'
  sender.run(sender_cmd)

  tensorboard_cmd = f'tensorboard --logdir={job.logdir}/..'
  sender.run(tensorboard_cmd, non_blocking=True)

  print(f"Benchmark done, tensorboard at http://{sender.public_ip}:6006")
コード例 #4
0
ファイル: tf_adder.py プロジェクト: timotheecour/ncluster
def run_launcher():
  """Launch the two-task ``tf_adder`` job configured from ``args``.

  Reads the module-level ``args`` namespace (``aws``, ``image``, ``iters``).
  Uploads this script, starts ``tf_adder.py`` in the receiver role without
  blocking, then runs it in the sender role with ``--iters``.
  """
  import ncluster
  if args.aws:
    ncluster.set_backend('aws')

  job = ncluster.make_job('tf_adder', num_tasks=2, image_name=args.image)
  job.upload(__file__)

  sender, receiver = job.tasks
  running_on_aws = ncluster.get_backend() == 'aws'
  if running_on_aws:
    # on AWS we are probably in a conda DLAMI; switch into the TF-enabled env
    job.run('source activate tensorflow_p36')

  # Both roles get the same pair of addresses.
  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'

  receiver_cmd = f'python tf_adder.py --role=receiver {ip_config}'
  receiver.run(receiver_cmd, non_blocking=True)

  sender_cmd = f'python tf_adder.py --role=sender {ip_config} --iters={args.iters}'
  sender.run(sender_cmd)
コード例 #5
0
ファイル: mpi_two_machines.py プロジェクト: diux-dev/ncluster
def run_launcher():
    """Start a two-task job and drive it with ``mpirun`` from this machine.

    Reads the module-level ``args`` namespace (``aws``, ``name``, ``image``).
    Uploads this script and ``util.py``, then invokes ``mpirun`` through
    ``os.system`` with the tasks' public IPs as the host list, and prints what
    the first task's ``read('/tmp/out')`` returns.
    """
    import ncluster
    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
    for local_file in (__file__, 'util.py'):
        job.upload(local_file)

    # kill python just for when tmux session reuse is on
    running_remotely = not ncluster.running_locally()
    if running_remotely:
        job._run_raw('killall python', ignore_errors=True)

    if ncluster.get_backend() == 'aws':
        # on AWS we are probably in a conda DLAMI; switch into the TF-enabled
        # env
        job.run('source activate tensorflow_p36')

    # Comma-separated host list in task order, as expected by --host.
    host_str = ','.join(task.public_ip for task in job.tasks)
    mpi_cmd = f'mpirun -np 2 --host {host_str} python {__file__} --role=worker'
    os.system(mpi_cmd)

    first_task = job.tasks[0]
    print(first_task.read('/tmp/out'))
コード例 #6
0
def run_launcher():
  """Start a two-task job and drive it with ``/usr/local/mpi/bin/mpirun``.

  Reads the module-level ``args`` namespace (``name``, ``image``,
  ``instance_type``). Uploads this script and ``util.py``, then invokes
  ``mpirun`` through ``os.system`` with the tasks' ``ip`` values as the host
  list, and prints what the first task's ``read('/tmp/out')`` returns.
  """
  import ncluster

  job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image,
                          instance_type=args.instance_type)
  for local_file in (__file__, 'util.py'):
    job.upload(local_file)

  # kill python just for when tmux session reuse is on
  running_remotely = not ncluster.running_locally()
  if running_remotely:
    job._run_raw('killall python', ignore_errors=True)

  if ncluster.get_backend() == 'aws':
    # on AWS we are probably in a conda DLAMI; switch into the TF-enabled env
    # TODO(y) switch to PyTorch enabled
    job.run('source activate tensorflow_p36')

  # TODO(y): this should be private ip
  host_str = ','.join(task.ip for task in job.tasks)
  mpi_cmd = f'/usr/local/mpi/bin/mpirun -np 2 --host {host_str} python {__file__} --role=worker'
  os.system(mpi_cmd)

  first_task = job.tasks[0]
  print(first_task.read('/tmp/out'))