def run_launcher():
    """Launch the sender/receiver benchmark on a two-task ncluster job.

    Settings come from the module-level ``args`` (``aws``, ``name``,
    ``image``, ``iters``, ``size_mb``, ``shards``). The function uploads this
    script and ``util.py`` to the job, starts the receiver role without
    waiting for it, runs the sender role, and finally prints whatever
    ``sender.read('out')`` returns.
    """
    import os

    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
    job.upload(__file__)
    job.upload('util.py')

    sender, receiver = job.tasks

    # kill python just for when tmux session reuse is on
    if not ncluster.running_locally():
        sender._run_raw('killall python', ignore_errors=True)
        receiver._run_raw('killall python', ignore_errors=True)

    if ncluster.get_backend() == 'aws':
        # on AWS probably running in conda DLAMI, switch into TF-enabled env
        job.run('source activate tensorflow_p36')

    # Address the script by its bare name on the tasks. __file__ is the path
    # on the launching machine (an absolute path for a script run as __main__
    # on Python 3.9+), which need not exist on the task machines.
    # NOTE(review): assumes job.upload() stores the file under its base name
    # in the directory where task commands run -- confirm against ncluster.
    script = os.path.basename(__file__)

    ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
    receiver.run(f'python {script} --role=receiver {ip_config}',
                 non_blocking=True)
    sender.run(
        f'python {script} --role=sender {ip_config} --iters={args.iters} '
        f'--size-mb={args.size_mb} --shards={args.shards}')
    print(sender.read('out'))
def run_launcher():
    """Launch the ``tf_adder`` sender/receiver pair on a two-task ncluster job.

    Uploads this script to the job, starts the receiver role without waiting
    for it, then runs the sender role.
    """
    import ncluster

    job = ncluster.make_job('tf_adder', num_tasks=2)
    job.upload(__file__)

    sender, receiver = job.tasks

    if ncluster.get_backend() == 'aws':
        # on AWS probably are running in DLAMI, switch into TF-enabled env
        job.run('source activate tensorflow_p36')

    ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'

    # `async` is a reserved keyword from Python 3.7 on, so passing
    # `async=True` no longer parses; `non_blocking=True` is the keyword the
    # other launchers use for a background run.
    receiver.run(f'python tf_adder.py --role=receiver {ip_config}',
                 non_blocking=True)
    sender.run(f'python tf_adder.py --role=sender {ip_config}')
def run_launcher():
    """Run the ``tf_adder_tb`` benchmark on two tasks, then start tensorboard.

    The receiver role is started without waiting, the sender role is run with
    ``--logdir`` pointing at the job's log directory, and tensorboard is then
    started on the sender task over the parent of that directory. The
    tensorboard URL is printed at the end.
    """
    import ncluster

    script_name = os.path.basename(__file__)

    job = ncluster.make_job('tf_adder_tb', num_tasks=2)
    job.upload(__file__)
    sender, receiver = job.tasks

    on_aws = ncluster.get_backend() == 'aws'
    if on_aws:
        # presumably a DLAMI image on AWS: activate the TF-enabled conda env
        job.run('source activate tensorflow_p36')

    endpoints = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'

    # The receiver runs in the background; the sender run is waited on.
    receiver.run(
        f'python {script_name} --role=receiver {endpoints}',
        non_blocking=True,
    )
    sender.run(
        f'python {script_name} --role=sender --logdir={job.logdir} {endpoints}'
    )

    # Tensorboard is pointed at the parent of the job's log directory.
    sender.run(f'tensorboard --logdir={job.logdir}/..', non_blocking=True)
    print(f"Benchmark done, tensorboard at http://{sender.public_ip}:6006")
def run_launcher():
    """Launch the ``tf_adder`` sender/receiver pair on a two-task ncluster job.

    Reads ``aws``, ``image`` and ``iters`` from the module-level ``args``.
    The receiver role is started without waiting for it; the sender role is
    then run with the requested iteration count.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job('tf_adder', num_tasks=2, image_name=args.image)
    job.upload(__file__)
    sender, receiver = job.tasks

    on_aws = ncluster.get_backend() == 'aws'
    if on_aws:
        # presumably a conda DLAMI image on AWS: activate the TF-enabled env
        job.run('source activate tensorflow_p36')

    endpoints = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
    receiver_cmd = f'python tf_adder.py --role=receiver {endpoints}'
    sender_cmd = (
        f'python tf_adder.py --role=sender {endpoints} --iters={args.iters}'
    )

    # Receiver goes to the background so the sender can be started next.
    receiver.run(receiver_cmd, non_blocking=True)
    sender.run(sender_cmd)
def run_launcher():
    """Start two MPI workers across the job's tasks and print the output.

    Reads ``aws``, ``name`` and ``image`` from the module-level ``args``.
    After uploading this script and ``util.py``, it invokes ``mpirun`` from
    the launching machine against the tasks' public IPs, then prints what the
    first task returns for ``read('/tmp/out')``.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
    for local_file in (__file__, 'util.py'):
        job.upload(local_file)

    # With tmux session reuse a previous python may still be running; kill it.
    running_remotely = not ncluster.running_locally()
    if running_remotely:
        job._run_raw('killall python', ignore_errors=True)

    if ncluster.get_backend() == 'aws':
        # presumably a conda DLAMI image on AWS: activate the TF-enabled env
        job.run('source activate tensorflow_p36')

    # Comma-separated host list in the form mpirun's --host option takes.
    host_str = ','.join(task.public_ip for task in job.tasks)
    mpi_command = (
        f'mpirun -np 2 --host {host_str} python {__file__} --role=worker'
    )
    os.system(mpi_command)

    first_task = job.tasks[0]
    print(first_task.read('/tmp/out'))
def run_launcher():
    """Start two MPI workers across the job's tasks and print the output.

    Reads ``name``, ``image`` and ``instance_type`` from the module-level
    ``args``. After uploading this script and ``util.py``, it invokes
    ``/usr/local/mpi/bin/mpirun`` from the launching machine against the
    tasks' ``ip`` addresses, then prints what the first task returns for
    ``read('/tmp/out')``.
    """
    import ncluster

    job = ncluster.make_job(
        args.name,
        num_tasks=2,
        image_name=args.image,
        instance_type=args.instance_type,
    )
    for local_file in (__file__, 'util.py'):
        job.upload(local_file)

    # With tmux session reuse a previous python may still be running; kill it.
    running_remotely = not ncluster.running_locally()
    if running_remotely:
        job._run_raw('killall python', ignore_errors=True)

    if ncluster.get_backend() == 'aws':
        # presumably a conda DLAMI image on AWS: activate the TF-enabled env
        # TODO(y): switch to a PyTorch-enabled env
        job.run('source activate tensorflow_p36')

    # TODO(y): this should be private ip
    host_str = ','.join(task.ip for task in job.tasks)
    mpi_command = (
        f'/usr/local/mpi/bin/mpirun -np 2 --host {host_str} '
        f'python {__file__} --role=worker'
    )
    os.system(mpi_command)

    first_task = job.tasks[0]
    print(first_task.read('/tmp/out'))