import os
import time

import aws       # project-local module that launches jobs on AWS
import toby_aws  # project-local module for looking up AWS instances


def test_stream_output():
    name = "testjob"
    job = aws.tf_job(name, 4)
    job.wait_until_ready()
    task = job.tasks[0]
    task.run('cd Dropbox && ls')
    time.sleep(0.5)  # async ... todo: expose thread and join instead of sleep?
    os.system('cat ' + task.last_stdout)
def test_new_job():
    name = "testjob"
    instances = toby_aws.LookupAwsInstances(instance_tag=name)
    assert not instances, "Instances already exist, kill them first"
    job = aws.tf_job(name, 2)
    instances = toby_aws.LookupAwsInstances(instance_tag=name)
    assert len(instances) == 2
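# LookupAwsInstances comes from the project-local toby_aws module and is not
# defined in this file. A minimal sketch of what it might do with boto3
# (an assumption, not the actual implementation):
def LookupAwsInstances(instance_tag):
    """Returns running EC2 instances whose Name tag matches instance_tag."""
    import boto3
    ec2 = boto3.resource('ec2')
    return list(ec2.instances.filter(
        Filters=[{'Name': 'tag:Name', 'Values': [instance_tag]},
                 {'Name': 'instance-state-name', 'Values': ['running']}]))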
def launch_aws():
    ps_job = aws.tf_job(FLAGS.run + '-ps', FLAGS.num_ps)
    worker_job = aws.tf_job(FLAGS.run + '-worker', FLAGS.num_workers,
                            placement_group='tf')
    tb_job = aws.tf_job(FLAGS.run + '-tb', 1, placement_group='tf')

    # wait for everything to come up
    # todo: private IPs may be known before instances are ready
    ps_job.wait_until_ready()
    worker_job.wait_until_ready()

    # TODO: orchestration may be easier if I save the server spec to a
    # predictable location on AWS rather than passing it to each worker
    # through the command line.
    # Orchestration: every worker needs to know:
    # 1. its own role (task_spec), i.e. {type: worker, index: 0}
    # 2. the role->ip mapping of all machines (cluster_spec), i.e.
    #    {"worker": ["localhost:24724"], "ps": ["localhost:15960"]}
    ps_hosts = ["%s:%d" % (task.ip, task.port) for task in ps_job.tasks]
    worker_hosts = ["%s:%d" % (task.ip, task.port) for task in worker_job.tasks]
    cluster_spec = {'worker': worker_hosts, 'ps': ps_hosts}

    # launch parameter server tasks
    task_type = 'ps'
    for task in ps_job.tasks:
        task_spec = {'type': task_type, 'index': task.id}
        task.run(generate_tf_env_setup_cmd(cluster_spec, task_spec))
        task.run(PS_CMD)

    # launch worker tasks
    task_type = 'worker'  # task type can also be "chief", overlapping with worker
    for task in worker_job.tasks:
        task_spec = {'type': task_type, 'index': task.id}
        task.run(generate_tf_env_setup_cmd(cluster_spec, task_spec))
        task.run(WORKER_CMD)

    # launch tensorboard visualizer
    task = tb_job.tasks[0]
    task.run('tensorboard --port=%d --logdir=%s' % (task.port, logdir))
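# generate_tf_env_setup_cmd is used above but not defined in this file. A
# minimal sketch, assuming the standard TF_CONFIG convention that distributed
# TensorFlow reads from the environment (the real helper may differ):
def generate_tf_env_setup_cmd(cluster_spec, task_spec):
    """Returns a shell command that exports TF_CONFIG for one task."""
    import json
    tf_config = json.dumps({'cluster': cluster_spec, 'task': task_spec})
    return "export TF_CONFIG='%s'" % tf_config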
def test_send_file():
    name = "testjob"
    job = aws.tf_job(name, 4)
    job.wait_until_ready()
    task0 = job.tasks[0]
    secret_word = "testfile3"
    os.system("echo '%s' > upload_test.txt" % (secret_word,))
    task0.upload('upload_test.txt')
    stdout, stderr = task0.run_sync("cat upload_test.txt")
    print(stdout)  # => testfile3
    assert stdout.strip() == secret_word
def launcher(do_local=False):
    if FLAGS.cluster == 'local':
        import tmux
        job = tmux.tf_job('myjob', 1)
    elif FLAGS.cluster == 'aws':
        import aws
        job = aws.tf_job('myjob', 1)
    else:
        assert False, "Unknown cluster " + FLAGS.cluster

    task = job.tasks[0]
    task.upload(__file__)  # copies current script onto machine
    setup_cmd = ("source ~/.bashrc && export PATH=~/anaconda3/bin:$PATH && "
                 "source activate tf")
    task.run("%s && python %s --role=worker" % (setup_cmd, __file__))
    print("To see the output: tail -f %s" % (task.last_stdout,))
    print("To interact with the task, do " + task.connect_instructions)
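# The launcher uploads this very script and reruns it with --role=worker, so
# the entry point has to dispatch on that flag. A hypothetical sketch
# (FLAGS.role mirrors the --role=worker argument above; worker() is assumed):
if __name__ == '__main__':
    if FLAGS.role == 'worker':
        worker()  # hypothetical body executed on the remote machine
    else:
        launcher()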
def cnn_launcher():
    """Experiment launcher."""
    import boto3
    ec2 = boto3.client('ec2')

    if not FLAGS.disable_placement:
        placement_group = FLAGS.run
        try:
            response = ec2.create_placement_group(GroupName=placement_group,
                                                  Strategy='cluster')
        except Exception as e:
            if 'Duplicate' in e.response['Error']['Code']:
                print("Warning, placement group %s already exists, skipping"
                      % (placement_group,))
            print("Got message " + str(e))
    else:
        placement_group = ''

    ps_job = aws.tf_job(FLAGS.run + '/ps', FLAGS.num_ps,
                        instance_type=FLAGS.ps_type,
                        placement_group=placement_group)
    worker_job = aws.tf_job(FLAGS.run + '/worker', FLAGS.num_workers,
                            instance_type=FLAGS.worker_type,
                            placement_group=placement_group)
    ps_job.wait_until_ready()
    worker_job.wait_until_ready()

    # Orchestration: every worker needs to know:
    # 1. its own role (task_spec), i.e. {type: worker, index: 0}
    # 2. the role->ip mapping of all machines (cluster_spec), i.e.
    #    {"worker": ["localhost:24724"], "ps": ["localhost:15960"]}
    ps_hosts = ["%s:%d" % (task.ip, task.port) for task in ps_job.tasks]
    ps_hosts_str = ','.join(ps_hosts)
    worker_hosts = ["%s:%d" % (task.ip, task.port) for task in worker_job.tasks]
    worker_hosts_str = ','.join(worker_hosts)
    cluster_spec = {'worker': worker_hosts, 'ps': ps_hosts}

    setup_cmd = ("source ~/.bashrc && export PATH=~/anaconda3/bin:$PATH && "
                 "source activate py2 && "
                 "cd ~/git0/benchmarks/scripts/tf_cnn_benchmarks")

    # kill previously running processes in case we are reusing instances
    for task in ps_job.tasks:
        task.run("killall python")
    for task in worker_job.tasks:
        task.run("killall python")
    time.sleep(5)

    # launch parameter server tasks
    task_type = 'ps'
    ps_cmd_tmpl = ("CUDA_VISIBLE_DEVICES='' python tf_cnn_benchmarks.py "
                   "--local_parameter_device=gpu "
                   "--worker_hosts=%(worker_hosts)s --ps_hosts=%(ps_hosts)s "
                   "--job_name=ps --task_index=%(task_index)s")
    for task in ps_job.tasks:
        cmds = []
        task_spec = {'type': task_type, 'index': task.id}
        cmds.append(setup_cmd)
        cmds.append(tf_config_cmd(cluster_spec, task_spec))
        task.upload("variable_mgr.py",
                    "/home/ubuntu/Dropbox/git0/benchmarks/scripts/"
                    "tf_cnn_benchmarks/variable_mgr.py")
        cmds.append(ps_cmd_tmpl % {"worker_hosts": worker_hosts_str,
                                   "ps_hosts": ps_hosts_str,
                                   "job_name": task_type,
                                   "task_index": task.id})
        task.run(' && '.join(cmds))
        print("To see the output: tail -f %s" % (task.last_stdout,))

    # launch worker tasks
    task_type = 'worker'
    worker_cmd_tmpl = ("python tf_cnn_benchmarks.py --data_format=NCHW "
                       "--batch_size=64 --num_batches=1000 --model=resnet50 "
                       "--optimizer=sgd "
                       "--variable_update=distributed_replicated "
                       "--cross_replica_sync=True --local_parameter_device=gpu "
                       "--num_gpus=8 --nodistortions --display_every=10 "
                       "--worker_hosts=%(worker_hosts)s "
                       "--ps_hosts=%(ps_hosts)s "
                       "--job_name=worker --task_index=%(task_index)s")
    for task in worker_job.tasks:
        cmds = []
        task_spec = {'type': task_type, 'index': task.id}
        cmds.append(setup_cmd)
        cmds.append(tf_config_cmd(cluster_spec, task_spec))
        task.upload("variable_mgr.py",
                    "/home/ubuntu/Dropbox/git0/benchmarks/scripts/"
                    "tf_cnn_benchmarks/variable_mgr.py")
        cmds.append(worker_cmd_tmpl % {"worker_hosts": worker_hosts_str,
                                       "ps_hosts": ps_hosts_str,
                                       "job_name": task_type,
                                       "task_index": task.id})
        task.run(' && '.join(cmds))
        print("To see the output of %s: tail -f %s"
              % (task.id, task.last_stdout))
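# tf_config_cmd is likewise not defined in this file; under the same TF_CONFIG
# assumption it can simply delegate to the sketch given after launch_aws above:
def tf_config_cmd(cluster_spec, task_spec):
    return generate_tf_env_setup_cmd(cluster_spec, task_spec)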
def test_reuse_job():
    name = "testjob"
    # Launching under a name that already has running instances should reuse
    # them rather than create new ones (the test body is incomplete in the
    # original; the check below mirrors test_new_job).
    job = aws.tf_job(name, 2)
    instances = toby_aws.LookupAwsInstances(instance_tag=name)
    assert len(instances) == 2