def test_new_job():
    name = "testjob"
    instances = toby_aws.LookupAwsInstances(instance_tag=name)
    assert not instances, "Instances already exist, kill them first"
    job = aws.tf_job(name, 2)
    instances = toby_aws.LookupAwsInstances(instance_tag=name)
    assert len(instances) == 2

def main():
    FIRST_TIME = False
    if FIRST_TIME:
        with timeit('create_instances'):
            instances = cluster_aws.CreateAwsInstances(num_instances=2,
                                                       image_id=AMI,
                                                       key_name=KEY_NAME,
                                                       ssh_key=KEY_FILE,
                                                       security_group=SECURITY_GROUP,
                                                       instance_tag=TAG,
                                                       placement_group='',
                                                       instance_type=INSTANCE_TYPE)
    else:
        instances = cluster_aws.LookupAwsInstances(instance_tag=TAG,
                                                   ssh_key=KEY_FILE)

    # Connecting to the host over ssh can raise an exception here (often a timeout):
    with timeit('connect'):
        instance = instances[0]
        instance.WaitUntilReady()

    def line_extractor(line):  # accept every line (currently unused)
        return True

    instance.ExecuteCommandAndStreamOutput('mkdir 43', stdout_file='/tmp/output')
    instance.ExecuteCommandAndStreamOutput('ls', stdout_file='/tmp/output')
    import pdb; pdb.set_trace()  # drop into a debugger to poke at the instance
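
# `timeit` is used above as a context manager but isn't defined in this file.
# A minimal sketch of what it presumably does (hypothetical helper, assuming it
# simply reports elapsed wall-clock time for the labeled block):
import time
from contextlib import contextmanager

@contextmanager
def timeit(tag):
    """Print how long the wrapped block took, labeled with `tag`."""
    start = time.time()
    yield
    print('%s took %.2f seconds' % (tag, time.time() - start))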

def main():
    localdir = args.localdir_prefix + '/' + args.name
    logdir = args.logdir_prefix + '/' + args.name
    os.system('rm -Rf ' + localdir)
    os.system('mkdir -p ' + localdir)

    # TODO: automatically decide whether to launch or connect to existing
    # TODO: implement killing
    if args.launch:
        print("Creating new instances")
        tags = {'iam': os.environ['USER']}
        with timeit('create_instances'):
            instances = cluster_aws.CreateAwsInstances(
                num_instances=1,
                image_id=args.ami,
                key_name=args.key_name,
                ssh_key=args.key_path,
                security_group=args.security_group,
                instance_tag=args.name,
                placement_group='',
                instance_type=args.instance_type,
                tags=tags)
    else:
        print("Reusing existing instances")
        instances = cluster_aws.LookupAwsInstances(instance_tag=args.name,
                                                   ssh_key=args.key_path)
        assert len(instances) == 1, "%d instances found" % (len(instances),)

    with timeit('connect'):
        for i, instance in enumerate(instances):
            print("Connecting to instance %d, %s" % (i, instance.instance_id))
            instance.WaitUntilReady()

    instance = instances[0]

    # TODO: mount at /efs instead of ~/efs
    setup_cmd = """
sudo apt-get install nfs-common -y
EFS_ID=fs-ab2b8102
EFS_REGION=us-west-2
sudo mkdir -p /efs
sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $EFS_ID.efs.$EFS_REGION.amazonaws.com:/ /efs"""
    # Chain the setup lines into a single remote command.
    setup_cmds = setup_cmd.strip().split('\n')
    cmd = ' && '.join(setup_cmds)

    i = 0
    fn_out = localdir + '/out-%02d' % (i,)
    fn_err = localdir + '/err-%02d' % (i,)
    print(cmd)

    def p(line):
        print(line)

    instance.ExecuteCommandAndStreamOutput(cmd, fn_out, fn_err, p)
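
# main() above reads flags from a module-level `args`; the parser isn't shown
# in this section. A minimal argparse sketch covering the flags it references
# (defaults here are illustrative placeholders, not the original values):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--name', default='efs-test', help='instance tag / run name')
parser.add_argument('--localdir_prefix', default='/tmp/runs')
parser.add_argument('--logdir_prefix', default='/efs/runs')
parser.add_argument('--launch', action='store_true',
                    help='create new instances instead of reusing tagged ones')
parser.add_argument('--ami', default='ami-00000000', help='placeholder AMI id')
parser.add_argument('--key_name', default='mykey')
parser.add_argument('--key_path', default='~/.ssh/mykey.pem')
parser.add_argument('--security_group', default='default')
parser.add_argument('--instance_type', default='t2.micro')
args = parser.parse_args()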

def main():
    num_instances = args.num_workers + args.num_ps
    os.system('rm -Rf ' + args.logdir)
    os.system('mkdir -p ' + args.logdir)

    # TODO: add these commands (running manually for now)
    # sudo nvidia-persistenced
    # sudo nvidia-smi --auto-boost-default=0
    # sudo nvidia-smi -ac 2505,875  # p2

    if args.launch:
        print("Creating new instances")
        with timeit('create_instances'):
            instances = cluster_aws.CreateAwsInstances(num_instances=num_instances,
                                                       image_id=args.ami,
                                                       key_name=args.key_name,
                                                       ssh_key=args.key_path,
                                                       security_group=args.security_group,
                                                       instance_tag=args.tag,
                                                       placement_group='',
                                                       instance_type=args.instance_type)
    else:
        # TODO: better control of retrieved instances
        print("Reusing existing instances")
        instances = cluster_aws.LookupAwsInstances(instance_tag=args.tag,
                                                   ssh_key=args.key_path)
        assert len(instances) >= num_instances, \
            "%d instances found, need %d" % (len(instances), num_instances)

    # TODO: deterministic worker sort
    with timeit('connect'):
        for i, instance in enumerate(instances):
            if i >= num_instances:
                break
            print("Connecting to instance %d, %s" % (i, instance.instance_id))
            instance.WaitUntilReady()

    worker_instances = instances[:args.num_workers]
    ps_instances = instances[args.num_workers:args.num_workers + args.num_ps]

    # Build the comma-separated host:port lists that tf_cnn_benchmarks expects.
    instance_ip_map = get_instance_ip_map()
    worker_host_fragments = []
    for instance in worker_instances:
        assert instance.instance_id in instance_ip_map
        worker_host_fragments.append(
            '%s:%d' % (instance_ip_map[instance.instance_id], args.port))
    worker_hosts = ','.join(worker_host_fragments)

    ps_host_fragments = []
    for instance in ps_instances:
        assert instance.instance_id in instance_ip_map
        ps_host_fragments.append(
            '%s:%d' % (instance_ip_map[instance.instance_id], args.port))
    ps_hosts = ','.join(ps_host_fragments)

    line_extractor = util.ExtractErrorToConsole
    setup_cmd = ("source ~/.bashrc && export PATH=~/anaconda3/bin:$PATH && "
                 "source activate py2 && cd ~/git0/benchmarks/scripts/tf_cnn_benchmarks")

    worker_cmd_tmpl = ("python tf_cnn_benchmarks.py --data_format=NCHW --batch_size=64 "
                       "--num_batches=1000 --model=resnet50 --optimizer=sgd "
                       "--variable_update=distributed_replicated --cross_replica_sync=True "
                       "--local_parameter_device=gpu --num_gpus=1 --nodistortions "
                       "--display_every=10 --worker_hosts=%(worker_hosts)s "
                       "--ps_hosts=%(ps_hosts)s --job_name=worker --task_index=%(task_index)s")
    for i, instance in enumerate(worker_instances):
        worker_cmd = worker_cmd_tmpl % {'worker_hosts': worker_hosts,
                                        'ps_hosts': ps_hosts,
                                        'task_index': i}
        cmd = setup_cmd + " && " + worker_cmd
        fn_out = args.logdir + '/worker_out-%02d' % (i,)
        fn_err = args.logdir + '/worker_err-%02d' % (i,)
        instance.ExecuteCommandInThread(cmd, stdout_file=fn_out, stderr_file=fn_err,
                                        line_extractor=line_extractor)
        print("worker %d started" % (i,))

    # Parameter servers run on CPU only, so hide the GPUs from them.
    ps_cmd_tmpl = ("CUDA_VISIBLE_DEVICES='' python tf_cnn_benchmarks.py "
                   "--local_parameter_device=gpu --worker_hosts=%(worker_hosts)s "
                   "--ps_hosts=%(ps_hosts)s --job_name=ps --task_index=%(task_index)s")
    for i, instance in enumerate(ps_instances):
        ps_cmd = ps_cmd_tmpl % {'worker_hosts': worker_hosts,
                                'ps_hosts': ps_hosts,
                                'task_index': i}
        cmd = setup_cmd + " && " + ps_cmd
        fn_out = args.logdir + '/ps_out-%02d' % (i,)
        fn_err = args.logdir + '/ps_err-%02d' % (i,)
        instance.ExecuteCommandInThread(cmd, stdout_file=fn_out, stderr_file=fn_err,
                                        line_extractor=line_extractor)
        print("parameter server %d started" % (i,))

    # Keep the main process alive so the command threads can keep streaming output.
    time.sleep(10000)
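
# get_instance_ip_map() is called above but not defined in this section. A
# minimal sketch, assuming it maps instance ids to private IPs via boto3 (the
# real helper may live in cluster_aws or util):
import boto3

def get_instance_ip_map():
    """Return {instance_id: private_ip} for all running EC2 instances."""
    ec2 = boto3.client('ec2')
    response = ec2.describe_instances(
        Filters=[{'Name': 'instance-state-name', 'Values': ['running']}])
    ip_map = {}
    for reservation in response['Reservations']:
        for inst in reservation['Instances']:
            ip_map[inst['InstanceId']] = inst.get('PrivateIpAddress')
    return ip_map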