Example 1
def test_new_job():
    """Launching a tf job should create exactly two instances with our tag."""
    tag = "testjob"

    # Precondition: the tag must be unused, otherwise counts below are wrong.
    preexisting = toby_aws.LookupAwsInstances(instance_tag=tag)
    assert not preexisting, "Instances already exist, kill them first"

    job = aws.tf_job(tag, 2)
    created = toby_aws.LookupAwsInstances(instance_tag=tag)
    assert len(created) == 2
Example 2
def main():
  """Provision (or look up) tagged AWS instances and run two test commands.

  Side effects: may create EC2 instances (first-time path), runs `mkdir 43`
  and `ls` on the first instance, streaming output to /tmp/output.
  """
  # Flip to True on the very first run to create the instances; afterwards
  # they are found again by tag.
  FIRST_TIME = False

  if FIRST_TIME:
    with timeit('create_instances'):
      instances = cluster_aws.CreateAwsInstances(num_instances=2,
                                                 image_id=AMI,
                                                 key_name=KEY_NAME,
                                                 ssh_key=KEY_FILE,
                                                 security_group=SECURITY_GROUP,
                                                 instance_tag=TAG,
                                                 placement_group='',
                                                 instance_type=INSTANCE_TYPE)
  else:
    instances = cluster_aws.LookupAwsInstances(instance_tag=TAG,
                                               ssh_key=KEY_FILE)

  with timeit('connect'):
    # NOTE(review): assumes the lookup/creation returned at least one
    # instance — instances[0] raises IndexError otherwise. Confirm upstream.
    instance = instances[0]
    instance.WaitUntilReady()

  # Removed: an unused `line_extractor` stub and a leftover
  # `import pdb; pdb.set_trace()` debugging breakpoint.
  instance.ExecuteCommandAndStreamOutput('mkdir 43',
                                         stdout_file='/tmp/output')
  instance.ExecuteCommandAndStreamOutput('ls', stdout_file='/tmp/output')
Example 3
def main():
    """Launch (or reuse) one tagged AWS instance and mount EFS on it.

    Side effects: wipes and recreates the local output directory, may create
    an EC2 instance, and runs an NFS-mount command remotely, streaming its
    output to per-task files under the local directory.
    """
    import shutil  # local import: only needed for the workspace reset below

    localdir = args.localdir_prefix + '/' + args.name

    # Reset the local output directory. Using shutil/os instead of
    # os.system('rm -Rf ' + ...) avoids passing a CLI-supplied name through
    # a shell (quoting/injection hazard) and needs no POSIX shell at all.
    shutil.rmtree(localdir, ignore_errors=True)
    os.makedirs(localdir, exist_ok=True)

    # TODO: automatically decide whether to launch or connect to existing
    # TODO: implement killing
    if args.launch:
        print("Creating new instances")
        tags = {'iam': os.environ['USER']}
        with timeit('create_instances'):
            instances = cluster_aws.CreateAwsInstances(
                num_instances=1,
                image_id=args.ami,
                key_name=args.key_name,
                ssh_key=args.key_path,
                security_group=args.security_group,
                instance_tag=args.name,
                placement_group='',
                instance_type=args.instance_type,
                tags=tags)
    else:
        print("Reusing existing instances")
        instances = cluster_aws.LookupAwsInstances(instance_tag=args.name,
                                                   ssh_key=args.key_path)
    assert len(instances) == 1, "%d instances found" % (len(instances), )

    with timeit('connect'):
        for i, instance in enumerate(instances):
            print("Connecting to instance %d, %s" % (i, instance.instance_id))
            instance.WaitUntilReady()

    instance = instances[0]

    # TODO: mount at /efs instead of ~/efs
    setup_cmd = """
sudo apt-get install nfs-common -y
EFS_ID=fs-ab2b8102
EFS_REGION=us-west-2
sudo mkdir -p /efs
sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $EFS_ID.efs.$EFS_REGION.amazonaws.com:/ /efs"""

    # Collapse the multi-line script into one '&&' chain so any failing step
    # aborts the rest.
    setup_cmds = setup_cmd.strip().split('\n')
    cmd = ' && '.join(setup_cmds)
    i = 0
    fn_out = localdir + '/out-%02d' % (i, )
    fn_err = localdir + '/err-%02d' % (i, )

    print(cmd)

    def p(line):
        # Echo every remote output line to the console.
        print(line)

    instance.ExecuteCommandAndStreamOutput(cmd, fn_out, fn_err, p)
Example 4
def main():
  """Start a distributed tf_cnn_benchmarks run: workers + parameter servers.

  Side effects: wipes and recreates args.logdir, may create EC2 instances,
  and starts one benchmark process per worker/ps instance in background
  threads, streaming their output into per-task files under args.logdir.
  """
  import shutil  # local import: only needed for the logdir reset below

  num_instances = args.num_workers + args.num_ps

  # Reset the log directory. Using shutil/os instead of
  # os.system('rm -Rf ' + ...) avoids passing a CLI-supplied path through a
  # shell (quoting/injection hazard) and needs no POSIX shell at all.
  shutil.rmtree(args.logdir, ignore_errors=True)
  os.makedirs(args.logdir, exist_ok=True)

  # TODO: add these commands (running manually for now)
  #  sudo nvidia-persistenced
  #  sudo nvidia-smi --auto-boost-default=0
  #  sudo nvidia-smi -ac 2505,875 # p2

  if args.launch:
    print("Creating new instances")
    with timeit('create_instances'):
      instances = cluster_aws.CreateAwsInstances(num_instances=num_instances,
                                                 image_id=args.ami,
                                                 key_name=args.key_name,
                                                 ssh_key=args.key_path,
                                                 security_group=args.security_group,
                                                 instance_tag=args.tag,
                                                 placement_group='',
                                                 instance_type=args.instance_type)
  else:
    # TODO: better control of retrieved instances
    print("Reusing existing instances")
    instances = cluster_aws.LookupAwsInstances(instance_tag=args.tag,
                                               ssh_key=args.key_path)
    assert len(instances) >= num_instances, (
        "found %d instances, need %d" % (len(instances), num_instances))

  # todo: deterministic worker sort
  with timeit('connect'):
    for i, instance in enumerate(instances):
      if i >= num_instances:
        break  # extra looked-up instances beyond what this run needs
      print("Connecting to instance %d, %s" % (i, instance.instance_id))
      instance.WaitUntilReady()

  worker_instances = instances[:args.num_workers]
  ps_instances = instances[args.num_workers:args.num_workers + args.num_ps]

  instance_ip_map = get_instance_ip_map()

  def _host_list(subset):
    # Build the 'ip:port,ip:port,...' string TF expects for a cluster-spec
    # flag. Shared by the worker and ps lists (was duplicated inline).
    fragments = []
    for inst in subset:
      assert inst.instance_id in instance_ip_map
      fragments.append('%s:%d' % (instance_ip_map[inst.instance_id],
                                  args.port))
    return ','.join(fragments)

  worker_hosts = _host_list(worker_instances)
  ps_hosts = _host_list(ps_instances)

  line_extractor = util.ExtractErrorToConsole

  setup_cmd = "source ~/.bashrc && export PATH=~/anaconda3/bin:$PATH && source activate py2 && cd ~/git0/benchmarks/scripts/tf_cnn_benchmarks"

  worker_cmd_tmpl = "python tf_cnn_benchmarks.py --data_format=NCHW --batch_size=64 --num_batches=1000 --model=resnet50 --optimizer=sgd --variable_update=distributed_replicated --cross_replica_sync=True --local_parameter_device=gpu --num_gpus=1 --nodistortions --display_every=10 --worker_hosts=%(worker_hosts)s --ps_hosts=%(ps_hosts)s --job_name=worker --task_index=%(task_index)s"

  for i, instance in enumerate(worker_instances):
    worker_cmd = worker_cmd_tmpl % {'worker_hosts': worker_hosts,
                                    'ps_hosts': ps_hosts,
                                    'task_index': i}
    cmd = setup_cmd + " && " + worker_cmd
    fn_out = args.logdir + '/worker_out-%02d' % (i,)
    fn_err = args.logdir + '/worker_err-%02d' % (i,)
    # Asynchronous: each worker command runs in its own thread, output
    # streamed into per-task files.
    instance.ExecuteCommandInThread(cmd,
                                    stdout_file=fn_out,
                                    stderr_file=fn_err,
                                    line_extractor=line_extractor)
    print("worker %d started" % (i,))

  ps_cmd_tmpl = "CUDA_VISIBLE_DEVICES='' python tf_cnn_benchmarks.py --local_parameter_device=gpu --worker_hosts=%(worker_hosts)s --ps_hosts=%(ps_hosts)s --job_name=ps --task_index=%(task_index)s"
  for i, instance in enumerate(ps_instances):
    ps_cmd = ps_cmd_tmpl % {'worker_hosts': worker_hosts, 'ps_hosts': ps_hosts,
                            'task_index': i}
    cmd = setup_cmd + " && " + ps_cmd
    fn_out = args.logdir + '/ps_out-%02d' % (i,)
    fn_err = args.logdir + '/ps_err-%02d' % (i,)
    instance.ExecuteCommandInThread(cmd,
                                    stdout_file=fn_out,
                                    stderr_file=fn_err,
                                    line_extractor=line_extractor)
    print("parameter server %d started " %(i,))

  # Keep the main process alive so the background command threads keep
  # running (they would die with the process otherwise).
  time.sleep(10000)