Example #1
def main():
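    """Launch or reuse an AWS instance, wait for it to come up, and run the EFS mount setup on it."""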
    localdir = args.localdir_prefix + '/' + args.name
    logdir = args.logdir_prefix + '/' + args.name

    os.system('rm -Rf ' + localdir)
    os.system('mkdir -p ' + localdir)

    # TODO: automatically decide whether to launch or connect to existing
    # TODO: implement killing
    if args.launch:
        print("Creating new instances")
        tags = {'iam': os.environ['USER']}
        with timeit('create_instances'):
            instances = cluster_aws.CreateAwsInstances(
                num_instances=1,
                image_id=args.ami,
                key_name=args.key_name,
                ssh_key=args.key_path,
                security_group=args.security_group,
                instance_tag=args.name,
                placement_group='',
                instance_type=args.instance_type,
                tags=tags)
    else:
        print("Reusing existing instances")
        instances = cluster_aws.LookupAwsInstances(instance_tag=args.name,
                                                   ssh_key=args.key_path)
    assert len(instances) == 1, "%d instances found" % (len(instances), )

    with timeit('connect'):
        for i, instance in enumerate(instances):
            print("Connecting to instance %d, %s" % (i, instance.instance_id))
            instance.WaitUntilReady()

    instance = instances[0]

    # TODO: mount at /efs instead of ~/efs
    setup_cmd = """
sudo apt-get install nfs-common -y
EFS_ID=fs-ab2b8102
EFS_REGION=us-west-2
sudo mkdir -p /efs
sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $EFS_ID.efs.$EFS_REGION.amazonaws.com:/ /efs"""

    setup_cmds = setup_cmd.strip().split('\n')
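    # join the setup steps with '&&' so later steps run only if earlier ones succeed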
    cmd = ' && '.join(setup_cmds)
    i = 0
    fn_out = localdir + '/out-%02d' % (i, )
    fn_err = localdir + '/err-%02d' % (i, )

    print(cmd)

    def p(line):
        print(line)

    instance.ExecuteCommandAndStreamOutput(cmd, fn_out, fn_err, p)
Example #2
def main():
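  """Launch or reuse an AWS instance, wait for it to come up, and run the EFS mount setup on it."""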
  localdir = args.localdir_prefix + '/' + args.name
  logdir = args.logdir_prefix + '/' + args.name

  os.system('rm -Rf ' + localdir)
  os.system('mkdir -p ' + localdir)

  # TODO: automatically decide whether to launch or connect to existing
  # TODO: implement killing
  if args.launch:
    print("Creating new instances")
    tags = {'iam': os.environ['USER']}
    with timeit('create_instances'):
      instances = cluster_aws.CreateAwsInstances(num_instances=1,
                                                 image_id=args.ami,
                                                 key_name=args.key_name,
                                                 ssh_key=args.key_path,
                                                 security_group=args.security_group,
                                                 instance_tag=args.name,
                                                 placement_group='',
                                                 instance_type=args.instance_type,
                                                 tags=tags)
  else:
    print("Reusing existing instances")
    instances = cluster_aws.LookupAwsInstances(instance_tag=args.name,
                                               ssh_key=args.key_path)
  assert len(instances) == 1, "%d instances found" % (len(instances),)

  with timeit('connect'):
    for i, instance in enumerate(instances):
      print("Connecting to instance %d, %s" % (i, instance.instance_id))
      instance.WaitUntilReady()

  instance = instances[0]

  # TODO: mount at /efs instead of ~/efs
  setup_cmd = """
sudo apt-get install nfs-common -y
EFS_ID=fs-ab2b8102
EFS_REGION=us-west-2
sudo mkdir -p /efs
sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $EFS_ID.efs.$EFS_REGION.amazonaws.com:/ /efs"""

  setup_cmds = setup_cmd.strip().split('\n')
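  # join the setup steps with '&&' so later steps run only if earlier ones succeed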
  cmd = ' && '.join(setup_cmds)
  i = 0
  fn_out = localdir + '/out-%02d' % (i,)
  fn_err = localdir + '/err-%02d' % (i,)

  print(cmd)
  def p(line): print(line)
  instance.ExecuteCommandAndStreamOutput(cmd, fn_out, fn_err, p)
Example #3
def create_session():
  #    uninited_list = ['somevariable']
  is_initialized = False
  while not is_initialized:
    try:
      with timeit("session creation"):
        sess = tf.InteractiveSession(server.target, config=session_config())
      with timeit("sessrun"):
        #          uninited_list = sessrun(uninitialized_op)
        is_initialized = sessrun(initialized_op)
    except Exception as e:
      print("Initialization failed with %s, retrying" % (e,))
    if not is_initialized:
      print(("Model not initialized, "
             "retrying in %.1f seconds" % (RETRY_DELAY_SEC,)))
      time.sleep(RETRY_DELAY_SEC)
  return sess
Example #4
def create_session():
    #    uninited_list = ['somevariable']
    is_initialized = False
    while not is_initialized:
        try:
            with timeit("session creation"):
                sess = tf.InteractiveSession(server.target,
                                             config=session_config())
            with timeit("sessrun"):
                #          uninited_list = sessrun(uninitialized_op)
                is_initialized = sessrun(initialized_op)
        except Exception as e:
            print("Initialization failed with %s, retrying" % (e, ))
        if not is_initialized:
            print(("Model not initialized, "
                   "retrying in %.1f seconds" % (RETRY_DELAY_SEC, )))
            time.sleep(RETRY_DELAY_SEC)
    return sess
Example #5
def run_ps():
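  """Parameter-server entry point: start a tf.train.Server and keep the process alive."""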
  config = load_config()
  
  assert config.task_type == 'ps'
  params = make_params()
  
  with timeit('create server'):
    print("Starting server with target %s"%(config.cluster_spec[config.task_type][config.task_id]))
    server = tf.train.Server(config.cluster_spec, config=session_config(),
                             job_name=config.task_type,
                             task_index=config.task_id)

  # doing init run from ps master fails with
  # sess run failed with No worker known as /job:worker/replica:0/task:1
  #      [[Node: Fill_S3 = _Recv[client_terminated=false, recv_device="/job:ps/replica:0/task:0/device:CPU:0", send_device="/job:worker/replica:0/task:1/device:CPU:0", send_device_incarnation=7403937842608207616, tensor_name="edge_3_Fill", tensor_type=DT_INT32, _device="/job:ps/replica:0/task:0/device:CPU:0"]()]], retrying in 5.0 seconds

  # todo: replace with dequeue for graceful shutdown
  # todo: done_queue from sharded_ps_benchmark
  # done_queue = create_done_queue(0)
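  # the server handles requests on its own threads; just keep the process alive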
  time.sleep(365*24*3600)
Example #6
def run_ps():
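    """Parameter-server entry point: start a tf.train.Server and keep the process alive."""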
    config = load_config()

    assert config.task_type == 'ps'
    params = make_params()

    with timeit('create server'):
        print("Starting server with target %s" %
              (config.cluster_spec[config.task_type][config.task_id]))
        server = tf.train.Server(config.cluster_spec,
                                 config=session_config(),
                                 job_name=config.task_type,
                                 task_index=config.task_id)

    # doing init run from ps master fails with
    # sess run failed with No worker known as /job:worker/replica:0/task:1
    #      [[Node: Fill_S3 = _Recv[client_terminated=false, recv_device="/job:ps/replica:0/task:0/device:CPU:0", send_device="/job:worker/replica:0/task:1/device:CPU:0", send_device_incarnation=7403937842608207616, tensor_name="edge_3_Fill", tensor_type=DT_INT32, _device="/job:ps/replica:0/task:0/device:CPU:0"]()]], retrying in 5.0 seconds

    # todo: replace with dequeue for graceful shutdown
    # todo: done_queue from sharded_ps_benchmark
    # done_queue = create_done_queue(0)
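    # the server handles requests on its own threads; just keep the process alive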
    time.sleep(365 * 24 * 3600)
Example #7
def run_worker():
  """Main worker loop."""

  # todo: rename "config" into distributed_config
  config = load_config()
  cluster_spec = config.cluster_spec
  #  import pdb; pdb.set_trace()

  ps_tasks = len(cluster_spec['ps'])
  assert ps_tasks >= 0

  # returns device like /job:worker/task:0
  worker_device = ''
  assert config.task_type == 'worker'
  
  if config.task_id == 1:
    time.sleep(60)  # slow-down second worker
  
  worker_device = get_worker_device(config.task_id)

  ps_device = get_ps_device(0)

  # todo: replace with int64
  # todo: replace with varscope.getvariable like in alextp suggestion
  with timeit("worker graph create"):
    params = make_params()
    with tf.device(worker_device):
      val = tf.ones((), dtype=params.dtype)
      grads = tf.fill([params.shape[0]], val)
      # todo: add two-way communication

    with tf.device(ps_device):
      update = params.assign_add(grads)
      params0 = params[0]

    #uninitialized_op = tf.report_uninitialized_variables()
    initialized_op = tf.is_variable_initialized(params)
  
  # todo: check how estimator does it
  # TODO: retries for errors during server creation?
  # it can fail if assigned port is unavailable
  with timeit("worker server start"):
    server = tf.train.Server(cluster_spec, config=session_config(),
                             job_name=config.task_type,
                             task_index=config.task_id)

    # follow logic in prepare_session
    # https://github.com/tensorflow/tensorflow/blob/22586bdf900640217deac6dc826054bc6e785518/tensorflow/python/training/session_manager.py#L71

  def create_session():
    #    uninited_list = ['somevariable']
    is_initialized = False
    while not is_initialized:
      try:
        with timeit("session creation"):
          sess = tf.InteractiveSession(server.target, config=session_config())
        with timeit("sessrun"):
          #          uninited_list = sessrun(uninitialized_op)
          is_initialized = sessrun(initialized_op)
      except Exception as e:
        print("Initialization failed with %s, retrying" %(e,))
      print(("Model not initialized, "
             "retrying in %.1f seconds" %(RETRY_DELAY_SEC,)))
      time.sleep(RETRY_DELAY_SEC)
    return sess
    
  # are there failures in creating session
  with timeit('create session'):
    sess = tf.InteractiveSession(server.target, config=session_config())
    
  # only run initialization on worker task 0
  if config.task_id == 0:
    sess_run_succeeded = False
    while not sess_run_succeeded:
      try:
        with timeit('initialize vars'):
          sessrun(params.initializer)
          sess_run_succeeded = True
      except Exception as e:
        print("Initialization failed with %s, retrying "
              "in %.1f sec" %(e, RETRY_DELAY_SEC))
        # this can fail if workers too too long to come up and
        # sessrun failed with DeadlineExceeded
        time.sleep(RETRY_DELAY_SEC)
    

  for step in range(FLAGS.iters):
    start_time = time.time()
    for i in range(FLAGS.iters_per_step):
      sess_run_succeeded = False
      while not sess_run_succeeded:
        try:
          sessrun(update)
          sess_run_succeeded = True
        # Exception when ps restarts, need to recreate session
        except Exception as e:  
          print(("sess run failed with %s, "
                 "retrying in %.1f seconds" %(e, RETRY_DELAY_SEC,)))
          time.sleep(RETRY_DELAY_SEC)
          sess = create_session()

    elapsed_time = time.time() - start_time
    rate = float(FLAGS.iters_per_step) * FLAGS.data_mb / elapsed_time
    event = write_event('rate', rate, step)
    print('%.2f MB/s' % (rate,))
Example #8
def run_worker():
    """Main worker loop."""

    # todo: rename "config" into distributed_config
    config = load_config()
    cluster_spec = config.cluster_spec
    #  import pdb; pdb.set_trace()

    ps_tasks = len(cluster_spec['ps'])
    assert ps_tasks >= 0

    # returns device like /job:worker/task:0
    worker_device = ''
    assert config.task_type == 'worker'

    if config.task_id == 1:
        time.sleep(60)  # slow-down second worker

    worker_device = get_worker_device(config.task_id)

    ps_device = get_ps_device(0)

    # todo: replace with int64
    # todo: replace with varscope.getvariable like in alextp suggestion
    with timeit("worker graph create"):
        params = make_params()
        with tf.device(worker_device):
            val = tf.ones((), dtype=params.dtype)
            grads = tf.fill([params.shape[0]], val)
            # todo: add two-way communication

        with tf.device(ps_device):
            update = params.assign_add(grads)
            params0 = params[0]

        #uninitialized_op = tf.report_uninitialized_variables()
        initialized_op = tf.is_variable_initialized(params)

    # todo: check how estimator does it
    # TODO: retries for errors during server creation?
    # it can fail if assigned port is unavailable
    with timeit("worker server start"):
        server = tf.train.Server(cluster_spec,
                                 config=session_config(),
                                 job_name=config.task_type,
                                 task_index=config.task_id)

        # follow logic in prepare_session
        # https://github.com/tensorflow/tensorflow/blob/22586bdf900640217deac6dc826054bc6e785518/tensorflow/python/training/session_manager.py#L71

    def create_session():
        #    uninited_list = ['somevariable']
        is_initialized = False
        while not is_initialized:
            try:
                with timeit("session creation"):
                    sess = tf.InteractiveSession(server.target,
                                                 config=session_config())
                with timeit("sessrun"):
                    #          uninited_list = sessrun(uninitialized_op)
                    is_initialized = sessrun(initialized_op)
            except Exception as e:
                print("Initialization failed with %s, retrying" % (e, ))
            print(("Model not initialized, "
                   "retrying in %.1f seconds" % (RETRY_DELAY_SEC, )))
            time.sleep(RETRY_DELAY_SEC)
        return sess

    # are there failures in creating session
    with timeit('create session'):
        sess = tf.InteractiveSession(server.target, config=session_config())

    # only run initialization on worker task 0
    if config.task_id == 0:
        sess_run_succeeded = False
        while not sess_run_succeeded:
            try:
                with timeit('initialize vars'):
                    sessrun(params.initializer)
                    sess_run_succeeded = True
            except Exception as e:
                print("Initialization failed with %s, retrying "
                      "in %.1f sec" % (e, RETRY_DELAY_SEC))
                # this can fail if workers take too long to come up and
                # sessrun fails with DeadlineExceeded
                time.sleep(RETRY_DELAY_SEC)

    for step in range(FLAGS.iters):
        start_time = time.time()
        for i in range(FLAGS.iters_per_step):
            sess_run_succeeded = False
            while not sess_run_succeeded:
                try:
                    sessrun(update)
                    sess_run_succeeded = True
                # Exception when ps restarts, need to recreate session
                except Exception as e:
                    print(("sess run failed with %s, "
                           "retrying in %.1f seconds" % (
                               e,
                               RETRY_DELAY_SEC,
                           )))
                    time.sleep(RETRY_DELAY_SEC)
                    sess = create_session()

        elapsed_time = time.time() - start_time
        rate = float(FLAGS.iters_per_step) * FLAGS.data_mb / elapsed_time
        event = write_event('rate', rate, step)
        print('%.2f MB/s' % (rate, ))