def main():
    localdir = args.localdir_prefix + '/' + args.name
    logdir = args.logdir_prefix + '/' + args.name
    os.system('rm -Rf ' + localdir)
    os.system('mkdir -p ' + localdir)

    # TODO: automatically decide whether to launch or connect to existing
    # TODO: implement killing
    if args.launch:
        print("Creating new instances")
        tags = {'iam': os.environ['USER']}
        with timeit('create_instances'):
            instances = cluster_aws.CreateAwsInstances(
                num_instances=1,
                image_id=args.ami,
                key_name=args.key_name,
                ssh_key=args.key_path,
                security_group=args.security_group,
                instance_tag=args.name,
                placement_group='',
                instance_type=args.instance_type,
                tags=tags)
    else:
        print("Reusing existing instances")
        instances = cluster_aws.LookupAwsInstances(instance_tag=args.name,
                                                   ssh_key=args.key_path)

    assert len(instances) == 1, "%d instances found" % (len(instances),)
    with timeit('connect'):
        for i, instance in enumerate(instances):
            print("Connecting to instance %d, %s" % (i, instance.instance_id))
            instance.WaitUntilReady()

    instance = instances[0]

    # TODO: mount at /efs instead of ~/efs
    setup_cmd = """
sudo apt-get install nfs-common -y
EFS_ID=fs-ab2b8102
EFS_REGION=us-west-2
sudo mkdir -p /efs
sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $EFS_ID.efs.$EFS_REGION.amazonaws.com:/ /efs"""
    setup_cmds = setup_cmd.strip().split('\n')
    cmd = ' && '.join(setup_cmds)

    i = 0
    fn_out = localdir + '/out-%02d' % (i,)
    fn_err = localdir + '/err-%02d' % (i,)
    print(cmd)

    def p(line):
        print(line)

    instance.ExecuteCommandAndStreamOutput(cmd, fn_out, fn_err, p)
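# main() and the functions below rely on a `timeit` context manager that is
# defined elsewhere. The following is only a minimal hypothetical sketch,
# assuming it does nothing more than print wall-clock timings for the block;
# the real helper may differ.
import time
from contextlib import contextmanager

@contextmanager
def timeit(tag=''):
    # Print how long the wrapped block took (sketch, not the actual helper).
    start = time.time()
    yield
    print('%s: %.2f seconds' % (tag, time.time() - start))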
def run_ps():
    config = load_config()
    assert config.task_type == 'ps'
    params = make_params()

    with timeit('create server'):
        print("Starting server with target %s" % (
            config.cluster_spec[config.task_type][config.task_id]))
        server = tf.train.Server(config.cluster_spec,
                                 config=session_config(),
                                 job_name=config.task_type,
                                 task_index=config.task_id)

    # Doing the init run from the ps master fails with
    #   sess run failed with No worker known as /job:worker/replica:0/task:1
    #   [[Node: Fill_S3 = _Recv[client_terminated=false, recv_device="/job:ps/replica:0/task:0/device:CPU:0", send_device="/job:worker/replica:0/task:1/device:CPU:0", send_device_incarnation=7403937842608207616, tensor_name="edge_3_Fill", tensor_type=DT_INT32, _device="/job:ps/replica:0/task:0/device:CPU:0"]()]], retrying in 5.0 seconds

    # todo: replace with dequeue for graceful shutdown
    # todo: done_queue from sharded_ps_benchmark
    # done_queue = create_done_queue(0)
    time.sleep(365 * 24 * 3600)
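# The TODOs above mention replacing the year-long sleep with a blocking
# dequeue for graceful shutdown. A sketch of the usual TF1 pattern follows;
# `create_done_queue` is hypothetical here (modeled on sharded_ps_benchmark,
# not taken from this codebase), assuming one shared queue per ps shard.
def create_done_queue(i):
    """Queue used to signal shutdown of ps shard i (hypothetical sketch)."""
    with tf.device("/job:ps/task:%d" % (i,)):
        return tf.FIFOQueue(1, tf.int32, shared_name="done_queue%d" % (i,))

# On the ps, the loop would then block until a worker enqueues a token:
#   sess = tf.Session(server.target, config=session_config())
#   sess.run(create_done_queue(config.task_id).dequeue())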
def run_worker(): """Main worker loop.""" # todo: rename "config" into distributed_config config = load_config() cluster_spec = config.cluster_spec # import pdb; pdb.set_trace() ps_tasks = len(cluster_spec['ps']) assert ps_tasks >= 0 # returns device like /job:worker/task:0 worker_device = '' assert config.task_type == 'worker' if config.task_id == 1: time.sleep(60) # slow-down second worker worker_device = get_worker_device(config.task_id) ps_device = get_ps_device(0) # todo: replace with int64 # todo: replace with varscope.getvariable like in alextp suggestion with timeit("worker graph create"): params = make_params() with tf.device(worker_device): val = tf.ones((), dtype=params.dtype) grads = tf.fill([params.shape[0]], val) # todo: add two-way communication with tf.device(ps_device): update = params.assign_add(grads) params0 = params[0] #uninitialized_op = tf.report_uninitialized_variables() initialized_op = tf.is_variable_initialized(params) # todo: check how estimator does it # TODO: retries for errors during server creation? # it can fail if assigned port is unavailable with timeit("worker server start"): server = tf.train.Server(cluster_spec, config=session_config(), job_name=config.task_type, task_index=config.task_id) # follow logic in prepare_session # https://github.com/tensorflow/tensorflow/blob/22586bdf900640217deac6dc826054bc6e785518/tensorflow/python/training/session_manager.py#L71 def create_session(): # uninited_list = ['somevariable'] is_initialized = False while not is_initialized: try: with timeit("session creation"): sess = tf.InteractiveSession(server.target, config=session_config()) with timeit("sessrun"): # uninited_list = sessrun(uninitialized_op) is_initialized = sessrun(initialized_op) except Exception as e: print("Initialization failed with %s, retrying" %(e,)) print(("Model not initialized, " "retrying in %.1f seconds" %(RETRY_DELAY_SEC,))) time.sleep(RETRY_DELAY_SEC) return sess # are there failures in creating session with timeit('create session'): sess = tf.InteractiveSession(server.target, config=session_config()) # only run initialization on worker task 0 if config.task_id == 0: sess_run_succeeded = False while not sess_run_succeeded: try: with timeit('intialize vars'): sessrun(params.initializer) sess_run_succeeded = True except Exception as e: print("Initialization failed with %s, retrying " "in %.1f sec" %(e, RETRY_DELAY_SEC)) # this can fail if workers too too long to come up and # sessrun failed with DeadlineExceeded time.sleep(RETRY_DELAY_SEC) for step in range(FLAGS.iters): start_time = time.time() for i in range(FLAGS.iters_per_step): sess_run_succeeded = False while not sess_run_succeeded: try: sessrun(update) sess_run_succeeded = True # Exception when ps restarts, need to recreate session except Exception as e: print(("sess run failed with %s, " "retrying in %.1f seconds" %(e, RETRY_DELAY_SEC,))) time.sleep(RETRY_DELAY_SEC) sess = create_session() elapsed_time = time.time() - start_time rate = float(FLAGS.iters_per_step)*FLAGS.data_mb/elapsed_time event = write_event('rate', rate, step) print('%.2f MB/s'%(rate,))
def run_worker(): """Main worker loop.""" # todo: rename "config" into distributed_config config = load_config() cluster_spec = config.cluster_spec # import pdb; pdb.set_trace() ps_tasks = len(cluster_spec['ps']) assert ps_tasks >= 0 # returns device like /job:worker/task:0 worker_device = '' assert config.task_type == 'worker' if config.task_id == 1: time.sleep(60) # slow-down second worker worker_device = get_worker_device(config.task_id) ps_device = get_ps_device(0) # todo: replace with int64 # todo: replace with varscope.getvariable like in alextp suggestion with timeit("worker graph create"): params = make_params() with tf.device(worker_device): val = tf.ones((), dtype=params.dtype) grads = tf.fill([params.shape[0]], val) # todo: add two-way communication with tf.device(ps_device): update = params.assign_add(grads) params0 = params[0] #uninitialized_op = tf.report_uninitialized_variables() initialized_op = tf.is_variable_initialized(params) # todo: check how estimator does it # TODO: retries for errors during server creation? # it can fail if assigned port is unavailable with timeit("worker server start"): server = tf.train.Server(cluster_spec, config=session_config(), job_name=config.task_type, task_index=config.task_id) # follow logic in prepare_session # https://github.com/tensorflow/tensorflow/blob/22586bdf900640217deac6dc826054bc6e785518/tensorflow/python/training/session_manager.py#L71 def create_session(): # uninited_list = ['somevariable'] is_initialized = False while not is_initialized: try: with timeit("session creation"): sess = tf.InteractiveSession(server.target, config=session_config()) with timeit("sessrun"): # uninited_list = sessrun(uninitialized_op) is_initialized = sessrun(initialized_op) except Exception as e: print("Initialization failed with %s, retrying" % (e, )) print(("Model not initialized, " "retrying in %.1f seconds" % (RETRY_DELAY_SEC, ))) time.sleep(RETRY_DELAY_SEC) return sess # are there failures in creating session with timeit('create session'): sess = tf.InteractiveSession(server.target, config=session_config()) # only run initialization on worker task 0 if config.task_id == 0: sess_run_succeeded = False while not sess_run_succeeded: try: with timeit('intialize vars'): sessrun(params.initializer) sess_run_succeeded = True except Exception as e: print("Initialization failed with %s, retrying " "in %.1f sec" % (e, RETRY_DELAY_SEC)) # this can fail if workers too too long to come up and # sessrun failed with DeadlineExceeded time.sleep(RETRY_DELAY_SEC) for step in range(FLAGS.iters): start_time = time.time() for i in range(FLAGS.iters_per_step): sess_run_succeeded = False while not sess_run_succeeded: try: sessrun(update) sess_run_succeeded = True # Exception when ps restarts, need to recreate session except Exception as e: print(("sess run failed with %s, " "retrying in %.1f seconds" % ( e, RETRY_DELAY_SEC, ))) time.sleep(RETRY_DELAY_SEC) sess = create_session() elapsed_time = time.time() - start_time rate = float(FLAGS.iters_per_step) * FLAGS.data_mb / elapsed_time event = write_event('rate', rate, step) print('%.2f MB/s' % (rate, ))