def test(action_set, level_names):
  """Evaluate the agent on each level and log the mean episode return.

  One test actor is built per level (all sharing a single `Agent`), then a
  `SingularMonitoredSession` is opened with `checkpoint_dir=FLAGS.logdir` and
  each level is run until at least `FLAGS.test_num_episodes` episode returns
  have been collected. When `FLAGS.level_name` is 'dmlab30' the human
  normalized scores (uncapped and capped at 100) are logged as well.

  Args:
    action_set: Collection of actions; its length sizes the `Agent` and it is
      forwarded unchanged to `build_actor`.
    level_names: Iterable of level names to evaluate, in order.
  """
  returns_by_level = {name: [] for name in level_names}

  with tf.Graph().as_default():
    agent = Agent(len(action_set))

    # One actor per level; every test environment is created with seed=1.
    actor_outputs = {}
    for name in level_names:
      test_env = create_environment(name, seed=1, is_test=True)
      actor_outputs[name] = build_actor(agent, test_env, name, action_set)

    with tf.train.SingularMonitoredSession(
        checkpoint_dir=FLAGS.logdir,
        hooks=[py_process.PyProcessHook()]) as session:
      for name in level_names:
        tf.logging.info('Testing level: %s', name)
        env_outputs = actor_outputs[name].env_outputs
        fetches = (env_outputs.done, env_outputs.info)
        collected = returns_by_level[name]

        while True:
          done_v, infos_v = session.run(fetches)
          # The first entry along the leading axis is skipped.
          # NOTE(review): presumably it overlaps with the previous unroll --
          # confirm against build_actor.
          finished = done_v[1:]
          collected.extend(infos_v.episode_return[1:][finished])

          if len(collected) < FLAGS.test_num_episodes:
            continue
          tf.logging.info('Mean episode return: %f', np.mean(collected))
          break

  if FLAGS.level_name != 'dmlab30':
    return

  no_cap = dmlab30.compute_human_normalized_score(returns_by_level,
                                                  per_level_cap=None)
  cap_100 = dmlab30.compute_human_normalized_score(returns_by_level,
                                                   per_level_cap=100)
  tf.logging.info('No cap.: %f Cap 100: %f', no_cap, cap_100)
def test(action_set, level_names):
  """Run test episodes on every level and report the returns.

  Builds a test actor for each entry of `level_names` around one shared
  `Agent`, opens a `SingularMonitoredSession` on `FLAGS.logdir`, and keeps
  stepping each level until `FLAGS.test_num_episodes` returns (or more) have
  been gathered, logging their mean. For `FLAGS.level_name == 'dmlab30'` the
  human normalized score is logged both without a cap and with a cap of 100.

  Args:
    action_set: Collection of actions; `len(action_set)` is passed to `Agent`
      and the collection itself to `build_actor`.
    level_names: Iterable of level names to test.
  """
  level_returns = {}
  for level in level_names:
    level_returns[level] = []

  with tf.Graph().as_default():
    agent = Agent(len(action_set))
    # Environment first, then its actor, one level at a time.
    outputs = {
        level: build_actor(
            agent,
            create_environment(level, seed=1, is_test=True),
            level,
            action_set)
        for level in level_names
    }

    with tf.train.SingularMonitoredSession(
        checkpoint_dir=FLAGS.logdir,
        hooks=[py_process.PyProcessHook()]) as session:
      for level in level_names:
        tf.logging.info('Testing level: %s', level)
        returns = level_returns[level]
        done_op = outputs[level].env_outputs.done
        info_op = outputs[level].env_outputs.info

        have_enough = False
        while not have_enough:
          done_v, infos_v = session.run((done_op, info_op))
          # Index 0 along the leading axis is dropped from both arrays before
          # masking the returns with the `done` flags.
          returns.extend(infos_v.episode_return[1:][done_v[1:]])
          have_enough = len(returns) >= FLAGS.test_num_episodes

        tf.logging.info('Mean episode return: %f', np.mean(returns))

  if FLAGS.level_name == 'dmlab30':
    no_cap, cap_100 = [
        dmlab30.compute_human_normalized_score(level_returns,
                                               per_level_cap=cap)
        for cap in (None, 100)
    ]
    tf.logging.info('No cap.: %f Cap 100: %f', no_cap, cap_100)
def train(action_set, level_names):
  """Build the actor/learner graph and run training.

  On a single machine (`is_single_machine()`), this process hosts the learner
  and every actor on a local in-process server. Otherwise it joins a cluster
  made of `FLAGS.num_actors` actor tasks (localhost ports 8001 and up) and one
  learner task (localhost:8000), taking the role given by `FLAGS.job_name` /
  `FLAGS.task`.

  The learner loop runs until the environment-frame counter returned by
  `build_learner` reaches `FLAGS.total_environment_frames`, writing per-episode
  summaries to `FLAGS.logdir`. For `FLAGS.level_name == 'dmlab30'` it also
  emits human normalized scores, both as summaries and appended to
  `normalized_scores.txt`. A non-learner process never returns: it enqueues
  actor output forever.

  Args:
    action_set: Collection of actions; its length sizes the `Agent` and it is
      forwarded to `build_actor`.
    level_names: Non-empty sequence of level names. Actor `i` plays
      `level_names[i % len(level_names)]`; `level_names[0]` is also used to
      discover the actor output structure.
  """
  if is_single_machine():
    local_job_device = ''
    shared_job_device = ''
    is_actor_fn = lambda i: True
    is_learner = True
    global_variable_device = '/gpu'
    server = tf.train.Server.create_local_server()
    filters = []
  else:
    local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task)
    shared_job_device = '/job:learner/task:0'
    is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task
    is_learner = FLAGS.job_name == 'learner'

    # Placing the variable on CPU makes it cheaper to send it to all the
    # actors. Continually copying the variables from the GPU is slow.
    global_variable_device = shared_job_device + '/cpu'
    cluster = tf.train.ClusterSpec({
        'actor': ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_actors)],
        'learner': ['localhost:8000']
    })
    server = tf.train.Server(cluster, job_name=FLAGS.job_name,
                             task_index=FLAGS.task)
    filters = [shared_job_device, local_job_device]

  # Only used to find the actor output structure (dtypes and static shapes of
  # the flattened actor output); this throwaway graph is not trained.
  with tf.Graph().as_default():
    agent = Agent(len(action_set))
    env = create_environment(level_names[0], seed=1)
    structure = build_actor(agent, env, level_names[0], action_set)
    flattened_structure = nest.flatten(structure)
    dtypes = [t.dtype for t in flattened_structure]
    shapes = [t.shape.as_list() for t in flattened_structure]

  with tf.Graph().as_default(), \
       tf.device(local_job_device + '/cpu'), \
       pin_global_variables(global_variable_device):
    tf.set_random_seed(FLAGS.seed)  # Makes initialization deterministic.

    # Create Queue and Agent on the learner.
    with tf.device(shared_job_device):
      queue = tf.FIFOQueue(1, dtypes, shapes, shared_name='buffer')
      agent = Agent(len(action_set))

      if is_single_machine() and 'dynamic_batching' in sys.modules:
        # For single machine training, we use dynamic batching for improved GPU
        # utilization. The semantics of single machine training are slightly
        # different from the distributed setting because within a single unroll
        # of an environment, the actions may be computed using different weights
        # if an update happens within the unroll.
        old_build = agent._build

        @dynamic_batching.batch_fn
        def build(*args):
          with tf.device('/gpu'):
            return old_build(*args)

        tf.logging.info('Using dynamic batching.')
        agent._build = build

    # Build actors and ops to enqueue their output.
    enqueue_ops = []
    for i in range(FLAGS.num_actors):
      if is_actor_fn(i):
        level_name = level_names[i % len(level_names)]
        tf.logging.info('Creating actor %d with level %s', i, level_name)
        env = create_environment(level_name, seed=i + 1)
        actor_output = build_actor(agent, env, level_name, action_set)
        with tf.device(shared_job_device):
          enqueue_ops.append(queue.enqueue(nest.flatten(actor_output)))

    # If running in a single machine setup, run actors with QueueRunners
    # (separate threads).
    if is_learner and enqueue_ops:
      tf.train.add_queue_runner(tf.train.QueueRunner(queue, enqueue_ops))

    # Build learner.
    if is_learner:
      # Create global step, which is the number of environment frames processed.
      tf.get_variable(
          'num_environment_frames',
          initializer=tf.zeros_initializer(),
          shape=[],
          dtype=tf.int64,
          trainable=False,
          collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

      # Create batch (time major) and recreate structure.
      dequeued = queue.dequeue_many(FLAGS.batch_size)
      dequeued = nest.pack_sequence_as(structure, dequeued)

      def make_time_major(s):
        # Swap the first two axes of every tensor in `s`, leaving any
        # remaining axes in place.
        return nest.map_structure(
            lambda t: tf.transpose(t, [1, 0] + list(range(t.shape.ndims))[2:]),
            s)

      dequeued = dequeued._replace(
          env_outputs=make_time_major(dequeued.env_outputs),
          agent_outputs=make_time_major(dequeued.agent_outputs))

      with tf.device('/gpu'):
        # Using StagingArea allows us to prepare the next batch and send it to
        # the GPU while we're performing a training step. This adds up to 1 step
        # policy lag.
        flattened_output = nest.flatten(dequeued)
        area = tf.contrib.staging.StagingArea(
            [t.dtype for t in flattened_output],
            [t.shape for t in flattened_output])
        stage_op = area.put(flattened_output)

        data_from_actors = nest.pack_sequence_as(structure, area.get())

        # Unroll agent on sequence, create losses and update ops.
        output = build_learner(agent, data_from_actors.agent_state,
                               data_from_actors.env_outputs,
                               data_from_actors.agent_outputs)

    # Create MonitoredSession (to run the graph, checkpoint and log).
    tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner)
    config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters)
    with tf.train.MonitoredTrainingSession(
        server.target,
        is_chief=is_learner,
        checkpoint_dir=FLAGS.logdir,
        save_checkpoint_secs=600,
        save_summaries_secs=30,
        log_step_count_steps=50000,
        config=config,
        hooks=[py_process.PyProcessHook()]) as session:

      if is_learner:
        # Logging.
        level_returns = {level_name: [] for level_name in level_names}
        summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir)

        # Prepare data for first run: fill the staging area once so the first
        # training step has a batch to `get()`.
        session.run_step_fn(
            lambda step_context: step_context.session.run(stage_op))

        # Execute learning and track performance.
        num_env_frames_v = 0
        while num_env_frames_v < FLAGS.total_environment_frames:
          # `output` is unpacked as (done, infos, num_env_frames); running
          # `stage_op` in the same call stages the next batch.
          level_names_v, done_v, infos_v, num_env_frames_v, _ = session.run(
              (data_from_actors.level_name,) + output + (stage_op,))
          # Repeat the level names along a new leading axis so they can be
          # indexed with the same boolean mask as the per-step outputs.
          level_names_v = np.repeat([level_names_v], done_v.shape[0], 0)

          for level_name, episode_return, episode_step in zip(
              level_names_v[done_v],
              infos_v.episode_return[done_v],
              infos_v.episode_step[done_v]):
            episode_frames = episode_step * FLAGS.num_action_repeats

            tf.logging.info('Level: %s Episode return: %f',
                            level_name, episode_return)

            summary = tf.summary.Summary()
            summary.value.add(tag=level_name + '/episode_return',
                              simple_value=episode_return)
            summary.value.add(tag=level_name + '/episode_frames',
                              simple_value=episode_frames)
            summary_writer.add_summary(summary, num_env_frames_v)

            if FLAGS.level_name == 'dmlab30':
              level_returns[level_name].append(episode_return)
              # Diagnostic dump of the accumulated returns; routed through
              # tf.logging (debug level) like the rest of this function
              # instead of writing straight to stdout.
              tf.logging.debug('level_returns: %s', level_returns)

          # Only score once every level has at least one finished episode.
          if (FLAGS.level_name == 'dmlab30' and
              min(map(len, level_returns.values())) >= 1):
            no_cap = dmlab30.compute_human_normalized_score(level_returns,
                                                            per_level_cap=None)
            cap_100 = dmlab30.compute_human_normalized_score(level_returns,
                                                             per_level_cap=100)

            # NOTE(review): the path is relative to the current working
            # directory, not FLAGS.logdir -- confirm this is intended.
            with open("normalized_scores.txt", "a+") as f:
              f.write("num env frames: %d\n" % num_env_frames_v)
              f.write("no cap: %f\n" % no_cap)
              f.write("cap 100: %f\n" % cap_100)

            summary = tf.summary.Summary()
            summary.value.add(
                tag='dmlab30/training_no_cap', simple_value=no_cap)
            summary.value.add(
                tag='dmlab30/training_cap_100', simple_value=cap_100)
            summary_writer.add_summary(summary, num_env_frames_v)

            # Clear level scores.
            level_returns = {level_name: [] for level_name in level_names}

      else:
        # Execute actors (they just need to enqueue their output).
        while True:
          session.run(enqueue_ops)
def train(action_set, level_names):
  """Build the actor/learner graph and run training.

  On a single machine (`is_single_machine()`), this process hosts the learner
  and every actor on a local in-process server. Otherwise it joins a cluster
  made of `FLAGS.num_actors` actor tasks (localhost ports 8001 and up) and one
  learner task (localhost:8000), taking the role given by `FLAGS.job_name` /
  `FLAGS.task`.

  The learner loop runs until the environment-frame counter returned by
  `build_learner` reaches `FLAGS.total_environment_frames`, writing per-episode
  summaries to `FLAGS.logdir` and, for `FLAGS.level_name == 'dmlab30'`, human
  normalized score summaries. A non-learner process never returns: it enqueues
  actor output forever.

  Args:
    action_set: Collection of actions; its length sizes the `Agent` and it is
      forwarded to `build_actor`.
    level_names: Non-empty sequence of level names. Actor `i` plays
      `level_names[i % len(level_names)]`; `level_names[0]` is also used to
      discover the actor output structure.
  """
  if is_single_machine():
    local_job_device = ''
    shared_job_device = ''
    is_actor_fn = lambda i: True
    is_learner = True
    global_variable_device = '/gpu'
    server = tf.train.Server.create_local_server()
    filters = []
  else:
    local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task)
    shared_job_device = '/job:learner/task:0'
    is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task
    is_learner = FLAGS.job_name == 'learner'

    # Placing the variable on CPU makes it cheaper to send it to all the
    # actors. Continually copying the variables from the GPU is slow.
    global_variable_device = shared_job_device + '/cpu'
    cluster = tf.train.ClusterSpec({
        'actor': ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_actors)],
        'learner': ['localhost:8000']
    })
    server = tf.train.Server(cluster, job_name=FLAGS.job_name,
                             task_index=FLAGS.task)
    filters = [shared_job_device, local_job_device]

  # Only used to find the actor output structure (dtypes and static shapes of
  # the flattened actor output); this throwaway graph is not trained.
  with tf.Graph().as_default():
    agent = Agent(len(action_set))
    env = create_environment(level_names[0], seed=1)
    structure = build_actor(agent, env, level_names[0], action_set)
    flattened_structure = nest.flatten(structure)
    dtypes = [t.dtype for t in flattened_structure]
    shapes = [t.shape.as_list() for t in flattened_structure]

  with tf.Graph().as_default(), \
       tf.device(local_job_device + '/cpu'), \
       pin_global_variables(global_variable_device):
    tf.set_random_seed(FLAGS.seed)  # Makes initialization deterministic.

    # Create Queue and Agent on the learner.
    with tf.device(shared_job_device):
      queue = tf.FIFOQueue(1, dtypes, shapes, shared_name='buffer')
      agent = Agent(len(action_set))

      if is_single_machine() and 'dynamic_batching' in sys.modules:
        # For single machine training, we use dynamic batching for improved GPU
        # utilization. The semantics of single machine training are slightly
        # different from the distributed setting because within a single unroll
        # of an environment, the actions may be computed using different weights
        # if an update happens within the unroll.
        old_build = agent._build

        @dynamic_batching.batch_fn
        def build(*args):
          with tf.device('/gpu'):
            return old_build(*args)

        tf.logging.info('Using dynamic batching.')
        agent._build = build

    # Build actors and ops to enqueue their output.
    enqueue_ops = []
    for i in range(FLAGS.num_actors):
      if is_actor_fn(i):
        level_name = level_names[i % len(level_names)]
        tf.logging.info('Creating actor %d with level %s', i, level_name)
        env = create_environment(level_name, seed=i + 1)
        actor_output = build_actor(agent, env, level_name, action_set)
        with tf.device(shared_job_device):
          enqueue_ops.append(queue.enqueue(nest.flatten(actor_output)))

    # If running in a single machine setup, run actors with QueueRunners
    # (separate threads).
    if is_learner and enqueue_ops:
      tf.train.add_queue_runner(tf.train.QueueRunner(queue, enqueue_ops))

    # Build learner.
    if is_learner:
      # Create global step, which is the number of environment frames processed.
      tf.get_variable(
          'num_environment_frames',
          initializer=tf.zeros_initializer(),
          shape=[],
          dtype=tf.int64,
          trainable=False,
          collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

      # Create batch (time major) and recreate structure.
      dequeued = queue.dequeue_many(FLAGS.batch_size)
      dequeued = nest.pack_sequence_as(structure, dequeued)

      def make_time_major(s):
        # Swap the first two axes of every tensor in `s`, leaving any
        # remaining axes in place.
        return nest.map_structure(
            lambda t: tf.transpose(t, [1, 0] + list(range(t.shape.ndims))[2:]),
            s)

      dequeued = dequeued._replace(
          env_outputs=make_time_major(dequeued.env_outputs),
          agent_outputs=make_time_major(dequeued.agent_outputs))

      with tf.device('/gpu'):
        # Using StagingArea allows us to prepare the next batch and send it to
        # the GPU while we're performing a training step. This adds up to 1 step
        # policy lag.
        flattened_output = nest.flatten(dequeued)
        area = tf.contrib.staging.StagingArea(
            [t.dtype for t in flattened_output],
            [t.shape for t in flattened_output])
        stage_op = area.put(flattened_output)

        data_from_actors = nest.pack_sequence_as(structure, area.get())

        # Unroll agent on sequence, create losses and update ops.
        output = build_learner(agent, data_from_actors.agent_state,
                               data_from_actors.env_outputs,
                               data_from_actors.agent_outputs)

    # Create MonitoredSession (to run the graph, checkpoint and log).
    tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner)
    config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters)
    with tf.train.MonitoredTrainingSession(
        server.target,
        is_chief=is_learner,
        checkpoint_dir=FLAGS.logdir,
        save_checkpoint_secs=600,
        save_summaries_secs=30,
        log_step_count_steps=50000,
        config=config,
        hooks=[py_process.PyProcessHook()]) as session:

      if is_learner:
        # Logging.
        level_returns = {level_name: [] for level_name in level_names}
        summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir)

        # Prepare data for first run: fill the staging area once so the first
        # training step has a batch to `get()`.
        session.run_step_fn(
            lambda step_context: step_context.session.run(stage_op))

        # Execute learning and track performance.
        num_env_frames_v = 0
        while num_env_frames_v < FLAGS.total_environment_frames:
          # `output` is unpacked as (done, infos, num_env_frames); running
          # `stage_op` in the same call stages the next batch.
          level_names_v, done_v, infos_v, num_env_frames_v, _ = session.run(
              (data_from_actors.level_name,) + output + (stage_op,))
          # Repeat the level names along a new leading axis so they can be
          # indexed with the same boolean mask as the per-step outputs.
          level_names_v = np.repeat([level_names_v], done_v.shape[0], 0)

          for level_name, episode_return, episode_step in zip(
              level_names_v[done_v],
              infos_v.episode_return[done_v],
              infos_v.episode_step[done_v]):
            episode_frames = episode_step * FLAGS.num_action_repeats

            tf.logging.info('Level: %s Episode return: %f',
                            level_name, episode_return)

            summary = tf.summary.Summary()
            summary.value.add(tag=level_name + '/episode_return',
                              simple_value=episode_return)
            summary.value.add(tag=level_name + '/episode_frames',
                              simple_value=episode_frames)
            summary_writer.add_summary(summary, num_env_frames_v)

            if FLAGS.level_name == 'dmlab30':
              level_returns[level_name].append(episode_return)

          # Only score once every level has at least one finished episode.
          if (FLAGS.level_name == 'dmlab30' and
              min(map(len, level_returns.values())) >= 1):
            no_cap = dmlab30.compute_human_normalized_score(level_returns,
                                                            per_level_cap=None)
            cap_100 = dmlab30.compute_human_normalized_score(level_returns,
                                                             per_level_cap=100)
            summary = tf.summary.Summary()
            summary.value.add(
                tag='dmlab30/training_no_cap', simple_value=no_cap)
            summary.value.add(
                tag='dmlab30/training_cap_100', simple_value=cap_100)
            summary_writer.add_summary(summary, num_env_frames_v)

            # Clear level scores.
            level_returns = {level_name: [] for level_name in level_names}

      else:
        # Execute actors (they just need to enqueue their output).
        while True:
          session.run(enqueue_ops)