def __init__(self, batch_env):
  """Batch of environments inside the TensorFlow graph.

  Args:
    batch_env: Batch environment.
  """
  self._batch_env = batch_env
  observ_shape = self._parse_shape(self._batch_env.observation_space)
  observ_dtype = self._parse_dtype(self._batch_env.observation_space)
  action_shape = self._parse_shape(self._batch_env.action_space)
  action_dtype = self._parse_dtype(self._batch_env.action_space)
  with tf.variable_scope('env_temporary'):
    self._observ = tf.Variable(
        tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
        name='observ', trainable=False)
    self._action = tf.Variable(
        tf.zeros((len(self._batch_env),) + action_shape, action_dtype),
        name='action', trainable=False)
    self._reward = tf.Variable(
        tf.zeros((len(self._batch_env),), tf.float32),
        name='reward', trainable=False)
    self._done = tf.Variable(
        tf.cast(tf.ones((len(self._batch_env),)), tf.bool),
        name='done', trainable=False)
def test_trainable_and_non_trainable(self):
  tf.Variable(tf.zeros((5, 3)), trainable=True)
  tf.Variable(tf.zeros((8, 2)), trainable=False)
  tf.Variable(tf.zeros((1, 1)), trainable=True)
  tf.Variable(tf.zeros((5,)), trainable=True)
  tf.Variable(tf.zeros((3, 1)), trainable=False)
  self.assertEqual(15 + 1 + 5, count_weights())
def test_non_default_graph(self):
  graph = tf.Graph()
  with graph.as_default():
    tf.Variable(tf.zeros((5, 3)), trainable=True)
    tf.Variable(tf.zeros((8, 2)), trainable=False)
  self.assertNotEqual(graph, tf.get_default_graph())
  self.assertEqual(15, count_weights(graph=graph))
def test_restrict_invalid_scope(self):
  tf.Variable(tf.zeros((3, 2)), trainable=True)
  with tf.variable_scope('foo'):
    tf.Variable(tf.zeros((5, 2)), trainable=True)
    with tf.variable_scope('bar'):
      tf.Variable(tf.zeros((1, 2)), trainable=True)
  self.assertEqual(0, count_weights('bar'))
def __init__(self, template, center=True, scale=True, clip=10,
             name='normalize'):
  """Normalize tensors based on streaming estimates of mean and variance.

  Centering the value, scaling it by the standard deviation, and clipping
  outlier values are optional.

  Args:
    template: Example tensor providing shape and dtype of the value to track.
    center: Python boolean indicating whether to subtract mean from values.
    scale: Python boolean indicating whether to scale values by stddev.
    clip: Absolute value at which to clip normalized values.
    name: Parent scope of operations provided by this class.
  """
  self._center = center
  self._scale = scale
  self._clip = clip
  self._name = name
  with tf.name_scope(name):
    self._count = tf.Variable(0, False)
    self._mean = tf.Variable(tf.zeros_like(template), False)
    self._var_sum = tf.Variable(tf.zeros_like(template), False)
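# A minimal sketch of the update rule such a streaming normalizer is
# typically built around (Welford's online algorithm); the helper name and
# plain-Python form below are illustrative, not part of the class above.
def _streaming_update_sketch(count, mean, var_sum, value):
  # Fold one new value into the running estimates of mean and the sum of
  # squared differences.
  count += 1
  delta = value - mean
  mean += delta / count
  var_sum += delta * (value - mean)
  # The variance estimate would be var_sum / count.
  return count, mean, var_sum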
def __init__(self, env):
  """Put an OpenAI Gym environment into the TensorFlow graph.

  Args:
    env: OpenAI Gym environment.
  """
  self._env = env
  observ_shape = self._parse_shape(self._env.observation_space)
  observ_dtype = self._parse_dtype(self._env.observation_space)
  action_shape = self._parse_shape(self._env.action_space)
  action_dtype = self._parse_dtype(self._env.action_space)
  with tf.name_scope('environment'):
    self._observ = tf.Variable(
        tf.zeros(observ_shape, observ_dtype), name='observ', trainable=False)
    self._action = tf.Variable(
        tf.zeros(action_shape, action_dtype), name='action', trainable=False)
    self._reward = tf.Variable(
        0.0, dtype=tf.float32, name='reward', trainable=False)
    self._done = tf.Variable(
        True, dtype=tf.bool, name='done', trainable=False)
    self._step = tf.Variable(
        0, dtype=tf.int32, name='step', trainable=False)
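# The `_parse_shape` and `_parse_dtype` helpers used above are not shown in
# this excerpt. A plausible sketch, assuming `import gym` at module level and
# support for Box and Discrete spaces only (hypothetical helper names):
def _parse_shape_sketch(space):
  # Discrete spaces are scalar; Box spaces carry an explicit shape.
  if isinstance(space, gym.spaces.Discrete):
    return ()
  if isinstance(space, gym.spaces.Box):
    return space.shape
  raise NotImplementedError('Unsupported space.')


def _parse_dtype_sketch(space):
  # Discrete actions index into a finite set; Box values are continuous.
  if isinstance(space, gym.spaces.Discrete):
    return tf.int32
  if isinstance(space, gym.spaces.Box):
    return tf.float32
  raise NotImplementedError('Unsupported space.')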
def test_exclude_by_regex(self):
  tf.Variable(tf.zeros((3, 2)), trainable=True)
  with tf.variable_scope('foo'):
    tf.Variable(tf.zeros((5, 2)), trainable=True)
    with tf.variable_scope('bar'):
      tf.Variable(tf.zeros((1, 2)), trainable=True)
  self.assertEqual(0, count_weights(exclude=r'.*'))
  self.assertEqual(6, count_weights(exclude=r'(^|/)foo/.*'))
  self.assertEqual(16, count_weights(exclude=r'.*/bar/.*'))
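# Taken together, the count_weights tests in this file pin down its
# contract: sum the element counts of trainable variables, optionally
# restricted to a scope prefix, a given graph, or filtered by an exclude
# regex. A minimal sketch consistent with these tests (not necessarily the
# library's actual implementation):
def count_weights_sketch(scope=None, exclude=None, graph=None):
  import re
  graph = graph or tf.get_default_graph()
  variables = graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
  if scope:
    # Restrict to variables created under the given scope prefix.
    variables = [var for var in variables if var.name.startswith(scope + '/')]
  if exclude:
    variables = [var for var in variables if not re.match(exclude, var.name)]
  total = 0
  for var in variables:
    count = 1
    for dim in var.shape.as_list():
      count *= dim
    total += count
  return total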
def __init__(self, shape, dtype):
  """Specify the shape and dtype of the mean to be estimated.

  Note that the float mean of zero submitted elements is NaN, while
  computing the integer mean of zero elements raises a division by zero
  error.

  Args:
    shape: Shape of the mean to compute.
    dtype: Data type of the mean to compute.
  """
  self._dtype = dtype
  self._sum = tf.Variable(lambda: tf.zeros(shape, dtype), False)
  self._count = tf.Variable(lambda: 0, trainable=False)
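# Hypothetical usage, based on how simulate() below calls this class:
# submit() accumulates values into the running sum and count, count exposes
# how many values have been accumulated, and clear() returns the mean while
# resetting the estimate.
mean = StreamingMean((), tf.float32)
update = mean.submit(tf.constant([1.0, 2.0, 3.0]))
with tf.control_dependencies([update]):
  result = mean.clear()  # Would evaluate to 2.0 and reset the estimate.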
def _restore_policy(self, network, policy_layers, value_layers, action_size,
                    checkpoint):
  """Restore the PPO policy from a TensorFlow checkpoint.

  Args:
    network: The neural network definition.
    policy_layers: A tuple specifying the number of layers and the number
      of neurons of each layer for the policy network.
    value_layers: A tuple specifying the number of layers and the number
      of neurons of each layer for the value network.
    action_size: The dimension of the action space.
    checkpoint: The checkpoint path.
  """
  observ = self._observ_filter.transform(self.observation_placeholder)
  with tf.variable_scope("network/rnn"):
    self.network = network(
        policy_layers=policy_layers, value_layers=value_layers,
        action_size=action_size)
  with tf.variable_scope("temporary"):
    self.last_state = tf.Variable(
        self.network.zero_state(1, tf.float32), False)
    self.sess.run(self.last_state.initializer)
  with tf.variable_scope("network"):
    (mean_action, _, _), new_state = tf.nn.dynamic_rnn(
        self.network, observ[:, None], tf.ones(1), self.last_state,
        tf.float32, swap_memory=True)
  self.mean_action = mean_action
  self.update_state = self.last_state.assign(new_state)
  saver = utility.define_saver(exclude=(r"temporary/.*",))
  saver.restore(self.sess, checkpoint)
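# `utility.define_saver` is not shown in this excerpt. A plausible sketch,
# assuming it builds a tf.train.Saver over the global variables whose op
# names match none of the exclude regexes (so the "temporary/.*" state above
# is not restored from the checkpoint):
def define_saver_sketch(exclude=None):
  import re
  exclude = exclude or ()
  variables = [
      var for var in tf.global_variables()
      if not any(re.match(regex, var.op.name) for regex in exclude)]
  return tf.train.Saver(variables)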
def create_nested_vars(tensors):
  """Create variables matching a nested tuple of tensors.

  Args:
    tensors: Nested tuple or list of tensors.

  Returns:
    Nested tuple or list of variables.
  """
  if isinstance(tensors, (tuple, list)):
    return type(tensors)(create_nested_vars(tensor) for tensor in tensors)
  return tf.Variable(tensors, False)
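# Hypothetical usage: each tensor in a nested structure becomes a
# non-trainable variable with the same initial value, which is useful for
# persisting recurrent state between session calls. Note that namedtuples
# would not survive the type(tensors)(...) reconstruction above, since their
# constructors take one positional argument per field; plain tuples and
# lists do.
state = (tf.zeros((4, 32)), [tf.zeros((4, 16)), tf.zeros((4, 16))])
state_vars = create_nested_vars(state)
# state_vars is a (Variable, [Variable, Variable]) structure mirroring the
# input.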
def __init__(self, template, capacity, max_length, scope):
  """Create a memory that stores episodes.

  Each transition tuple consists of quantities specified by the template.
  These quantities would typically be observations, actions, rewards, and
  done indicators.

  Args:
    template: List of tensors to derive shapes and dtypes of each transition.
    capacity: Number of episodes, or rows, held by the memory.
    max_length: Allocated sequence length for the episodes.
    scope: Variable scope to use for internal variables.
  """
  self._capacity = capacity
  self._max_length = max_length
  with tf.variable_scope(scope) as var_scope:
    self._scope = var_scope
    self._length = tf.Variable(tf.zeros(capacity, tf.int32), False)
    self._buffers = [
        tf.Variable(
            tf.zeros([capacity, max_length] + elem.shape.as_list(),
                     elem.dtype),
            False)
        for elem in template]
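# Hypothetical instantiation: with a 4-dimensional observation and a scalar
# reward in the template, the memory would allocate one buffer of shape
# [capacity, max_length] + element shape per quantity:
template = [tf.zeros((4,), tf.float32), tf.zeros((), tf.float32)]
episode_memory = EpisodeMemory(template, 10, 100, 'memory')
# Buffers: float32[10, 100, 4] and float32[10, 100]; `_length` tracks how
# many valid steps each of the 10 episode rows currently holds.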
def test_not_done(self):
  step = tf.Variable(0, False, dtype=tf.int32, name='step')
  done = tf.equal((step + 1) % 2, 0)
  score = tf.cast(step, tf.float32)
  loop = tools.Loop(None, step)
  loop.add_phase('phase_1', done, score, summary='', steps=1, report_every=3)
  # Score:   0 1 2 3 4 5 6 7 8
  # Done:      x   x   x   x
  # Report:      x     x     x
  with self.test_session() as sess:
    sess.run(tf.global_variables_initializer())
    scores = list(loop.run(sess, saver=None, max_step=9))
  self.assertAllEqual([1, 4, 7], scores)
def test_report_every_step(self):
  step = tf.Variable(0, False, dtype=tf.int32, name='step')
  loop = tools.Loop(None, step)
  loop.add_phase(
      'phase_1', done=True, score=0, summary='', steps=1, report_every=3)
  # Step:   0 1 2 3 4 5 6 7 8
  # Report:     x     x     x
  with self.test_session() as sess:
    sess.run(tf.global_variables_initializer())
    scores = loop.run(sess, saver=None, max_step=9)
    next(scores)
    self.assertEqual(3, sess.run(step))
    next(scores)
    self.assertEqual(6, sess.run(step))
    next(scores)
    self.assertEqual(9, sess.run(step))
def __init__(self, logdir, step=None, log=None, report=None, reset=None):
  """Execute operations in a loop and coordinate logging and checkpoints.

  The step, log, report, and reset arguments will get created if not
  provided. Reset is used to indicate switching to a new phase, so that the
  model can start a new computation in case its computation is split over
  multiple training steps.

  Args:
    logdir: Will contain checkpoints and summaries for each phase.
    step: Variable of the global step (optional).
    log: Tensor indicating to the model to compute summary tensors.
    report: Tensor indicating to the loop to report the current mean score.
    reset: Tensor indicating to the model to start a new computation.
  """
  self._logdir = logdir
  self._step = (
      tf.Variable(0, False, name='global_step') if step is None else step)
  self._log = tf.placeholder(tf.bool) if log is None else log
  self._report = tf.placeholder(tf.bool) if report is None else report
  self._reset = tf.placeholder(tf.bool) if reset is None else reset
  self._phases = []
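# A minimal usage sketch, assuming the add_phase() and run() signatures
# exercised by the tests below; the constant done and score tensors stand in
# for a real model:
step = tf.Variable(0, False, name='global_step')
loop = Loop('/tmp/logdir', step)
loop.add_phase(
    'train', done=tf.constant(True), score=tf.constant(0.0), summary='',
    steps=1, report_every=100)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for score in loop.run(sess, saver=None, max_step=1000):
    print('Mean score:', score)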
def test_not_done_batch(self):
  step = tf.Variable(0, False, dtype=tf.int32, name='step')
  done = tf.equal([step % 3, step % 4], 0)
  score = tf.cast([step, step**2], tf.float32)
  loop = tools.Loop(None, step)
  loop.add_phase('phase_1', done, score, summary='', steps=1, report_every=8)
  # Step:     0  2  4  6
  # Score 1:  0  2  4  6
  # Done 1:   x        x
  # Score 2:  0  4 16 36
  # Done 2:   x     x
  with self.test_session() as sess:
    sess.run(tf.global_variables_initializer())
    scores = list(loop.run(sess, saver=None, max_step=8))
  self.assertEqual(8, sess.run(step))
  self.assertAllEqual([(0 + 0 + 16 + 6) / 4], scores)
def define_simulation_graph(batch_env, algo_cls, config):
  """Define the algorithm and environment interaction.

  Args:
    batch_env: In-graph environments object.
    algo_cls: Constructor of a batch algorithm.
    config: Configuration object for the algorithm.

  Returns:
    Object providing graph elements via attributes.
  """
  # pylint: disable=unused-variable
  step = tf.Variable(0, False, dtype=tf.int32, name='global_step')
  is_training = tf.placeholder(tf.bool, name='is_training')
  should_log = tf.placeholder(tf.bool, name='should_log')
  do_report = tf.placeholder(tf.bool, name='do_report')
  force_reset = tf.placeholder(tf.bool, name='force_reset')
  algo = algo_cls(batch_env, step, is_training, should_log, config)
  done, score, summary = tools.simulate(
      batch_env, algo, should_log, force_reset)
  message = 'Graph contains {} trainable variables.'
  tf.logging.info(message.format(tools.count_weights()))
  # pylint: enable=unused-variable
  return tools.AttrDict(locals())
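# Hypothetical usage: since the returned AttrDict wraps locals(), a training
# loop can reach the placeholders and simulation outputs by name (the
# algorithm constructor, `batch_env`, `config`, and `sess` below come from
# elsewhere and are assumptions of this sketch):
graph = define_simulation_graph(batch_env, ppo.PPOAlgorithm, config)
feed_dict = {graph.is_training: True, graph.should_log: False,
             graph.do_report: False, graph.force_reset: False}
done, score = sess.run([graph.done, graph.score], feed_dict)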
def simulate(batch_env, algo, log=True, reset=False):
  """Simulation step of a vectorized algorithm with in-graph environments.

  Integrates the operations implemented by the algorithm and the
  environments into a combined operation.

  Args:
    batch_env: In-graph batch environment.
    algo: Algorithm instance implementing required operations.
    log: Tensor indicating whether to compute and return summaries.
    reset: Tensor causing all environments to reset.

  Returns:
    Tuple of tensors containing done flags for the current episodes,
    possibly intermediate scores for the episodes, and a summary tensor.
  """

  def _define_begin_episode(agent_indices):
    """Reset environments, intermediate scores and durations for new episodes.

    Args:
      agent_indices: Tensor containing batch indices starting an episode.

    Returns:
      Summary tensor.
    """
    assert agent_indices.shape.ndims == 1
    zero_scores = tf.zeros_like(agent_indices, tf.float32)
    zero_durations = tf.zeros_like(agent_indices)
    reset_ops = [
        batch_env.reset(agent_indices),
        tf.scatter_update(score, agent_indices, zero_scores),
        tf.scatter_update(length, agent_indices, zero_durations)]
    with tf.control_dependencies(reset_ops):
      return algo.begin_episode(agent_indices)

  def _define_step():
    """Request actions from the algorithm and apply them to the environments.

    Increments the lengths of all episodes and increases their scores by the
    current reward. After stepping the environments, provides the full
    transition tuple to the algorithm.

    Returns:
      Summary tensor.
    """
    prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
    action, step_summary = algo.perform(prevob)
    action.set_shape(batch_env.action.shape)
    with tf.control_dependencies([batch_env.simulate(action)]):
      add_score = score.assign_add(batch_env.reward)
      inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
    with tf.control_dependencies([add_score, inc_length]):
      experience_summary = algo.experience(
          prevob, batch_env.action, batch_env.reward, batch_env.done,
          batch_env.observ)
    return tf.summary.merge([step_summary, experience_summary])

  def _define_end_episode(agent_indices):
    """Notify the algorithm of ending episodes.

    Also updates the mean score and length counters used for summaries.

    Args:
      agent_indices: Tensor holding batch indices that end their episodes.

    Returns:
      Summary tensor.
    """
    assert agent_indices.shape.ndims == 1
    submit_score = mean_score.submit(tf.gather(score, agent_indices))
    submit_length = mean_length.submit(
        tf.cast(tf.gather(length, agent_indices), tf.float32))
    with tf.control_dependencies([submit_score, submit_length]):
      return algo.end_episode(agent_indices)

  def _define_summaries():
    """Reset the average score and duration, and return them as summary.

    Returns:
      Summary string.
    """
    score_summary = tf.cond(
        tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
        lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
    length_summary = tf.cond(
        tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
        lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
    return tf.summary.merge([score_summary, length_summary])

  with tf.name_scope('simulate'):
    log = tf.convert_to_tensor(log)
    reset = tf.convert_to_tensor(reset)
    with tf.variable_scope('simulate_temporary'):
      score = tf.Variable(
          tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
      length = tf.Variable(
          tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
    mean_score = streaming_mean.StreamingMean((), tf.float32)
    mean_length = streaming_mean.StreamingMean((), tf.float32)
    agent_indices = tf.cond(
        reset,
        lambda: tf.range(len(batch_env)),
        lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
    begin_episode = tf.cond(
        tf.cast(tf.shape(agent_indices)[0], tf.bool),
        lambda: _define_begin_episode(agent_indices), str)
    with tf.control_dependencies([begin_episode]):
      step = _define_step()
    with tf.control_dependencies([step]):
      agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
      end_episode = tf.cond(
          tf.cast(tf.shape(agent_indices)[0], tf.bool),
          lambda: _define_end_episode(agent_indices), str)
    with tf.control_dependencies([end_episode]):
      summary = tf.summary.merge(
          [_define_summaries(), begin_episode, step, end_episode])
    with tf.control_dependencies([summary]):
      done, score = tf.identity(batch_env.done), tf.identity(score)
    return done, score, summary
def __init__(self, batch_env, step, is_training, should_log, config):
  """Create an instance of the PPO algorithm.

  Args:
    batch_env: In-graph batch environment.
    step: Integer tensor holding the current training step.
    is_training: Boolean tensor for whether the algorithm should train.
    should_log: Boolean tensor for whether summaries should be returned.
    config: Object containing the agent's configuration as attributes.
  """
  self._batch_env = batch_env
  self._step = step
  self._is_training = is_training
  self._should_log = should_log
  self._config = config
  self._observ_filter = normalize.StreamingNormalize(
      self._batch_env.observ[0], center=True, scale=True, clip=5,
      name='normalize_observ')
  self._reward_filter = normalize.StreamingNormalize(
      self._batch_env.reward[0], center=False, scale=True, clip=10,
      name='normalize_reward')
  # Memory stores tuple of observ, action, mean, logstd, reward.
  template = (
      self._batch_env.observ[0], self._batch_env.action[0],
      self._batch_env.action[0], self._batch_env.action[0],
      self._batch_env.reward[0])
  self._memory = memory.EpisodeMemory(
      template, config.update_every, config.max_length, 'memory')
  self._memory_index = tf.Variable(0, False)
  use_gpu = self._config.use_gpu and utility.available_gpus()
  with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
    # Create network variables for later calls to reuse.
    self._network(
        tf.zeros_like(self._batch_env.observ)[:, None],
        tf.ones(len(self._batch_env)), reuse=None)
    cell = self._config.network(self._batch_env.action.shape[1].value)
    with tf.variable_scope('ppo_temporary'):
      self._episodes = memory.EpisodeMemory(
          template, len(batch_env), config.max_length, 'episodes')
      self._last_state = utility.create_nested_vars(
          cell.zero_state(len(batch_env), tf.float32))
      self._last_action = tf.Variable(
          tf.zeros_like(self._batch_env.action), False, name='last_action')
      self._last_mean = tf.Variable(
          tf.zeros_like(self._batch_env.action), False, name='last_mean')
      self._last_logstd = tf.Variable(
          tf.zeros_like(self._batch_env.action), False, name='last_logstd')
    self._penalty = tf.Variable(
        self._config.kl_init_penalty, False, dtype=tf.float32)
    self._policy_optimizer = self._config.policy_optimizer(
        self._config.policy_lr, name='policy_optimizer')
    self._value_optimizer = self._config.value_optimizer(
        self._config.value_lr, name='value_optimizer')
def __init__(self, batch_env, step, is_training, should_log, config):
  """Create an instance of the PPO algorithm.

  Args:
    batch_env: In-graph batch environment.
    step: Integer tensor holding the current training step.
    is_training: Boolean tensor for whether the algorithm should train.
    should_log: Boolean tensor for whether summaries should be returned.
    config: Object containing the agent configuration as attributes.
  """
  self._batch_env = batch_env
  self._step = step
  self._is_training = is_training
  self._should_log = should_log
  self._config = config
  self._observ_filter = normalize.StreamingNormalize(
      self._batch_env.observ[0], center=True, scale=True, clip=5,
      name='normalize_observ')
  self._reward_filter = normalize.StreamingNormalize(
      self._batch_env.reward[0], center=False, scale=True, clip=10,
      name='normalize_reward')
  # Memory stores tuple of observ, action, mean, logstd, reward.
  template = (
      self._batch_env.observ[0], self._batch_env.action[0],
      self._batch_env.action[0], self._batch_env.action[0],
      self._batch_env.reward[0])
  self._memory = memory.EpisodeMemory(
      template, config.update_every, config.max_length, 'memory')
  self._memory_index = tf.Variable(0, False)
  use_gpu = self._config.use_gpu and utility.available_gpus()
  with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
    # Create network variables for later calls to reuse.
    action_size = self._batch_env.action.shape[1].value
    self._network = tf.make_template(
        'network', functools.partial(config.network, config, action_size))
    output = self._network(
        tf.zeros_like(self._batch_env.observ)[:, None],
        tf.ones(len(self._batch_env)))
    with tf.variable_scope('ppo_temporary'):
      self._episodes = memory.EpisodeMemory(
          template, len(batch_env), config.max_length, 'episodes')
      if output.state is None:
        self._last_state = None
      else:
        # Ensure the batch dimension is set.
        tf.contrib.framework.nest.map_structure(
            lambda x: x.set_shape([len(batch_env)] + x.shape.as_list()[1:]),
            output.state)
        # pylint: disable=undefined-variable
        self._last_state = tf.contrib.framework.nest.map_structure(
            lambda x: tf.Variable(lambda: tf.zeros_like(x), False),
            output.state)
      self._last_action = tf.Variable(
          tf.zeros_like(self._batch_env.action), False, name='last_action')
      self._last_mean = tf.Variable(
          tf.zeros_like(self._batch_env.action), False, name='last_mean')
      self._last_logstd = tf.Variable(
          tf.zeros_like(self._batch_env.action), False, name='last_logstd')
    self._penalty = tf.Variable(
        self._config.kl_init_penalty, False, dtype=tf.float32)
    self._optimizer = self._config.optimizer(self._config.learning_rate)
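# The tf.make_template call above is what makes later invocations reuse the
# network weights: the first call creates the variables, and subsequent
# calls share them. A standalone illustration with a hypothetical dense
# layer:
def my_network(observ):
  return tf.layers.dense(observ, 8, name='hidden')

network = tf.make_template('network', my_network)
first = network(tf.zeros((2, 4)))   # Creates the network/hidden variables.
second = network(tf.ones((2, 4)))   # Reuses the same variables.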
def test_count_trainable(self):
  tf.Variable(tf.zeros((5, 3)), trainable=True)
  tf.Variable(tf.zeros((1, 1)), trainable=True)
  tf.Variable(tf.zeros((5,)), trainable=True)
  self.assertEqual(15 + 1 + 5, count_weights())
def test_ignore_non_trainable(self):
  tf.Variable(tf.zeros((5, 3)), trainable=False)
  tf.Variable(tf.zeros((1, 1)), trainable=False)
  tf.Variable(tf.zeros((5,)), trainable=False)
  self.assertEqual(0, count_weights())
def test_include_scopes(self):
  tf.Variable(tf.zeros((3, 2)), trainable=True)
  with tf.variable_scope('foo'):
    tf.Variable(tf.zeros((5, 2)), trainable=True)
  self.assertEqual(6 + 10, count_weights())