def _define_step():
  """Request actions from the algorithm and apply them to the environments.

  Increments the lengths of all episodes and increases their scores by the
  current reward. After stepping the environments, provides the full
  transition tuple to the algorithm.

  Returns:
    Summary tensor.
  """
  prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
  agent_indices = tf.range(len(batch_env))
  action, step_summary = algo.perform(agent_indices, prevob)
  action.set_shape(batch_env.action.shape)
  with tf.control_dependencies([batch_env.simulate(action)]):
    add_score = score.assign_add(batch_env.reward)
    inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
  with tf.control_dependencies([add_score, inc_length]):
    agent_indices = tf.range(len(batch_env))
    experience_summary = algo.experience(
        agent_indices, prevob, batch_env.action, batch_env.reward,
        batch_env.done, batch_env.observ)
  return tf.summary.merge([step_summary, experience_summary])
def _restore_policy(self, network, policy_layers, value_layers, action_size,
                    checkpoint):
  """Restore the PPO policy from a TensorFlow checkpoint.

  Args:
    network: The neural network definition.
    policy_layers: A tuple specifying the number of layers and the number of
      neurons in each layer of the policy network.
    value_layers: A tuple specifying the number of layers and the number of
      neurons in each layer of the value network.
    action_size: The dimension of the action space.
    checkpoint: The checkpoint path.
  """
  observ = self._observ_filter.transform(self.observation_placeholder)
  with tf.variable_scope("network/rnn"):
    self.network = network(
        policy_layers=policy_layers,
        value_layers=value_layers,
        action_size=action_size)
  with tf.variable_scope("temporary"):
    self.last_state = tf.Variable(
        self.network.zero_state(1, tf.float32), False)
    self.sess.run(self.last_state.initializer)
  with tf.variable_scope("network"):
    (mean_action, _, _), new_state = tf.nn.dynamic_rnn(
        self.network,
        observ[:, None],
        tf.ones(1),
        self.last_state,
        tf.float32,
        swap_memory=True)
  self.mean_action = mean_action
  self.update_state = self.last_state.assign(new_state)
  saver = utility.define_saver(exclude=(r"temporary/.*",))
  saver.restore(self.sess, checkpoint)
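# A hedged sketch of what utility.define_saver(exclude=...) presumably does in
# the restore above: build a tf.train.Saver over every global variable whose
# name does not match one of the exclude patterns, so the "temporary" state is
# left out of the restore. The helper body is an assumption; only the name and
# call signature come from the code above.
import re

import tensorflow as tf


def define_saver_sketch(exclude=()):
  variables = [
      var for var in tf.global_variables()
      if not any(re.match(pattern, var.name) for pattern in exclude)
  ]
  return tf.train.Saver(variables)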
def perform(self, observ):
  """Compute batch of actions and a summary for a batch of observations.

  Args:
    observ: Tensor of a batch of observations for all agents.

  Returns:
    Tuple of action batch tensor and summary tensor.
  """
  with tf.name_scope('perform/'):
    observ = self._observ_filter.transform(observ)
    network = self._network(
        observ[:, None], tf.ones(observ.shape[0]), self._last_state)
    action = tf.cond(self._is_training, network.policy.sample,
                     lambda: network.mean)
    logprob = network.policy.log_prob(action)[:, 0]
    # pylint: disable=g-long-lambda
    summary = tf.cond(self._should_log, lambda: tf.summary.merge([
        tf.summary.histogram('mean', network.mean[:, 0]),
        tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
        tf.summary.histogram('action', action[:, 0]),
        tf.summary.histogram('logprob', logprob)
    ]), str)  # str() returns '', used as an empty summary when not logging.
    # Remember current policy to append to memory in the experience callback.
    with tf.control_dependencies([
        utility.assign_nested_vars(self._last_state, network.state),
        self._last_action.assign(action[:, 0]),
        self._last_mean.assign(network.mean[:, 0]),
        self._last_logstd.assign(network.logstd[:, 0])
    ]):
      return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
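# A hedged sketch of what utility.assign_nested_vars presumably does in the
# single-agent call above: assign every tensor of a (possibly nested) state
# structure to the matching variable and group the resulting ops. The body is
# illustrative; only the name and call pattern are taken from the code above.
import tensorflow as tf


def assign_nested_vars_sketch(variables, tensors):
  flat_variables = tf.contrib.framework.nest.flatten(variables)
  flat_tensors = tf.contrib.framework.nest.flatten(tensors)
  return tf.group(*[
      variable.assign(tensor)
      for variable, tensor in zip(flat_variables, flat_tensors)
  ])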
def __init__(self, batch_env):
  """Batch of environments inside the TensorFlow graph.

  Args:
    batch_env: Batch environment.
  """
  self._batch_env = batch_env
  observ_shape = self._parse_shape(self._batch_env.observation_space)
  observ_dtype = self._parse_dtype(self._batch_env.observation_space)
  action_shape = self._parse_shape(self._batch_env.action_space)
  action_dtype = self._parse_dtype(self._batch_env.action_space)
  with tf.variable_scope('env_temporary'):
    self._observ = tf.Variable(
        tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
        name='observ', trainable=False)
    self._action = tf.Variable(
        tf.zeros((len(self._batch_env),) + action_shape, action_dtype),
        name='action', trainable=False)
    self._reward = tf.Variable(
        tf.zeros((len(self._batch_env),), tf.float32),
        name='reward', trainable=False)
    self._done = tf.Variable(
        tf.cast(tf.ones((len(self._batch_env),)), tf.bool),
        name='done', trainable=False)
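# A hedged sketch of what self._parse_shape and self._parse_dtype presumably
# do for the common Gym spaces used above; the real helpers may cover more
# space types. The function names here are illustrative.
import gym
import tensorflow as tf


def _parse_shape_sketch(space):
  if isinstance(space, gym.spaces.Discrete):
    return ()
  if isinstance(space, gym.spaces.Box):
    return space.shape
  raise NotImplementedError('Unsupported space.')


def _parse_dtype_sketch(space):
  if isinstance(space, gym.spaces.Discrete):
    return tf.int32
  if isinstance(space, gym.spaces.Box):
    return tf.float32
  raise NotImplementedError('Unsupported space.')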
def _build_nets(self, json_data):
  assert self.ACTOR_NET_KEY in json_data
  assert self.CRITIC_NET_KEY in json_data

  actor_net_name = json_data[self.ACTOR_NET_KEY]
  critic_net_name = json_data[self.CRITIC_NET_KEY]
  actor_init_output_scale = 1 if (
      self.ACTOR_INIT_OUTPUT_SCALE_KEY
      not in json_data) else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]

  s_size = self.get_state_size()
  g_size = self.get_goal_size()
  a_size = self.get_action_size()

  # Setup input tensors.
  self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s")
  self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a")
  self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val")
  self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv")
  self.g_tf = tf.placeholder(
      tf.float32,
      shape=([None, g_size] if self.has_goal() else None),
      name="g")
  self.old_logp_tf = tf.placeholder(tf.float32, shape=[None], name="old_logp")
  self.exp_mask_tf = tf.placeholder(tf.float32, shape=[None], name="exp_mask")

  with tf.variable_scope('main'):
    with tf.variable_scope('actor'):
      self.a_mean_tf = self._build_net_actor(actor_net_name,
                                             actor_init_output_scale)
    with tf.variable_scope('critic'):
      self.critic_tf = self._build_net_critic(critic_net_name)

  if self.a_mean_tf is not None:
    Logger.print2('Built actor net: ' + actor_net_name)
  if self.critic_tf is not None:
    Logger.print2('Built critic net: ' + critic_net_name)

  self.norm_a_std_tf = self.exp_params_curr.noise * tf.ones(a_size)
  norm_a_noise_tf = self.norm_a_std_tf * tf.random_normal(
      shape=tf.shape(self.a_mean_tf))
  norm_a_noise_tf *= tf.expand_dims(self.exp_mask_tf, axis=-1)
  self.sample_a_tf = self.a_mean_tf + norm_a_noise_tf * self.a_norm.std_tf
  self.sample_a_logp_tf = TFUtil.calc_logp_gaussian(
      x_tf=norm_a_noise_tf, mean_tf=None, std_tf=self.norm_a_std_tf)
  return
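# A hedged sketch of the diagonal-Gaussian log-density that
# TFUtil.calc_logp_gaussian presumably evaluates for sample_a_logp_tf above.
# Only the name and arguments come from the call above; the body here is
# illustrative. Passing mean_tf=None treats the input as already centered,
# which matches passing the sampled noise directly.
import numpy as np
import tensorflow as tf


def calc_logp_gaussian_sketch(x_tf, mean_tf, std_tf):
  delta_tf = x_tf if mean_tf is None else x_tf - mean_tf
  dim = tf.cast(tf.shape(x_tf)[-1], tf.float32)
  logp_tf = -0.5 * tf.reduce_sum(tf.square(delta_tf / std_tf), axis=-1)
  logp_tf += -0.5 * dim * np.log(2.0 * np.pi) - tf.reduce_sum(tf.log(std_tf))
  return logp_tf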
def perform(self, agent_indices, observ):
  """Compute batch of actions and a summary for a batch of observations.

  Args:
    agent_indices: Tensor containing current batch indices.
    observ: Tensor of a batch of observations for all agents.

  Returns:
    Tuple of action batch tensor and summary tensor.
  """
  with tf.name_scope('perform/'):
    observ = self._observ_filter.transform(observ)
    if self._last_state is None:
      state = None
    else:
      state = tf.contrib.framework.nest.map_structure(
          lambda x: tf.gather(x, agent_indices), self._last_state)
    output = self._network(observ[:, None], tf.ones(observ.shape[0]), state)
    action = tf.cond(self._is_training, output.policy.sample,
                     lambda: output.mean)
    logprob = output.policy.log_prob(action)[:, 0]
    # pylint: disable=g-long-lambda
    summary = tf.cond(self._should_log, lambda: tf.summary.merge([
        tf.summary.histogram('mean', output.mean[:, 0]),
        tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
        tf.summary.histogram('action', action[:, 0]),
        tf.summary.histogram('logprob', logprob)
    ]), str)  # str() returns '', used as an empty summary when not logging.
    # Remember current policy to append to memory in the experience callback.
    if self._last_state is None:
      assign_state = tf.no_op()
    else:
      assign_state = utility.assign_nested_vars(
          self._last_state, output.state, agent_indices)
    with tf.control_dependencies([
        assign_state,
        tf.scatter_update(self._last_action, agent_indices, action[:, 0]),
        tf.scatter_update(self._last_mean, agent_indices, output.mean[:, 0]),
        tf.scatter_update(
            self._last_logstd, agent_indices, output.logstd[:, 0])
    ]):
      return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
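# A small self-contained illustration (TF1 graph mode assumed) of the
# tf.scatter_update pattern used above: only the rows selected by
# agent_indices are overwritten, leaving the other agents' slots untouched.
# The variable and tensor names here are illustrative.
import tensorflow as tf

last_action = tf.Variable(tf.zeros([4, 2]), trainable=False)
agent_indices = tf.constant([0, 2])
new_actions = tf.constant([[1.0, 1.0], [2.0, 2.0]])
update_op = tf.scatter_update(last_action, agent_indices, new_actions)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(update_op))  # Rows 0 and 2 updated, rows 1 and 3 still zero.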
def __init__(self, batch_env, step, is_training, should_log, config):
  """Create an instance of the PPO algorithm.

  Args:
    batch_env: In-graph batch environment.
    step: Integer tensor holding the current training step.
    is_training: Boolean tensor for whether the algorithm should train.
    should_log: Boolean tensor for whether summaries should be returned.
    config: Object containing the agent configuration as attributes.
  """
  self._batch_env = batch_env
  self._step = step
  self._is_training = is_training
  self._should_log = should_log
  self._config = config
  self._observ_filter = normalize.StreamingNormalize(
      self._batch_env.observ[0], center=True, scale=True, clip=5,
      name='normalize_observ')
  self._reward_filter = normalize.StreamingNormalize(
      self._batch_env.reward[0], center=False, scale=True, clip=10,
      name='normalize_reward')
  # Memory stores tuple of observ, action, mean, logstd, reward.
  template = (self._batch_env.observ[0], self._batch_env.action[0],
              self._batch_env.action[0], self._batch_env.action[0],
              self._batch_env.reward[0])
  self._memory = memory.EpisodeMemory(
      template, config.update_every, config.max_length, 'memory')
  self._memory_index = tf.Variable(0, False)
  use_gpu = self._config.use_gpu and utility.available_gpus()
  with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
    # Create network variables for later calls to reuse.
    action_size = self._batch_env.action.shape[1].value
    self._network = tf.make_template(
        'network', functools.partial(config.network, config, action_size))
    output = self._network(
        tf.zeros_like(self._batch_env.observ)[:, None],
        tf.ones(len(self._batch_env)))
    with tf.variable_scope('ppo_temporary'):
      self._episodes = memory.EpisodeMemory(
          template, len(batch_env), config.max_length, 'episodes')
      if output.state is None:
        self._last_state = None
      else:
        # Ensure the batch dimension is set.
        tf.contrib.framework.nest.map_structure(
            lambda x: x.set_shape([len(batch_env)] + x.shape.as_list()[1:]),
            output.state)
        # pylint: disable=undefined-variable
        self._last_state = tf.contrib.framework.nest.map_structure(
            lambda x: tf.Variable(lambda: tf.zeros_like(x), False),
            output.state)
      self._last_action = tf.Variable(
          tf.zeros_like(self._batch_env.action), False, name='last_action')
      self._last_mean = tf.Variable(
          tf.zeros_like(self._batch_env.action), False, name='last_mean')
      self._last_logstd = tf.Variable(
          tf.zeros_like(self._batch_env.action), False, name='last_logstd')
  self._penalty = tf.Variable(
      self._config.kl_init_penalty, False, dtype=tf.float32)
  self._optimizer = self._config.optimizer(self._config.learning_rate)
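# A small self-contained illustration (TF1 graph mode assumed) of the
# tf.make_template pattern used above: the first call creates the network
# variables, later calls transparently reuse them instead of duplicating them.
# The tiny dense layer here is only a stand-in for the configured network.
import tensorflow as tf


def _tiny_network(observ):
  return tf.layers.dense(observ, 4, name='hidden')

network = tf.make_template('network', _tiny_network)
first = network(tf.zeros([2, 8]))  # Creates network/hidden/{kernel,bias}.
second = network(tf.ones([2, 8]))  # Reuses the same variables.
print([var.name for var in tf.trainable_variables()])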
def __init__(self, batch_env, step, is_training, should_log, config):
  """Create an instance of the PPO algorithm.

  Args:
    batch_env: In-graph batch environment.
    step: Integer tensor holding the current training step.
    is_training: Boolean tensor for whether the algorithm should train.
    should_log: Boolean tensor for whether summaries should be returned.
    config: Object containing the agent configuration as attributes.
  """
  self._batch_env = batch_env
  self._step = step
  self._is_training = is_training
  self._should_log = should_log
  self._config = config
  self._observ_filter = normalize.StreamingNormalize(
      self._batch_env.observ[0], center=True, scale=True, clip=5,
      name='normalize_observ')
  self._reward_filter = normalize.StreamingNormalize(
      self._batch_env.reward[0], center=False, scale=True, clip=10,
      name='normalize_reward')
  # Memory stores tuple of observ, action, mean, logstd, reward.
  template = (self._batch_env.observ[0], self._batch_env.action[0],
              self._batch_env.action[0], self._batch_env.action[0],
              self._batch_env.reward[0])
  self._memory = memory.EpisodeMemory(
      template, config.update_every, config.max_length, 'memory')
  self._memory_index = tf.Variable(0, False)
  use_gpu = self._config.use_gpu and utility.available_gpus()
  with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
    # Create network variables for later calls to reuse.
    self._network(
        tf.zeros_like(self._batch_env.observ)[:, None],
        tf.ones(len(self._batch_env)), reuse=None)
    cell = self._config.network(self._batch_env.action.shape[1].value)
    with tf.variable_scope('ppo_temporary'):
      self._episodes = memory.EpisodeMemory(
          template, len(batch_env), config.max_length, 'episodes')
      self._last_state = utility.create_nested_vars(
          cell.zero_state(len(batch_env), tf.float32))
      self._last_action = tf.Variable(
          tf.zeros_like(self._batch_env.action), False, name='last_action')
      self._last_mean = tf.Variable(
          tf.zeros_like(self._batch_env.action), False, name='last_mean')
      self._last_logstd = tf.Variable(
          tf.zeros_like(self._batch_env.action), False, name='last_logstd')
  self._penalty = tf.Variable(
      self._config.kl_init_penalty, False, dtype=tf.float32)
  self._policy_optimizer = self._config.policy_optimizer(
      self._config.policy_lr, name='policy_optimizer')
  self._value_optimizer = self._config.value_optimizer(
      self._config.value_lr, name='value_optimizer')
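# A hedged sketch of what utility.create_nested_vars presumably does above:
# wrap every tensor of a nested structure (here the RNN cell's zero state) in
# a non-trainable variable so the recurrent state persists across session
# calls. The body is an assumption; only the name and call come from the code
# above.
import tensorflow as tf


def create_nested_vars_sketch(tensors):
  return tf.contrib.framework.nest.map_structure(
      lambda tensor: tf.Variable(tensor, trainable=False), tensors)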