def simulate(self, action):
    """Step the environment.

    The result of the step is written to the instance variables assigned
    below: observation, action, reward, done flag, and step counter.

    Args:
      action: Tensor holding the action to apply.

    Returns:
      Operation grouping all variable assignments of this step.
    """
    # tf.check_numerics only accepts floating point tensors, so the check is
    # skipped for any other dtype (for both the action and the observation).
    float_dtypes = (tf.float16, tf.float32, tf.float64)
    with tf.name_scope('environment/simulate'):
        if action.dtype in float_dtypes:
            action = tf.check_numerics(action, 'action')
        observ_dtype = self._parse_dtype(self._env.observation_space)
        # Only the first three values returned by the environment step are
        # used: observation, reward, and done flag.
        observ, reward, done = tf.py_func(
            lambda a: self._env.step(a)[:3], [action],
            [observ_dtype, tf.float32, tf.bool], name='step')
        if observ.dtype in float_dtypes:
            observ = tf.check_numerics(observ, 'observ')
        reward = tf.check_numerics(reward, 'reward')
        return tf.group(
            self._observ.assign(observ),
            self._action.assign(action),
            self._reward.assign(reward),
            self._done.assign(done),
            self._step.assign_add(1))
def recurrent_gaussian(config, action_size, observations, length, state=None):
    """Recurrent Gaussian policy with a separate feed forward value network.

    The policy network produces the mean action. The log standard deviation
    is a learned parameter vector that is tiled over batch and time. The
    last policy layer is a GRU cell; the value network is purely feed
    forward.

    Args:
      config: Configuration object.
      action_size: Length of the action vector.
      observations: Sequences of observations.
      length: Batch of sequence lengths.
      state: Batch of initial recurrent states.

    Returns:
      NetworkOutput tuple.
    """
    layers = tf.contrib.layers
    mean_init = layers.variance_scaling_initializer(
        factor=config.init_mean_factor)
    logstd_init = tf.random_normal_initializer(config.init_logstd, 1e-10)
    gru_cell = tf.contrib.rnn.GRUBlockCell(config.policy_layers[-1])

    # Collapse every dimension after batch and time into one feature axis.
    feature_dims = observations.shape.as_list()[2:]
    flat_shape = [
        tf.shape(observations)[0],
        tf.shape(observations)[1],
        functools.reduce(operator.mul, feature_dims, 1),
    ]
    flat_observations = tf.reshape(observations, flat_shape)

    with tf.variable_scope('policy'):
        hidden = flat_observations
        for units in config.policy_layers[:-1]:
            hidden = layers.fully_connected(hidden, units, tf.nn.relu)
        hidden, state = tf.nn.dynamic_rnn(
            gru_cell, hidden, length, state, tf.float32)
        mean = layers.fully_connected(
            hidden, action_size, tf.tanh, weights_initializer=mean_init)
        logstd_var = tf.get_variable(
            'logstd', mean.shape[2:], tf.float32, logstd_init)
        # Broadcast the parameter vector over the batch and time axes.
        multiples = [tf.shape(mean)[0], tf.shape(mean)[1]]
        multiples += [1] * (mean.shape.ndims - 2)
        logstd = tf.tile(logstd_var[None, None], multiples)

    with tf.variable_scope('value'):
        hidden = flat_observations
        for units in config.value_layers:
            hidden = layers.fully_connected(hidden, units, tf.nn.relu)
        value = layers.fully_connected(hidden, 1, None)[..., 0]

    mean = tf.check_numerics(mean, 'mean')
    logstd = tf.check_numerics(logstd, 'logstd')
    value = tf.check_numerics(value, 'value')
    policy = tf.contrib.distributions.MultivariateNormalDiag(
        mean, tf.exp(logstd))
    return NetworkOutput(policy, mean, logstd, value, state)
def transform(self, value):
    """Normalize a single value or a batch of values.

    Applies the transformations enabled on this instance (centering,
    scaling, clipping) using the current mean and variance estimates.

    Args:
      value: Batch or single value tensor.

    Returns:
      Normalized batch or single value tensor.
    """
    with tf.name_scope(self._name + '/transform'):
        # A value with the same rank as the mean carries no batch axis; add
        # one temporarily so that the code below handles both cases.
        is_single = value.shape.ndims == self._mean.shape.ndims
        if is_single:
            value = value[None, ...]
        if self._center:
            value = value - self._mean[None, ...]
        if self._scale:
            # Scaling needs at least two samples; divide by one until then.
            divisor = tf.cond(
                self._count > 1,
                lambda: self._std() + 1e-8,
                lambda: tf.ones_like(self._var_sum))
            value = value / divisor[None]
        if self._clip:
            value = tf.clip_by_value(value, -self._clip, self._clip)
        if is_single:
            # Drop the temporary batch axis again.
            value = value[0]
        return tf.check_numerics(value, 'value')
def _value_loss(self, observ, reward, length):
    """Compute the loss function for the value baseline.

    The loss is half the squared difference between the discounted
    empirical return and the value predicted by the network, masked to the
    valid time steps and averaged. Returns the loss tensor and a summary
    string tensor.

    Args:
      observ: Sequences of observations.
      reward: Sequences of reward.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
    with tf.name_scope('value_loss'):
        predicted = self._network(observ, length).value
        target = utility.discounted_return(
            reward, length, self._config.discount)
        error = target - predicted
        per_step_loss = 0.5 * self._mask(error**2, length)
        loss_histogram = tf.summary.histogram('value_loss', per_step_loss)
        loss_scalar = tf.summary.scalar(
            'avg_value_loss', tf.reduce_mean(per_step_loss))
        summary = tf.summary.merge([loss_histogram, loss_scalar])
        mean_loss = tf.reduce_mean(per_step_loss)
        return tf.check_numerics(mean_loss, 'value_loss'), summary
def perform(self, observ):
    """Compute a batch of actions and a summary for a batch of observations.

    Args:
      observ: Tensor of a batch of observations.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
    with tf.name_scope('perform/'):
        observ = self._observ_filter.transform(observ)
        # Insert a time axis of size one so that the network sees
        # sequences of length one.
        output = self._network(
            observ[:, None], tf.ones(observ.shape[0]), self._last_state)
        # Sample while training, otherwise act with the policy mean.
        action = tf.cond(
            self._is_training, output.policy.sample, lambda: output.mean)
        logprob = output.policy.log_prob(action)[:, 0]

        def _summaries():
            return tf.summary.merge([
                tf.summary.histogram('mean', output.mean[:, 0]),
                tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
                tf.summary.histogram('action', action[:, 0]),
                tf.summary.histogram('logprob', logprob),
            ])

        # str() yields an empty string when logging is disabled.
        summary = tf.cond(self._should_log, _summaries, str)
        # Remember the current policy so that it can be appended to the
        # memory in the experience callback.
        remember = [
            utility.assign_nested_vars(self._last_state, output.state),
            self._last_action.assign(action[:, 0]),
            self._last_mean.assign(output.mean[:, 0]),
            self._last_logstd.assign(output.logstd[:, 0]),
        ]
        with tf.control_dependencies(remember):
            checked_action = tf.check_numerics(action[:, 0], 'action')
            return checked_action, tf.identity(summary)
def discounted_return(reward, length, discount):
    """Discounted Monte-Carlo returns.

    Rewards beyond each sequence length are masked to zero, then the
    discounted sum is accumulated from the last time step backwards.

    Args:
      reward: Batch of reward sequences, indexed as [batch, time].
      length: Batch of sequence lengths.
      discount: Discount factor applied per time step.

    Returns:
      Tensor of returns shaped like `reward`, with gradients stopped.
    """
    steps = tf.range(reward.shape[1].value)
    valid = tf.cast(steps[None, :] < length[:, None], tf.float32)

    def accumulate(agg, cur):
        return cur + discount * agg

    # Reverse time and move it to the leading axis, since tf.scan iterates
    # over the first dimension.
    backwards = tf.transpose(tf.reverse(valid * reward, [1]), [1, 0])
    initial = tf.zeros_like(reward[:, -1])
    scanned = tf.scan(
        accumulate, backwards, initial,
        parallel_iterations=1, back_prop=False)
    # Undo the transpose and the time reversal.
    return_ = tf.reverse(tf.transpose(scanned, [1, 0]), [1])
    return tf.check_numerics(tf.stop_gradient(return_), 'return')
def fixed_step_return(reward, value, length, discount, window):
    """N-step discounted return.

    Sums `window` discounted rewards starting at every time step and
    bootstraps with the value estimate `window` steps ahead, using zero
    where that step lies beyond the end of the sequence tensor.

    Args:
      reward: Batch of reward sequences, indexed as [batch, time].
      value: Batch of value estimates, indexed like `reward`.
      length: Batch of sequence lengths.
      discount: Discount factor applied per time step.
      window: Python integer number of reward steps to sum.

    Returns:
      Tensor of masked returns shaped like `reward`, with gradients stopped.
    """
    timestep = tf.range(reward.shape[1].value)
    mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
    return_ = tf.zeros_like(reward)
    for _ in range(window):
        return_ += reward
        # Shift the rewards one step towards the past, pad the end with
        # zeros, and apply one more factor of the discount.
        reward = discount * tf.concat(
            [reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
    # The concat axis must be passed separately from the list of tensors.
    # The zero padding is sliced with `:window` so that its width stays
    # correct for window == 0, where `-window:` would select everything.
    shifted_value = tf.concat(
        [value[:, window:], tf.zeros_like(value[:, :window])], 1)
    return_ += discount**window * shifted_value
    return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
def lambda_advantage(reward, value, length, discount, lambda_=1.0):
    """Generalized Advantage Estimation.

    Accumulates the temporal difference residuals backwards in time with a
    decay of `discount * lambda_` per step. The default `lambda_` of 1.0
    decays by `discount` alone.

    Args:
      reward: Batch of reward sequences, indexed as [batch, time].
      value: Batch of value estimates, indexed like `reward`.
      length: Batch of sequence lengths.
      discount: Discount factor applied per time step.
      lambda_: Additional decay applied to the accumulated residuals.

    Returns:
      Tensor of advantages shaped like `reward`, with gradients stopped.
    """
    timestep = tf.range(reward.shape[1].value)
    mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
    # Value of the following time step; zero after the last one.
    next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
    delta = reward + discount * next_value - value
    decay = discount * lambda_
    # tf.scan iterates over the first axis, so time is reversed and moved
    # to the front, and the result is transformed back afterwards.
    advantage = tf.reverse(
        tf.transpose(
            tf.scan(
                lambda agg, cur: cur + decay * agg,
                tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
                tf.zeros_like(delta[:, -1]), 1, False),
            [1, 0]),
        [1])
    return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
def lambda_return(reward, value, length, discount, lambda_):
    """TD-lambda returns.

    Args:
      reward: Batch of reward sequences, indexed as [batch, time].
      value: Batch of value estimates, indexed like `reward`.
      length: Batch of sequence lengths.
      discount: Discount factor applied per time step.
      lambda_: Mixing coefficient between value estimates and returns.

    Returns:
      Tensor of returns shaped like `reward`, with gradients stopped.
    """
    steps = tf.range(reward.shape[1].value)
    valid = tf.cast(steps[None, :] < length[:, None], tf.float32)
    # Per-step additive term and per-step decay of the backward recursion.
    addend = valid * reward + discount * value * (1 - lambda_)
    decay = valid * discount * lambda_
    pairs = tf.stack([addend, decay], 2)

    def accumulate(agg, cur):
        return cur[0] + cur[1] * agg

    # Reverse time and reorder to [time, pair, batch] for tf.scan, which
    # iterates over the first axis.
    backwards = tf.transpose(tf.reverse(pairs, [1]), [1, 2, 0])
    initial = tf.zeros_like(value[:, -1])
    scanned = tf.scan(
        accumulate, backwards, initial,
        parallel_iterations=1, back_prop=False)
    return_ = tf.reverse(tf.transpose(scanned, [1, 0]), [1])
    return tf.check_numerics(tf.stop_gradient(return_), 'return')
def _network(self, observ, length=None, state=None, reuse=True):
    """Compute the network output for a batched sequence of observations.

    The initial recurrent state may be given. Weights are reused by
    default; the first call should pass `reuse=False`. The result bundles
    the policy as a TensorFlow distribution, the policy mean and log
    standard deviation, the value estimate, and the new recurrent state.

    Args:
      observ: Sequences of observations.
      length: Batch of sequence lengths.
      state: Batch of initial recurrent states.
      reuse: Python boolean whether to reuse previous variables.

    Returns:
      NetworkOutput tuple.
    """
    with tf.variable_scope('network', reuse=reuse):
        observ = tf.convert_to_tensor(observ)
        use_gpu = self._config.use_gpu and utility.available_gpus()
        device = '/gpu:0' if use_gpu else '/cpu:0'
        with tf.device(device):
            observ = tf.check_numerics(observ, 'observ')
            action_size = self._batch_env.action.shape[1].value
            cell = self._config.network(action_size)
            outputs, state = tf.nn.dynamic_rnn(
                cell, observ, length, state, tf.float32, swap_memory=True)
            mean, logstd, value = outputs
        mean = tf.check_numerics(mean, 'mean')
        logstd = tf.check_numerics(logstd, 'logstd')
        value = tf.check_numerics(value, 'value')
        policy = tf.contrib.distributions.MultivariateNormalDiag(
            mean, tf.exp(logstd))
        return _NetworkOutput(policy, mean, logstd, value, state)
def reset(self):
    """Reset the environment.

    Stores the new observation and clears the reward and done variables.

    Returns:
      Tensor of the current observation.
    """
    observ_dtype = self._parse_dtype(self._env.observation_space)
    observ = tf.py_func(self._env.reset, [], observ_dtype, name='reset')
    # tf.check_numerics only accepts floating point tensors, so the check is
    # skipped for observations of any other dtype.
    if observ.dtype in (tf.float16, tf.float32, tf.float64):
        observ = tf.check_numerics(observ, 'observ')
    # The returned observation is only available after the variables have
    # been updated.
    with tf.control_dependencies([
            self._observ.assign(observ),
            self._reward.assign(0),
            self._done.assign(False)]):
        return tf.identity(observ)
def perform(self, agent_indices, observ):
    """Compute a batch of actions and a summary for a batch of observations.

    Args:
      agent_indices: Tensor containing current batch indices.
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
    with tf.name_scope('perform/'):
        observ = self._observ_filter.transform(observ)
        has_state = self._last_state is not None
        if has_state:
            # Select the recurrent state of the agents that act now.
            state = tf.contrib.framework.nest.map_structure(
                lambda x: tf.gather(x, agent_indices), self._last_state)
        else:
            state = None
        # Insert a time axis of size one so that the network sees
        # sequences of length one.
        output = self._network(
            observ[:, None], tf.ones(observ.shape[0]), state)
        # Sample while training, otherwise act with the policy mean.
        action = tf.cond(
            self._is_training, output.policy.sample, lambda: output.mean)
        logprob = output.policy.log_prob(action)[:, 0]

        def _summaries():
            return tf.summary.merge([
                tf.summary.histogram('mean', output.mean[:, 0]),
                tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
                tf.summary.histogram('action', action[:, 0]),
                tf.summary.histogram('logprob', logprob),
            ])

        # str() yields an empty string when logging is disabled.
        summary = tf.cond(self._should_log, _summaries, str)
        # Remember the current policy so that it can be appended to the
        # memory in the experience callback.
        if has_state:
            assign_state = utility.assign_nested_vars(
                self._last_state, output.state, agent_indices)
        else:
            assign_state = tf.no_op()
        remember = [
            assign_state,
            tf.scatter_update(
                self._last_action, agent_indices, action[:, 0]),
            tf.scatter_update(
                self._last_mean, agent_indices, output.mean[:, 0]),
            tf.scatter_update(
                self._last_logstd, agent_indices, output.logstd[:, 0]),
        ]
        with tf.control_dependencies(remember):
            checked_action = tf.check_numerics(action[:, 0], 'action')
            return checked_action, tf.identity(summary)
def _mask(self, tensor, length):
    """Zero out the padding elements of a batch of sequences.

    After masking, the sequences can safely be summed along the time
    dimension.

    Args:
      tensor: Tensor of sequences.
      length: Batch of sequence lengths.

    Returns:
      Masked sequences.
    """
    with tf.name_scope('mask'):
        steps = tf.range(tensor.shape[1].value)
        # One where the time index lies within the sequence length.
        valid = tf.cast(steps[None, :] < length[:, None], tf.float32)
        return tf.check_numerics(tensor * valid, 'masked')
def reset(self, indices=None):
    """Reset the batch of environments.

    Stores the new observations and clears the reward and done entries of
    the environments that were reset.

    Args:
      indices: The batch indices of the environments to reset; defaults to
        all.

    Returns:
      Batch tensor of the new observations.
    """
    if indices is None:
        indices = tf.range(len(self._batch_env))
    observ_dtype = self._parse_dtype(self._batch_env.observation_space)
    observ = tf.py_func(
        self._batch_env.reset, [indices], observ_dtype, name='reset')
    # tf.check_numerics only accepts floating point tensors, so the check is
    # skipped for observations of any other dtype.
    if observ.dtype in (tf.float16, tf.float32, tf.float64):
        observ = tf.check_numerics(observ, 'observ')
    reward = tf.zeros_like(indices, tf.float32)
    done = tf.zeros_like(indices, tf.bool)
    # The returned observations are only available after the variables have
    # been updated.
    with tf.control_dependencies([
            tf.scatter_update(self._observ, indices, observ),
            tf.scatter_update(self._reward, indices, reward),
            tf.scatter_update(self._done, indices, done)]):
        return tf.identity(observ)
def _policy_loss(self, mean, logstd, old_mean, old_logstd, action,
                 advantage, length):
    """Compute the policy loss composed of multiple components.

    1. A surrogate loss: the advantage weighted by the probability ratio
       between the current and the behavioral policy.
    2. A KL penalty between the behavioral and the current policy, scaled
       by the current penalty coefficient.
    3. A quadratic penalty that is active for sequences whose KL exceeds
       the KL target times the configured cutoff factor.

    Args:
      mean: Sequences of action means of the current policy.
      logstd: Sequences of action log stddevs of the current policy.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      action: Sequences of actions.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
    with tf.name_scope('policy_loss'):
        entropy = utility.diag_normal_entropy(mean, logstd)
        masked_kl = self._mask(
            utility.diag_normal_kl(old_mean, old_logstd, mean, logstd),
            length)
        kl = tf.reduce_mean(masked_kl, 1)
        # Probability ratio between the current and the behavioral policy.
        ratio = tf.exp(
            utility.diag_normal_logpdf(mean, logstd, action) -
            utility.diag_normal_logpdf(old_mean, old_logstd, action))
        weighted = self._mask(ratio * tf.stop_gradient(advantage), length)
        surrogate_loss = -tf.reduce_mean(weighted, 1)
        kl_penalty = self._penalty * kl
        cutoff_threshold = (
            self._config.kl_target * self._config.kl_cutoff_factor)
        cutoff_count = tf.reduce_sum(
            tf.cast(kl > cutoff_threshold, tf.int32))
        # Print a message whenever at least one sequence exceeds the
        # threshold; int() supplies the matching zero for the other branch.
        print_cutoff = tf.cond(
            cutoff_count > 0,
            lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '),
            int)
        with tf.control_dependencies([print_cutoff]):
            over_threshold = tf.cast(kl > cutoff_threshold, tf.float32)
            kl_cutoff = (
                self._config.kl_cutoff_coef * over_threshold *
                (kl - cutoff_threshold)**2)
        policy_loss = surrogate_loss + kl_penalty + kl_cutoff
        histogram = tf.summary.histogram
        scalar = tf.summary.scalar
        summary = tf.summary.merge([
            histogram('entropy', entropy),
            histogram('kl', kl),
            histogram('surrogate_loss', surrogate_loss),
            histogram('kl_penalty', kl_penalty),
            histogram('kl_cutoff', kl_cutoff),
            histogram('kl_penalty_combined', kl_penalty + kl_cutoff),
            histogram('policy_loss', policy_loss),
            scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
            scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
            scalar('avg_policy_loss', tf.reduce_mean(policy_loss)),
        ])
        policy_loss = tf.reduce_mean(policy_loss, 0)
        return tf.check_numerics(policy_loss, 'policy_loss'), summary