import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

# Note: OnePassReplayBuffer, ReplayField, EpisodeReturn, Advantage, RewardToGo,
# GradientAccumulator, MeanAccumulator, and tf_standardize are project-local
# helpers assumed to be imported from elsewhere in the codebase.


class VPG:

    def __init__(self, env, policy_fn, lr, replay_buffer_size,
                 policy_update_batch_size):
        self.env = env
        self.policy = policy_fn()
        self.policy_update_batch_size = policy_update_batch_size
        self.replay_buffer = OnePassReplayBuffer(
            buffer_size=replay_buffer_size,
            store_fields=[
                ReplayField('observation',
                            shape=self.env.observation_space.shape,
                            dtype=self.env.observation_space.dtype),
                ReplayField('action',
                            shape=self.env.action_space.shape,
                            dtype=self.env.action_space.dtype),
                ReplayField('reward'),
                ReplayField('done', dtype=np.bool),
            ],
            compute_fields=[EpisodeReturn()],
        )
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    def variables_to_checkpoint(self):
        return {'policy': self.policy, 'optimizer': self.optimizer}

    def step(self, previous_transition=None, training=False):
        observation = previous_transition[
            'observation_next'] if previous_transition else self.env.reset()
        action = self.policy.sample(tf.expand_dims(observation,
                                                   axis=0)).numpy()[0]
        observation_next, reward, done, _ = self.env.step(action)
        transition = {
            'observation': observation,
            'observation_next': observation_next,
            'action': action,
            'reward': reward,
            'done': done
        }
        if training:
            self.replay_buffer.store_transition(transition)
        return transition

    def update(self):
        dataset = self.replay_buffer.as_dataset(self.policy_update_batch_size)
        result = {
            'policy_loss': self._update_policy(dataset),
        }
        self.replay_buffer.purge()
        return result

    def _update_policy(self, dataset):
        gradient_acc = GradientAccumulator()
        loss_acc = MeanAccumulator()
        for data in dataset:
            gradients, loss = self._update_policy_step(data)
            gradient_acc.add(gradients, tf.size(loss))
            loss_acc.add(loss)
        self.optimizer.apply_gradients(
            zip(gradient_acc.gradients(), self.policy.trainable_variables))
        return loss_acc.value()

    @tf.function(experimental_relax_shapes=True)
    def _update_policy_step(self, data):
        observation, action, episode_return = data['observation'], data[
            'action'], data['episode_return']
        episode_return = tf_standardize(episode_return)
        with tf.GradientTape() as tape:
            log_probs = self.policy.log_prob(observation, action)
            loss = -(log_probs * episode_return)
        gradients = tape.gradient(loss, self.policy.trainable_variables)
        return gradients, loss
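# A minimal driver-loop sketch (not part of the original code) showing how the
# step()/update() interface above is meant to be composed. The function name and
# the num_epochs / steps_per_epoch arguments are illustrative assumptions; `env`
# and `policy_fn` follow the same interfaces VPG.__init__ expects.
def run_vpg(env, policy_fn, num_epochs=50, steps_per_epoch=4000):
    agent = VPG(env, policy_fn, lr=1e-2,
                replay_buffer_size=steps_per_epoch,
                policy_update_batch_size=steps_per_epoch)
    transition = None
    for epoch in range(num_epochs):
        for _ in range(steps_per_epoch):
            transition = agent.step(previous_transition=transition,
                                    training=True)
            if transition['done']:
                transition = None  # episode ended; reset the env on the next step
        metrics = agent.update()  # trains the policy and purges the buffer
        print('epoch', epoch, 'policy_loss', float(metrics['policy_loss']))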
class PPOPenalty:

    def __init__(self, env, policy_fn, vf_fn, lr_policy, lr_vf, gamma, lambda_,
                 beta, kl_target, kl_tolerance, beta_update_factor,
                 vf_update_iterations, policy_update_iterations,
                 policy_update_batch_size, vf_update_batch_size,
                 replay_buffer_size):
        self.env = env
        self.policy = policy_fn()
        self.policy_old = policy_fn()
        self.policy_old.set_weights(self.policy.get_weights())
        self.vf = vf_fn()
        self.lr_policy = lr_policy
        self.lr_vf = lr_vf
        self.gamma = gamma
        self.lambda_ = lambda_
        self.beta = beta
        self.kl_target = kl_target
        self.kl_tolerance = kl_tolerance
        self.beta_update_factor = beta_update_factor
        self.vf_update_iterations = vf_update_iterations
        self.policy_update_iterations = policy_update_iterations
        self.policy_update_batch_size = policy_update_batch_size
        self.vf_update_batch_size = vf_update_batch_size
        self.replay_buffer = OnePassReplayBuffer(
            buffer_size=replay_buffer_size,
            store_fields=[
                ReplayField('observation',
                            shape=self.env.observation_space.shape,
                            dtype=self.env.observation_space.dtype),
                ReplayField('action',
                            shape=self.env.action_space.shape,
                            dtype=self.env.action_space.dtype),
                ReplayField('reward'),
                ReplayField('value'),
                ReplayField('value_next'),
                ReplayField('done', dtype=np.bool),
            ],
            compute_fields=[
                Advantage(gamma=gamma, lambda_=lambda_),
                RewardToGo(gamma=gamma),
            ],
        )
        self.policy_optimizer = tf.keras.optimizers.Adam(
            learning_rate=self.lr_policy)
        self.vf_optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr_vf)

    def variables_to_checkpoint(self):
        return {
            'policy': self.policy,
            'policy_old': self.policy_old,
            'vf': self.vf,
            'policy_optimizer': self.policy_optimizer,
            'vf_optimizer': self.vf_optimizer
        }

    def step(self, previous_transition=None, training=False):
        observation = previous_transition[
            'observation_next'] if previous_transition else self.env.reset()
        value = previous_transition['value_next'] if previous_transition else \
            self.vf.compute(tf.expand_dims(observation, axis=0)).numpy()[0, 0]
        action = self.policy.sample(tf.expand_dims(observation,
                                                   axis=0)).numpy()[0]
        observation_next, reward, done, _ = self.env.step(action)
        value_next = self.vf.compute(tf.expand_dims(observation_next,
                                                    axis=0)).numpy()[0, 0]
        transition = {
            'observation': observation,
            'observation_next': observation_next,
            'action': action,
            'reward': reward,
            'value': value,
            'value_next': value_next,
            'done': done
        }
        if training:
            self.replay_buffer.store_transition(transition)
        return transition

    def update(self):
        result = {
            'policy_loss': self._update_policy(
                self.replay_buffer.as_dataset(self.policy_update_batch_size)),
            'vf_loss': self._update_vf(
                self.replay_buffer.as_dataset(self.vf_update_batch_size)),
        }
        self.replay_buffer.purge()
        return result

    def _update_policy(self, dataset):
        loss_acc = MeanAccumulator()
        for i in range(self.policy_update_iterations):
            for data in dataset:
                gradients, loss = self._update_policy_step(data)
                self.policy_optimizer.apply_gradients(
                    zip(gradients, self.policy.trainable_variables))
                loss_acc.add(loss)
        kl_acc = MeanAccumulator()
        for data in dataset:
            distribution_old = self.policy_old.distribution(
                data['observation'])
            distribution = self.policy.distribution(data['observation'])
            kl = tfp.distributions.kl_divergence(distribution_old,
                                                 distribution)
            kl_acc.add(kl)
        if kl_acc.value() < self.kl_target / self.kl_tolerance:
            self.beta /= self.beta_update_factor
        elif kl_acc.value() > self.kl_target * self.kl_tolerance:
            self.beta *= self.beta_update_factor
        self.policy_old.set_weights(self.policy.get_weights())
        return loss_acc.value()

    @tf.function(experimental_relax_shapes=True)
    def _update_policy_step(self, data):
        observation, action, advantage = data['observation'], data[
            'action'], data['advantage']
        advantage = tf_standardize(advantage)
        distribution_old = self.policy_old.distribution(observation)
        log_probs_old = distribution_old.log_prob(action)
        with tf.GradientTape() as tape:
            distribution = self.policy.distribution(observation)
            log_probs = distribution.log_prob(action)
            importance_sampling_weight = tf.exp(log_probs - log_probs_old)
            kl = tf.reduce_mean(
                tfp.distributions.kl_divergence(distribution_old,
                                                distribution))
            loss = -tf.reduce_mean(importance_sampling_weight * advantage -
                                   self.beta * kl)
        gradients = tape.gradient(loss, self.policy.trainable_variables)
        return gradients, loss

    def _update_vf(self, dataset):
        loss_acc = MeanAccumulator()
        for i in range(self.vf_update_iterations):
            gradient_acc = GradientAccumulator()
            for data in dataset:
                gradients, loss = self._update_vf_step(data)
                gradient_acc.add(gradients, tf.size(loss))
                loss_acc.add(loss)
            self.vf_optimizer.apply_gradients(
                zip(gradient_acc.gradients(), self.vf.trainable_variables))
        return loss_acc.value()

    @tf.function(experimental_relax_shapes=True)
    def _update_vf_step(self, data):
        observation, reward_to_go = data['observation'], data['reward_to_go']
        with tf.GradientTape() as tape:
            values = self.vf.compute(observation)
            loss = tf.math.squared_difference(reward_to_go,
                                              tf.squeeze(values))
        gradients = tape.gradient(loss, self.vf.trainable_variables)
        return gradients, loss
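# For reference, PPOPenalty._update_policy_step above minimizes the KL-penalized
# surrogate
#     loss(theta) = -E[ r(theta) * A - beta * KL(pi_old || pi_theta) ],
# where r(theta) = pi_theta(a|s) / pi_old(a|s) is the importance-sampling weight,
# and _update_policy then adapts the penalty coefficient after each update round:
#     mean KL < kl_target / kl_tolerance  ->  beta /= beta_update_factor
#     mean KL > kl_target * kl_tolerance  ->  beta *= beta_update_factor
# This is the adaptive-KL-penalty variant of PPO (Schulman et al., 2017) rather
# than the clipped-objective variant.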
class VPGGAE:

    def __init__(self, env, policy_fn, vf_fn, lr_policy, lr_vf, gamma, lambda_,
                 vf_update_iterations, policy_update_batch_size,
                 vf_update_batch_size, replay_buffer_size):
        self.env = env
        self.policy = policy_fn()
        self.vf = vf_fn()
        self.vf_update_iterations = vf_update_iterations
        self.policy_update_batch_size = policy_update_batch_size
        self.vf_update_batch_size = vf_update_batch_size
        self.replay_buffer = OnePassReplayBuffer(
            buffer_size=replay_buffer_size,
            store_fields=[
                ReplayField('observation',
                            shape=self.env.observation_space.shape,
                            dtype=self.env.observation_space.dtype),
                ReplayField('action',
                            shape=self.env.action_space.shape,
                            dtype=self.env.action_space.dtype),
                ReplayField('reward'),
                ReplayField('value'),
                ReplayField('value_next'),
                ReplayField('done', dtype=np.bool),
            ],
            compute_fields=[
                Advantage(gamma=gamma, lambda_=lambda_),
                RewardToGo(gamma=gamma),
            ],
        )
        self.policy_optimizer = tf.keras.optimizers.Adam(
            learning_rate=lr_policy)
        self.vf_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_vf)

    def variables_to_checkpoint(self):
        return {
            'policy': self.policy,
            'vf': self.vf,
            'policy_optimizer': self.policy_optimizer,
            'vf_optimizer': self.vf_optimizer
        }

    def step(self, previous_transition=None, training=False):
        observation = previous_transition[
            'observation_next'] if previous_transition else self.env.reset()
        value = previous_transition['value_next'] if previous_transition else \
            self.vf.compute(tf.expand_dims(observation, axis=0)).numpy()[0, 0]
        action = self.policy.sample(tf.expand_dims(observation,
                                                   axis=0)).numpy()[0]
        observation_next, reward, done, _ = self.env.step(action)
        value_next = self.vf.compute(tf.expand_dims(observation_next,
                                                    axis=0)).numpy()[0, 0]
        transition = {
            'observation': observation,
            'observation_next': observation_next,
            'action': action,
            'reward': reward,
            'value': value,
            'value_next': value_next,
            'done': done
        }
        if training:
            self.replay_buffer.store_transition(transition)
        return transition

    def update(self):
        result = {
            'policy_loss': self._update_policy(
                self.replay_buffer.as_dataset(self.policy_update_batch_size)),
            'vf_loss': self._update_vf(
                self.replay_buffer.as_dataset(self.vf_update_batch_size)),
        }
        self.replay_buffer.purge()
        return result

    def _update_policy(self, dataset):
        gradient_acc = GradientAccumulator()
        loss_acc = MeanAccumulator()
        for data in dataset:
            gradients, loss = self._update_policy_step(data)
            gradient_acc.add(gradients, tf.size(loss))
            loss_acc.add(loss)
        self.policy_optimizer.apply_gradients(
            zip(gradient_acc.gradients(), self.policy.trainable_variables))
        return loss_acc.value()

    @tf.function(experimental_relax_shapes=True)
    def _update_policy_step(self, data):
        observation, action, advantage = data['observation'], data[
            'action'], data['advantage']
        advantage = tf_standardize(advantage)
        with tf.GradientTape() as tape:
            log_probs = self.policy.log_prob(observation, action)
            loss = -(log_probs * advantage)
        gradients = tape.gradient(loss, self.policy.trainable_variables)
        return gradients, loss

    def _update_vf(self, dataset):
        loss_acc = MeanAccumulator()
        for i in range(self.vf_update_iterations):
            gradient_acc = GradientAccumulator()
            for data in dataset:
                gradients, loss = self._update_vf_step(data)
                gradient_acc.add(gradients, tf.size(loss))
                loss_acc.add(loss)
            self.vf_optimizer.apply_gradients(
                zip(gradient_acc.gradients(), self.vf.trainable_variables))
        return loss_acc.value()

    @tf.function(experimental_relax_shapes=True)
    def _update_vf_step(self, data):
        observation, reward_to_go = data['observation'], data['reward_to_go']
        with tf.GradientTape() as tape:
            values = self.vf.compute(observation)
            loss = tf.math.squared_difference(reward_to_go,
                                              tf.squeeze(values))
        gradients = tape.gradient(loss, self.vf.trainable_variables)
        return gradients, loss
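# The Advantage(gamma, lambda_) compute field used above presumably turns the
# stored (reward, value, value_next, done) columns into generalized advantage
# estimates. A standalone sketch of that standard GAE recursion (an assumption
# about the helper's internals; the project's actual implementation lives
# elsewhere):
#     delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
#     A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
def gae_sketch(reward, value, value_next, done, gamma, lambda_):
    done = done.astype(np.float32)
    delta = reward + gamma * (1.0 - done) * value_next - value
    advantage = np.zeros_like(delta)
    running = 0.0
    for t in reversed(range(len(delta))):  # accumulate backwards through time
        running = delta[t] + gamma * lambda_ * (1.0 - done[t]) * running
        advantage[t] = running
    return advantage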
class TRPO:

    def __init__(self, env, policy_fn, vf_fn, lr_vf, gamma, lambda_, delta,
                 replay_buffer_size, policy_update_batch_size,
                 vf_update_batch_size, vf_update_iterations,
                 conjugate_gradient_iterations, conjugate_gradient_tol,
                 line_search_iterations, line_search_coefficient):
        self.env = env
        self.policy = policy_fn()
        self.vf = vf_fn()
        self.gamma = gamma
        self.lambda_ = lambda_
        self.delta = delta
        self.vf_update_iterations = vf_update_iterations
        self.policy_update_batch_size = policy_update_batch_size
        self.vf_update_batch_size = vf_update_batch_size
        self.conjugate_gradient_iterations = conjugate_gradient_iterations
        self.conjugate_gradient_tol = conjugate_gradient_tol
        self.line_search_iterations = line_search_iterations
        self.line_search_coefficient = line_search_coefficient
        self.replay_buffer = OnePassReplayBuffer(
            buffer_size=replay_buffer_size,
            store_fields=[
                ReplayField('observation',
                            shape=self.env.observation_space.shape,
                            dtype=self.env.observation_space.dtype),
                ReplayField('action',
                            shape=self.env.action_space.shape,
                            dtype=self.env.action_space.dtype),
                ReplayField('reward'),
                ReplayField('value'),
                ReplayField('value_next'),
                ReplayField('done', dtype=np.bool),
            ],
            compute_fields=[
                Advantage(gamma=gamma, lambda_=lambda_),
                RewardToGo(gamma=gamma),
            ],
        )
        self.vf_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_vf)

    def variables_to_checkpoint(self):
        return {
            'policy': self.policy,
            'vf': self.vf,
            'vf_optimizer': self.vf_optimizer
        }

    def step(self, previous_transition=None, training=False):
        observation = previous_transition[
            'observation_next'] if previous_transition else self.env.reset()
        value = previous_transition['value_next'] if previous_transition else \
            self.vf.compute(tf.expand_dims(observation, axis=0)).numpy()[0, 0]
        action = self.policy.sample(tf.expand_dims(observation,
                                                   axis=0)).numpy()[0]
        observation_next, reward, done, _ = self.env.step(action)
        value_next = self.vf.compute(tf.expand_dims(observation_next,
                                                    axis=0)).numpy()[0, 0]
        transition = {
            'observation': observation,
            'observation_next': observation_next,
            'action': action,
            'reward': reward,
            'value': value,
            'value_next': value_next,
            'done': done
        }
        if training:
            self.replay_buffer.store_transition(transition)
        return transition

    def update(self):
        result = {
            'policy_loss': self._update_policy(
                self.replay_buffer.as_dataset(self.policy_update_batch_size)),
            'vf_loss': self._update_vf(
                self.replay_buffer.as_dataset(self.vf_update_batch_size)),
        }
        self.replay_buffer.purge()
        return result

    def _update_policy(self, dataset):
        loss_acc = MeanAccumulator()
        for data in dataset:  # TODO: is batching here correct?
            observation, action, advantage = data['observation'], data[
                'action'], data['advantage']
            advantage = tf_standardize(advantage)
            log_probs_old = self.policy.log_prob(observation, action)
            distribution_old = self.policy.distribution(observation)
            with tf.GradientTape() as tape:
                loss_old = self._surrogate_loss(observation, action,
                                                advantage, log_probs_old)
            gradients = tape.gradient(loss_old,
                                      self.policy.trainable_variables)
            gradients = tf.concat([tf.reshape(g, [-1]) for g in gradients],
                                  axis=0)
            Ax = lambda v: self._fisher_vector_product(v, observation,
                                                       distribution_old)
            step_direction = self._conjugate_gradient(Ax, gradients)
            loss = self._line_search(observation, action, advantage, Ax,
                                     step_direction, distribution_old,
                                     log_probs_old, loss_old)
            loss_acc.add(loss)
        return loss_acc.value()

    def _surrogate_loss(self, observation, action, advantage, log_probs_old):
        log_probs = self.policy.log_prob(observation, action)
        importance_sampling_weight = tf.exp(log_probs - log_probs_old)
        return -tf.reduce_mean(importance_sampling_weight * advantage)

    def _kl_divergence(self, observation, distribution_old):
        distribution = self.policy.distribution(observation)
        return tf.reduce_mean(
            tfp.distributions.kl_divergence(distribution_old, distribution))

    def _fisher_vector_product(self, v, observation, distribution_old):
        with tf.GradientTape(persistent=True) as tape:
            kl = self._kl_divergence(observation, distribution_old)
            gradients = tape.gradient(kl, self.policy.trainable_variables)
            gradients = tf.concat([tf.reshape(g, [-1]) for g in gradients],
                                  axis=0)
            grad_vector_product = tf.reduce_sum(gradients * v)
        hessian_vector_product = tape.gradient(
            grad_vector_product, self.policy.trainable_variables)
        hessian_vector_product = tf.concat(
            [tf.reshape(g, [-1]) for g in hessian_vector_product], axis=0)
        return hessian_vector_product

    def _conjugate_gradient(self, Ax, b):
        x = tf.zeros_like(b)
        r = tf.identity(b)
        p = tf.identity(b)
        r2 = tf.tensordot(r, r, 1)
        for _ in tf.range(self.conjugate_gradient_iterations):
            z = Ax(p)
            alpha = r2 / (tf.tensordot(p, z, 1) + 1e-8)
            x += alpha * p
            r -= alpha * z
            r2_i = tf.tensordot(r, r, 1)
            p = r + (r2_i / r2) * p
            r2 = r2_i
            if r2 < self.conjugate_gradient_tol:
                break
        return x

    def _line_search(self, observation, action, advantage, Ax, step_direction,
                     distribution_old, log_probs_old, loss_old):
        sAs = tf.tensordot(step_direction, Ax(step_direction), 1)
        beta = tf.math.sqrt((2 * self.delta) / (sAs + 1e-8))
        theta_old = self.policy.get_weights()
        shapes = [w.shape for w in theta_old]
        step_direction = tf.split(step_direction,
                                  [tf.reduce_prod(s) for s in shapes])
        step_direction = [
            tf.reshape(sd, s) for sd, s in zip(step_direction, shapes)
        ]
        for i in range(self.line_search_iterations):
            theta = [
                w - beta * sd * (self.line_search_coefficient**i)
                for w, sd in zip(theta_old, step_direction)
            ]
            self.policy.set_weights(theta)
            kl = self._kl_divergence(observation, distribution_old)
            loss = self._surrogate_loss(observation, action, advantage,
                                        log_probs_old)
            if kl <= self.delta and loss <= loss_old:
                return loss
            if i == self.line_search_iterations - 1:
                self.policy.set_weights(theta_old)
                return loss_old

    def _update_vf(self, dataset):
        loss_acc = MeanAccumulator()
        for i in range(self.vf_update_iterations):
            gradient_acc = GradientAccumulator()
            for data in dataset:
                gradients, loss = self._update_vf_step(data)
                gradient_acc.add(gradients, tf.size(loss))
                loss_acc.add(loss)
            self.vf_optimizer.apply_gradients(
                zip(gradient_acc.gradients(), self.vf.trainable_variables))
        return loss_acc.value()

    @tf.function(experimental_relax_shapes=True)
    def _update_vf_step(self, data):
        observation, reward_to_go = data['observation'], data['reward_to_go']
        with tf.GradientTape() as tape:
            values = self.vf.compute(observation)
            loss = tf.math.squared_difference(reward_to_go,
                                              tf.squeeze(values))
        gradients = tape.gradient(loss, self.vf.trainable_variables)
        return gradients, loss
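# For reference, TRPO._update_policy above takes a natural-gradient trust-region
# step: it maximizes the surrogate E[ r(theta) * A ] subject to
# KL(pi_old || pi_theta) <= delta by
#   (1) solving H x = g with conjugate gradient (_conjugate_gradient), where H is
#       the Fisher matrix accessed only through Hessian-vector products
#       (_fisher_vector_product) and g is the surrogate-loss gradient,
#   (2) scaling the step by beta = sqrt(2 * delta / (x^T H x)), and
#   (3) running a backtracking line search (_line_search) that shrinks the step
#       by line_search_coefficient until both KL <= delta and the surrogate loss
#       does not get worse, restoring the old weights if no step is accepted.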