Code Example #1
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

# Actor and Critic are the policy and value network classes defined elsewhere
# in the project; they are not part of this listing.


class PPOAgent:
    def __init__(self,
                 rows,
                 columns,
                 num_actions,
                 l_rate=1e-4,
                 gamma=0.99,
                 lam=0.95,
                 policy_kl_range=0.0008,
                 policy_params=20,
                 value_clip=1.0,
                 loss_coefficient=1.0,
                 entropy_coefficient=0.05):
        self.rows = rows
        self.columns = columns
        self.num_actions = num_actions

        self.actor = Actor(self.num_actions)
        self.critic = Critic()
        self.actor_old = Actor(self.num_actions)
        self.critic_old = Critic()

        self.optimizer = tf.keras.optimizers.Adam(l_rate)

        self.gamma = gamma
        self.lam = lam
        self.policy_kl_range = policy_kl_range
        self.policy_params = policy_params
        self.value_clip = value_clip
        self.loss_coefficient = loss_coefficient
        self.entropy_coefficient = entropy_coefficient

    @tf.function
    def fit(self, states, actions, rewards, next_states, dones):
        # One training step: compute the combined actor/critic loss under a
        # gradient tape, then apply the gradients to both networks jointly.
        with tf.GradientTape() as tape:
            action_probabilities, values = self.actor(states), self.critic(
                states)
            old_action_probabilities, old_values = self.actor_old(
                states), self.critic_old(states)
            next_values = self.critic(next_states)
            loss = self._get_loss(action_probabilities, values,
                                  old_action_probabilities, old_values,
                                  next_values, actions, rewards, dones)
        grads = tape.gradient(
            loss,
            self.actor.trainable_variables + self.critic.trainable_variables)
        self.optimizer.apply_gradients(
            zip(
                grads, self.actor.trainable_variables +
                self.critic.trainable_variables))

    def select_action(self, state, training=False):
        state_in = tf.expand_dims(state, axis=0)
        probabilities = self.actor(state_in)

        if training:
            # During training, sample from the categorical policy for exploration.
            distribution = tfp.distributions.Categorical(probs=probabilities)
            action = distribution.sample()
            action = int(action[0])
        else:
            # At evaluation time, act greedily on the predicted probabilities.
            action = tf.argmax(probabilities[0]).numpy()

        return action

    def get_model(self):
        return self.actor

    def update_networks(self):
        self.actor_old.set_weights(self.actor.get_weights())
        self.critic_old.set_weights(self.critic.get_weights())

    def save_model_weights(self, actor_filename, critic_filename):
        self.actor.save_weights(actor_filename)
        self.critic.save_weights(critic_filename)

    def load_model_weights(self, actor_filename, critic_filename=None):
        # Run a dummy forward pass so the networks are built before the saved
        # weights are restored.
        self.actor(np.zeros((1, self.rows, self.columns, 1)))
        self.actor.load_weights(actor_filename)
        self.actor_old(np.zeros((1, self.rows, self.columns, 1)))
        self.actor_old.load_weights(actor_filename)

        if critic_filename is not None:
            self.critic(np.zeros((1, self.rows, self.columns, 1)))
            self.critic.load_weights(critic_filename)
            self.critic_old(np.zeros((1, self.rows, self.columns, 1)))
            self.critic_old.load_weights(critic_filename)

    def save_optimizer_weights(self, filename):
        np.save(filename, self.optimizer.get_weights())

    def load_optimizer_weights(self, filename):
        optimizer_weights = np.load(filename, allow_pickle=True)
        model_weights = self.actor.trainable_variables + self.critic.trainable_variables
        # Apply a dummy all-zero gradient step so the optimizer creates its slot
        # variables; only then can the saved optimizer state be restored.
        zero_grads = [tf.zeros_like(w) for w in model_weights]
        self.optimizer.apply_gradients(zip(zero_grads, model_weights))
        self.optimizer.set_weights(optimizer_weights)

    def _get_loss(self, action_probabilities, values, old_action_probabilities,
                  old_values, next_values, actions, rewards, dones):
        old_values = tf.stop_gradient(old_values)

        # GAE advantages: the returns serve as value targets, while the
        # advantages used in the policy term are normalized for stability.
        advantages = self._generalized_advantages_estimation(
            values, rewards, next_values, dones)
        returns = tf.stop_gradient(advantages + values)
        advantages = tf.stop_gradient(
            (advantages - tf.math.reduce_mean(advantages)) /
            (tf.math.reduce_std(advantages) + 1e-7))

        log_probabilities = self._log_probabilities(action_probabilities,
                                                    actions)
        old_log_probabilities = tf.stop_gradient(
            self._log_probabilities(old_action_probabilities, actions))
        ratios = tf.math.exp(log_probabilities - old_log_probabilities)

        kl_divergence = self._kl_divergence(old_action_probabilities,
                                            action_probabilities)

        # Policy surrogate with a KL-based rollback: when the new policy drifts
        # beyond policy_kl_range from the old one (and the ratio exceeds 1),
        # the objective is penalized by policy_params * KL.
        policy_gradient_loss = tf.where(
            tf.logical_and(kl_divergence >= self.policy_kl_range, ratios > 1),
            ratios * advantages - self.policy_params * kl_divergence,
            ratios * advantages)
        policy_gradient_loss = tf.math.reduce_mean(policy_gradient_loss)

        entropy = tf.math.reduce_mean(self._entropy(action_probabilities))

        # Clipped value loss around the old value estimates, taking the
        # pessimistic maximum of the clipped and unclipped squared errors.
        clipped_values = old_values + tf.clip_by_value(
            values - old_values, -self.value_clip, self.value_clip)
        values_losses = tf.math.square(returns - values) * 0.5
        clipped_values_losses = tf.math.square(returns - clipped_values) * 0.5

        critic_loss = tf.math.reduce_mean(
            tf.math.maximum(values_losses, clipped_values_losses))
        # Total loss: weighted critic loss minus the policy objective and an
        # entropy bonus (the latter two are maximized).
        loss = (critic_loss * self.loss_coefficient) - policy_gradient_loss - (
            entropy * self.entropy_coefficient)

        return loss

    def _generalized_advantages_estimation(self, values, rewards, next_values,
                                           dones):
        # Backward recursion over the trajectory: accumulate discounted,
        # lambda-weighted TD errors, resetting at terminal transitions.
        gae = 0
        advantages = []
        delta = rewards + (1.0 - dones) * self.gamma * next_values - values
        for i in reversed(range(len(rewards))):
            gae = delta[i] + (1.0 - dones[i]) * self.gamma * self.lam * gae
            advantages.insert(0, gae)

        return tf.stack(advantages)

    def _log_probabilities(self, action_probabilities, actions):
        distribution = tfp.distributions.Categorical(
            probs=action_probabilities)
        return tf.expand_dims(distribution.log_prob(actions), axis=1)

    def _kl_divergence(self, probabilities1, probabilities2):
        distribution1 = tfp.distributions.Categorical(probs=probabilities1)
        distribution2 = tfp.distributions.Categorical(probs=probabilities2)
        return tf.expand_dims(tfp.distributions.kl_divergence(
            distribution1, distribution2),
                              axis=1)

    def _entropy(self, probabilities):
        distribution = tfp.distributions.Categorical(probs=probabilities)
        return distribution.entropy()
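
A minimal usage sketch for PPOAgent, not part of the original listing: the board size, rollout length, and the Gym-style `env` object are illustrative assumptions, with tensor shapes chosen to match the agent's expectations (states of shape (rows, columns, 1), rewards and dones as column vectors).

# Hypothetical usage of PPOAgent; `env` is a placeholder environment with a
# Gym-style reset()/step() interface and board-shaped observations.
agent = PPOAgent(rows=6, columns=7, num_actions=7)

states, actions, rewards, next_states, dones = [], [], [], [], []
state = env.reset()
for _ in range(128):  # collect a short rollout
    action = agent.select_action(state, training=True)
    next_state, reward, done, _ = env.step(action)
    states.append(state)
    actions.append(action)
    rewards.append(reward)
    next_states.append(next_state)
    dones.append(float(done))
    state = env.reset() if done else next_state

agent.fit(tf.convert_to_tensor(states, tf.float32),
          tf.convert_to_tensor(actions, tf.int32),
          tf.convert_to_tensor(rewards, tf.float32)[:, None],
          tf.convert_to_tensor(next_states, tf.float32),
          tf.convert_to_tensor(dones, tf.float32)[:, None])
agent.update_networks()  # sync the old networks after each update
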
Code Example #2
import numpy as np
from tqdm import tqdm

# Actor, Critic, MemoryBuffer, OrnsteinUhlenbeckProcess, gather_stats, and
# tf_summary are helper classes and functions defined elsewhere in the project.


class DDPG:
    """Deep Deterministic Policy Gradient (DDPG) helper class."""

    def __init__(self, act_dim, env_dim, act_range, buffer_size=20000, gamma=0.99, lr=0.00005, tau=0.001):
        """Initialization"""
        # Environment and DDPG hyperparameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = env_dim
        self.gamma = gamma
        self.lr = lr
        # Create actor and critic networks
        self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.critic = Critic(self.env_dim, act_dim, lr, tau)
        self.buffer = MemoryBuffer(buffer_size)

    def policy_action(self, s):
        """Use the actor network to predict the action for state s."""
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """Use the Bellman Equation to compute the critic target"""
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer"""
        self.buffer.memorize(state, action, reward, done, new_state)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience"""
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions,
                         np.array(grads).reshape((-1, self.act_dim)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def train(self, env, summary_writer, nb_episodes=12, batch_size=64, render=False, gather_train_stats=False):
        results = []

        # First, gather experience
        tqdm_e = tqdm(range(nb_episodes),
                      desc='Score', leave=True, unit=" episodes")
        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            actions, states, rewards = [], [], []
            noise = OrnsteinUhlenbeckProcess(size=self.act_dim)
            while not done:
                if render:
                    env.render()
                # Actor picks an action (following the deterministic policy)
                a = self.policy_action(old_state)
                # Clip continuous values to be valid w.r.t. environment
                a = np.clip(a + noise.generate(time),
                            -self.act_range, self.act_range)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Add outputs to memory buffer
                self.memorize(old_state, a, r, done, new_state)
                # Sample experience from buffer
                states, actions, rewards, dones, new_states, _ = self.sample_batch(
                    batch_size)
                # Predict target q-values using target networks
                q_values = self.critic.target_predict(
                    [new_states, self.actor.target_predict(new_states)])
                # Compute critic target
                critic_target = self.bellman(rewards, q_values, dones)
                # Train both networks on sampled batch, update target networks
                self.update_models(states, actions, critic_target)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1

            # Gather stats every episode for plotting
            if gather_train_stats:
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tf_summary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()
            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def save_weights(self, path):
        path += '_LR_{}'.format(self.lr)
        self.actor.save(path)
        self.critic.save(path)

    def load_weights(self, path_actor, path_critic):
        self.critic.load_weights(path_critic)
        self.actor.load_weights(path_actor)
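
A minimal usage sketch for the DDPG class, not part of the original listing: the Pendulum environment, the log directory, and the TF1-style summary writer are illustrative assumptions chosen to match the older Gym step API and the add_summary calls used in train().

# Hypothetical usage of DDPG on a continuous-control task (older Gym API assumed).
import gym
import tensorflow as tf

env = gym.make('Pendulum-v0')
state_dim = env.observation_space.shape     # passed as env_dim (assumed format)
action_dim = env.action_space.shape[0]
action_range = env.action_space.high[0]

agent = DDPG(act_dim=action_dim, env_dim=state_dim, act_range=action_range)
summary_writer = tf.compat.v1.summary.FileWriter('./logs')  # TF1-style writer expected by train()
results = agent.train(env, summary_writer, nb_episodes=100, batch_size=64)
agent.save_weights('./models/ddpg')
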