class DDPGController:
    """
    Deep learning agent based on Deep Deterministic Policy Gradient (DDPG), as described in https://arxiv.org/pdf/1509.02971.pdf
    
    """
    def __init__(self, env, brain_name, config):
        """
        Constructor method to create the controller

        Parameters
        ----------
        env - Unity environment for the agent to solve
        brain_name, string, brain name used in conjunction with the environment
        config - Dictionary containing the following keys:
        - 'num_episodes', int, number of episodes to run the agent for
        - 'gamma', float, discount rate for future rewards
        - 'tau', float, rate for the soft update of the target network
        - 'max_memory', int, size of the replay buffer in number of samples
        - 'batch_size', int, size of the batches sampled to train the model on each update
        - 'update_every', int, update frequency, in number of steps
        - 'mlp_layers', int tuple, shape of the multilayer perceptron model
        - 'learning_rate', float, learning rate for the training of the model
        - 'state_size', int
        - 'action_size', int
        - 'num_agents', int, number of agents running in parallel in the environment

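        Example
        -------
        A minimal usage sketch; the values below are illustrative assumptions,
        and Config stands in for whatever wrapper exposes attribute access and
        as_dict():

            config = Config({'num_episodes': 300, 'gamma': 0.99, 'tau': 1e-3,
                             'max_memory': int(1e6), 'batch_size': 128,
                             'update_every': 20, 'mlp_layers': (400, 300),
                             'learning_rate': 1e-4, 'state_size': 33,
                             'action_size': 4, 'num_agents': 20})
            controller = DDPGController(env, brain_name, config)
            scores, surrogates, critic_losses = controller.solve()
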
        """
        self.env = env
        self.brain_name = brain_name
        self.__dict__.update(config.as_dict())
        self.trained_policy = Policy(config, self.state_size, self.action_size)
        self.target_policy = Policy(config, self.state_size, self.action_size)
        self.trained_critic = Critic(config, self.state_size, self.action_size)
        self.target_critic = Critic(config, self.state_size, self.action_size)
        # these target networks are never trained directly; they only receive soft updates
        self.target_policy.eval()
        self.target_critic.eval()
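        # the replay buffer stores (state, action, next_state, reward, done)
        # tuples, one row per agent per environment step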
        self.memory = AgentMemory(((self.num_agents, self.state_size),
                                   (self.num_agents, self.action_size),
                                   (self.num_agents, self.state_size),
                                   (self.num_agents, ), (self.num_agents, )),
                                  int(self.max_memory))
        self.scores = []
        self.critic_losses = []
        self.surrogates = []

        self.critic_optimizer = optim.Adam(self.trained_critic.parameters(),
                                           lr=config.learning_rate)
        self.policy_optimizer = optim.Adam(self.trained_policy.parameters(),
                                           lr=config.learning_rate)

    def solve(self):
        """
        Main method to launch the environment loop

        """
        step = 1

        for i_episode in range(1, self.num_episodes + 1):
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            rewards = []
            surrogates = []
            critic_losses = []
            while True:
                action = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = env_info.rewards
                done = env_info.local_done
                self.memory.add((state, action, next_state, reward, done))
                state = next_state
                rewards.append(reward)
                if self.memory.size >= self.batch_size and not step % self.update_every:
                    surrogate_buffer, critic_loss = self.train()
                    surrogates.append(surrogate_buffer)
                    critic_losses.append(critic_loss)
                step += 1
                if np.any(done):
                    break

            self.scores.append(np.mean(np.sum(rewards, axis=0)))
            self.surrogates.append(np.mean(surrogates))
            self.critic_losses.append(np.mean(critic_losses))

            self.print_status(i_episode)

        return self.scores, self.surrogates, self.critic_losses

    def act(self, states):
        """
        Based on states, returns the on-policy actions
        
        Parameter
        ---------
        states - float array shape=(num_agents, state_size)
        
        Return
        ---------
        Float array shape=(num_agents, action_size), chosen action

        """
        states = torch.from_numpy(states).float().to(device)
        self.trained_policy.eval()
        with torch.no_grad():
            actions = self.trained_policy(states)
        # TODO: add exploration noise
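        # A possible sketch (an assumption, not part of this implementation):
        # additive Gaussian noise clipped to the action range, e.g.
        #     noise = np.random.normal(0, 0.1, size=actions.shape)
        #     return np.clip(actions.cpu().data.numpy() + noise, -1, 1)
        # The DDPG paper instead uses an Ornstein-Uhlenbeck noise process.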
        return actions.cpu().data.numpy()

    def train(self):
        """
        Training routine to update the policy and critic from one batch sampled from the replay buffer

        Return
        ---------
        Tuple (surrogate, critic loss), as numpy values for monitoring

        """
        states, actions, next_states, rewards, dones = self.memory.sample(
            self.batch_size)

        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        next_states = torch.from_numpy(next_states).float().to(device)
        rewards = torch.from_numpy(rewards).float().to(device)
        dones = torch.from_numpy(dones).float().to(device)

        # critic update
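        # Bellman target: y = r + gamma * Q_target(s', mu_target(s')) * (1 - done);
        # the critic is trained to minimize the mean squared TD error (y - Q(s, a))^2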
        next_actions = self.target_policy(next_states)
        self.trained_critic.train()
        self.critic_optimizer.zero_grad()
        done_mask = 1 - dones
        target_states_values = rewards + self.gamma * \
            self.target_critic(next_states, next_actions) * done_mask
        predicted_states_values = self.trained_critic(states, actions)
        critic_loss = torch.mean(
            (target_states_values - predicted_states_values)**2)
        critic_loss.backward()
        self.critic_optimizer.step()

        # policy update
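        # the policy is updated by gradient ascent on the critic's value of its
        # own actions, i.e. by minimizing the surrogate -mean(Q(s, mu(s)))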
        self.trained_policy.train()
        self.policy_optimizer.zero_grad()
        action_values = self.trained_critic(states,
                                            self.trained_policy(states))
        surrogate = -torch.mean(action_values)
        surrogate.backward()
        self.policy_optimizer.step()

        self.target_network_update(self.trained_critic, self.target_critic)
        self.target_network_update(self.trained_policy, self.target_policy)

        return surrogate.cpu().data.numpy(), critic_loss.cpu().data.numpy()

    def target_network_update(self, trained_model, target_model):
        """
        Performs a soft update with rate tau from the trained_model to the target_model.

        """
        target_model_weights = target_model.get_weights()
        train_model_weights = trained_model.get_weights()
        new_weights = []
        for w1, w2 in zip(target_model_weights, train_model_weights):
            new_weights.append(w1 * (1 - self.tau) + w2 * self.tau)
        target_model.set_weights(new_weights)

    def print_status(self, i_episode):
        """
        Print the latest status of the agent

        Parameter
        ---------
        i_episode, int

        """
        print(
            "\rEpisode %d/%d | Average Score: %.2f | Surrogate: %.5f | Critic loss: %.5f  "
            % (i_episode, self.num_episodes, self.scores[-1],
               self.surrogates[-1], self.critic_losses[-1]),
            end="")
        sys.stdout.flush()


class PPOController:
    """
    Deep learning agent based on Proximal Policy Optimization, following https://arxiv.org/pdf/1506.02438.pdf

    """
    def __init__(self, env, brain_name, config, policy=None, critic=None):
        """
        Constructor method to create the controller

        Parameters
        ----------
        env - Unity environment for the agent to solve
        brain_name, string, brain name used in conjunction with the environment
        config - Dictionary containing the following keys:
        - 'num_episodes', int, number of episodes to run the agent for
        - 'epsilon_start', float, initial value for epsilon used in the PPO algorithm to clip the surrogate
        - 'epsilon_decay', float, rate of decay for epsilon, applied after every episode
        - 'gamma', float, discount rate for future rewards
        - 'tau', float, rate for the soft update of the target network
        - 'max_memory', int, size of the replay buffer in number of samples
        - 'update_every', int, update frequency, in number of steps
        - 'train_iterations', int, number of training passes over a data batch
        - 'mlp_layers', int tuple, shape of the multilayer perceptron model
        - 'learning_rate', float, learning rate for the training of the model
        - 'std', float, standard deviation used for the Normal distribution of the policy
        - 'state_size', int
        - 'action_size', int
        - 'num_agents', int, number of agents running in parallel in the environment

        policy - optional, used to pass a mock policy for testing purposes
        critic - optional, used to pass a mock critic for testing purposes

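        Example
        -------
        A minimal usage sketch; the values below are illustrative assumptions:

            config = Config({'num_episodes': 300, 'epsilon_start': 0.1,
                             'epsilon_decay': 0.999, 'gamma': 0.99, 'tau': 1e-3,
                             'max_memory': int(1e6), 'update_every': 1000,
                             'train_iterations': 10, 'mlp_layers': (64, 64),
                             'learning_rate': 3e-4, 'std': 0.1, 'state_size': 33,
                             'action_size': 4, 'num_agents': 20})
            controller = PPOController(env, brain_name, config)
            scores, surrogates = controller.solve()
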
        """
        self.env = env
        self.brain_name = brain_name
        self.__dict__.update(config.as_dict())
        self.policy = Policy(config, self.state_size,
                             self.action_size) if policy is None else policy
        self.trained_critic = Critic(
            config, self.state_size) if critic is None else critic
        self.target_critic = Critic(
            config, self.state_size) if critic is None else critic
        self.target_critic.eval()
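        # the rollout buffer stores (state, action, log_probability, next_state,
        # reward, done) tuples, one row per agent per environment step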
        self.memory = AgentMemory(
            ((self.num_agents, self.state_size),
             (self.num_agents, self.action_size), (self.num_agents, ),
             (self.num_agents, self.state_size), (self.num_agents, ),
             (self.num_agents, )), int(self.max_memory))
        self.epsilon = config.epsilon_start
        self.scores = []
        self.surrogates = []

        self.optimizer = optim.Adam([{
            'params': self.policy.parameters()
        }, {
            'params': self.trained_critic.parameters()
        }],
                                    lr=config.learning_rate)

    def solve(self):
        """
        Main method to launch the environment loop

        """
        step = 1

        for i_episode in range(1, self.num_episodes + 1):
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            rewards = []
            surrogates = []
            while True:
                action, log_probability = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = env_info.rewards
                done = env_info.local_done
                self.memory.add(
                    (state, action, log_probability, next_state, reward, done))
                state = next_state
                rewards.append(reward)
                if not step % self.update_every:
                    surrogate_buffer = self.train_loop()
                    surrogates.append(surrogate_buffer)
                step += 1
                if np.any(done):
                    break

            self.scores.append(np.mean(np.sum(rewards, axis=0)))
            self.surrogates.append(np.mean(surrogates))

            self.epsilon *= self.epsilon_decay
            self.print_status(i_episode)

        return self.scores, self.surrogates

    def act(self, states):
        """
        Based on states, returns the on-policy actions
        
        Parameter
        ---------
        states - float array shape=(num_agents, state_size)
        
        Return
        ---------
        Tuple of float arrays: chosen actions shape=(num_agents, action_size) and their log-probabilities shape=(num_agents, )

        """
        states = torch.from_numpy(states).float().to(device)
        self.policy.eval()
        actions, log_probabilities = self.policy.next_actions(states)
        return actions.cpu().data.numpy(), log_probabilities.cpu().data.numpy()

    def train_loop(self):
        """
        Training routine to update the policy and critic on the latest trajectory segment

        Return
        ---------
        List of surrogate values, one per training iteration

        """
        surrogate_buffer = []
        states, actions, old_log_probabilities, next_states, rewards, dones = self.memory.get_latest(
            self.update_every)

        future_rewards = self.compute_discounted_future_rewards(rewards)

        old_log_probabilities = torch.from_numpy(
            old_log_probabilities).float().to(device)
        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        next_states = torch.from_numpy(next_states).float().to(device)
        future_rewards = torch.from_numpy(future_rewards).float().to(device)
        dones = torch.from_numpy(dones).bool().to(device)
        self.policy.train()
        self.trained_critic.train()
        for _ in range(self.train_iterations):
            surrogate = self.compute_surrogate(old_log_probabilities, states,
                                               actions, next_states,
                                               future_rewards, dones)
            surrogate_buffer.append(surrogate.cpu().data.numpy())
            self.optimizer.zero_grad()
            surrogate.backward()
            self.optimizer.step()
            self.target_network_update()
        return surrogate_buffer

    def compute_surrogate(self, old_log_probabilities, states, actions,
                          next_states, future_rewards, dones):
        """
        Compute the surrogate, i.e. the function optimized at training time

        Parameters
        ----------
        - old_log_probabilities, float Tensor shape=(batch_size, num_agents), log-probabilities of the performed actions under the policy that generated them
        - states, float Tensor shape=(batch_size, num_agents, state_size)
        - actions, float Tensor shape=(batch_size, num_agents, action_size)
        - next_states, float Tensor shape=(batch_size, num_agents, state_size)
        - future_rewards, float Tensor shape=(batch_size, num_agents), discounted sum of future rewards over the length of the trajectory
        - dones, bool Tensor shape=(batch_size, num_agents)

        Return
        ---------
        Surrogate, float Tensor

        """
        new_log_probabilities, entropy = self.policy.get_log_probabilities_and_entropy(
            states, actions)
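        # importance sampling ratio r_t = pi_new(a_t|s_t) / pi_old(a_t|s_t),
        # computed in log space for numerical stability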
        ratio = torch.exp(new_log_probabilities - old_log_probabilities)

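        # bootstrap the value of the trajectory tail from the target critic's
        # estimate of the last next_state, unless an episode boundary was reached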
        with torch.no_grad():
            states_values = self.target_critic(states)
            next_states_values = self.target_critic(next_states[-1, :])
        if torch.any(dones):
            final_states_values = 0
        else:
            final_states_values = next_states_values.expand(
                states_values.shape)

        future_rewards = self.normalize(future_rewards)

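        # each state s_t in the batch is (T - t) steps away from the bootstrapped
        # final state, hence the gamma ** (T - t) discount applied to it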
        discount = self.gamma**torch.arange(len(states_values),
                                            0,
                                            -1,
                                            dtype=torch.float).unsqueeze(1)
        target_states_values = future_rewards + final_states_values * discount
        advantages = target_states_values - states_values

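        # PPO clipped objective: L_clip = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)];
        # the returned loss is -L_clip plus a 0.5-weighted critic MSE term and a
        # -0.01-weighted entropy bonus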
        clip = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon)
        clipped_surrogate = torch.min(ratio * advantages, clip * advantages)

        return -1 * torch.mean(
            clipped_surrogate) + 0.5 * self.trained_critic.mse(
                states_values, target_states_values) - 0.01 * entropy.mean()

    def normalize(self, a):
        """
        Normalize a torch Tensor along its last dimension (zero mean, unit standard deviation); rows with zero deviation are set to 0

        Parameters
        ----------
        - a, float Tensor to normalize

        Return
        ---------
        Normalized float Tensor with the same shape as a

        """
        mean = torch.mean(a, -1)
        std = torch.std(a, -1)
        b = a
        mask = std != 0
        b[mask] = (a[mask] - mean[mask].unsqueeze(1)) / std[mask].unsqueeze(1)
        # if the deviation is null set the normalized reward to 0
        mask = std == 0
        b[mask] = 0
        return b

    def compute_discounted_future_rewards(self, rewards):
        """
        Compute the discounted sum of future reward over the trajectory

        Parameters
        ----------
        - rewards, float array shape=(batch_size, num_agents)

        Return
        ----------
        Discounted future rewards, float array shape=(batch_size, num_agents)

        """
        # This is a bit involved, so here is a worked example with gamma = 0.5 and
        # rewards = [[1, 0],
        #            [1, 1]]
        main_dim = len(rewards)
        # discounts = [1, 0.5]
        discounts = (self.gamma**np.arange(main_dim))
        # discounts = [[1, 0.5],
        #              [1, 0.5]]
        discounts = np.tile(discounts, main_dim).reshape(main_dim, main_dim)
        # indexes = [[0, 1],
        #            [1, 2]]
        indexes = np.tile(np.arange(main_dim), main_dim).reshape(
            main_dim, main_dim) + np.arange(main_dim)[:, np.newaxis]
        # indexes = [[0, 1],
        #            [1, 0]]
        indexes = np.mod(indexes, main_dim)
        # discounts = [[1, 0.5],
        #              [0, 1]]
        discounts = np.triu(discounts[range(main_dim), indexes])
        # discounted future rewards = [[1.5, 0.5],
        #                              [1, 1]]
        return np.dot(discounts, rewards)

    def target_network_update(self):
        """
        Performs a soft update with rate tau from the trained critic to the target critic.

        """
        target_model_weights = self.target_critic.get_weights()
        train_model_weights = self.trained_critic.get_weights()
        new_weights = []
        for w1, w2 in zip(target_model_weights, train_model_weights):
            new_weights.append(w1 * (1 - self.tau) + w2 * self.tau)
        self.target_critic.set_weights(new_weights)

    def print_status(self, i_episode):
        """
        Print the latest status of the agent

        Parameter
        ---------
        i_episode, int

        """
        print(
            "\rEpisode %d/%d | Average Score: %.2f | Model surrogate: %.5f   "
            % (i_episode, self.num_episodes, self.scores[-1],
               self.surrogates[-1]),
            end="")
        sys.stdout.flush()