Code example #1
class DQN_Agent(Base_Agent):
    agent_name = "DQN"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed)
        self.q_network_local = Neural_Network(self.state_size, self.action_size, config.seed, self.hyperparameters, "VANILLA_NN").to(self.device)
        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"])

    def step(self):
        """Runs a step within a game including a learning step if required"""
        while not self.done:
            self.pick_and_conduct_action()
            self.update_next_state_reward_done_and_score()
            if self.time_for_q_network_to_learn():
                self.q_network_learn()
            self.save_experience()
            self.state = self.next_state #this is to set the state for the next iteration
            self.episode_step_number += 1
        self.episode_number += 1

    def pick_action(self):
        """Uses the local Q network and an epsilon greedy policy to pick an action"""

        # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add
        # a "fake" dimension to make it a mini-batch rather than a single observation
        state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)

        self.q_network_local.eval() #puts network in evaluation mode
        with torch.no_grad():
            action_values = self.q_network_local(state)
        self.q_network_local.train() #puts network back in training mode

        action = self.make_epsilon_greedy_choice(action_values)
        return action

    def make_epsilon_greedy_choice(self, action_values):
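        """Chooses a random action with probability epsilon (which decays with the episode number), otherwise the greedy action"""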
        epsilon = self.hyperparameters["epsilon"] / (1.0 + (self.episode_number / self.hyperparameters["epsilon_decay_rate_denominator"]))

        if random.random() > epsilon:
            return np.argmax(action_values.data.cpu().numpy())
        return random.choice(np.arange(self.action_size))

    def q_network_learn(self, experiences_given=False, experiences=None):
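        """Runs a learning iteration for the Q network using either the experiences provided or a sample from the replay buffer"""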

        if not experiences_given:
            states, actions, rewards, next_states, dones = self.sample_experiences() #Sample experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        if self.done: #we only update the learning rate at end of each episode
            self.update_learning_rate(self.hyperparameters["learning_rate"], self.q_network_optimizer)
        self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss, self.hyperparameters["gradient_clipping_norm"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
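        """Computes the MSE loss between the predicted Q values and the TD targets"""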
        with torch.no_grad():
            Q_targets = self.compute_q_targets(next_states, rewards, dones)
        Q_expected = self.compute_expected_q_values(states, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        return loss

    def compute_q_targets(self, next_states, rewards, dones):
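        """Computes the TD targets from the rewards and the Q values of the next states"""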
        Q_targets_next = self.compute_q_values_for_next_states(next_states)
        Q_targets = self.compute_q_values_for_current_states(rewards, Q_targets_next, dones)
        return Q_targets

    def compute_q_values_for_next_states(self, next_states):
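        """Computes the maximum Q value over actions for each next state"""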
        Q_targets_next = self.q_network_local(next_states).detach().max(1)[0].unsqueeze(1)
        return Q_targets_next

    def compute_q_values_for_current_states(self, rewards, Q_targets_next, dones):
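        """Computes the TD targets: reward plus the discounted next-state Q value, zeroed out for terminal states"""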
        Q_targets_current = rewards + (self.hyperparameters["discount_rate"] * Q_targets_next * (1 - dones))
        return Q_targets_current

    def compute_expected_q_values(self, states, actions):
        Q_expected = self.q_network_local(states).gather(1, actions.long()) #must convert actions to long so can be used as index
        return Q_expected

    def locally_save_policy(self):
        pass
        # torch.save(self.q_network_local.state_dict(), "Models/{}_local_network.pt".format(self.agent_name))

    def time_for_q_network_to_learn(self):
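        """Returns whether enough steps have been taken and enough experiences collected for the Q network to learn"""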
        return self.right_amount_of_steps_taken() and self.enough_experiences_to_learn_from()

    def right_amount_of_steps_taken(self):
        return self.episode_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def sample_experiences(self):
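        """Draws a random batch of experiences from the replay buffer"""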
        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences
        return states, actions, rewards, next_states, dones
Code example #2
File: REINFORCE_Agent.py  Project: johede3/AI
class REINFORCE_Agent(Base_Agent):
    agent_name = "REINFORCE"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.policy = Neural_Network(self.state_size, self.action_size,
                                     config.seed, self.hyperparameters,
                                     "VANILLA_NN").to(self.device)
        self.optimizer = optim.Adam(self.policy.parameters(),
                                    lr=self.hyperparameters["learning_rate"])
        self.episode_rewards = []
        self.episode_log_probabilities = []

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        self.environment.reset_environment()
        self.state = self.environment.get_state()
        self.next_state = None
        self.action = None
        self.reward = None
        self.done = False
        self.total_episode_score_so_far = 0
        self.episode_rewards = []
        self.episode_log_probabilities = []
        self.episode_step_number = 0

    def step(self):
        """Runs a step within a game including a learning step if required"""
        while not self.done:
            self.pick_and_conduct_action_and_save_log_probabilities()
            self.update_next_state_reward_done_and_score()
            self.store_reward()
            if self.time_to_learn():
                self.actor_learn()
            self.state = self.next_state  #this is to set the state for the next iteration
            self.episode_step_number += 1
        self.episode_number += 1

    def pick_and_conduct_action_and_save_log_probabilities(self):
        action, log_probabilities = self.pick_action_and_get_log_probabilities()
        self.store_log_probabilities(log_probabilities)
        self.store_action(action)
        self.conduct_action()

    def pick_action_and_get_log_probabilities(self):
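        """Picks an action by sampling from the policy's output distribution and returns it together with its log probability"""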

        # PyTorch only accepts mini-batches and not individual observations so we have to add
        # a "fake" dimension to our observation using unsqueeze
        state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)
        action_probabilities = self.policy.forward(state).cpu()
        action_distribution = Categorical(action_probabilities)  # this creates a distribution to sample from
        action = action_distribution.sample()
        return action.item(), action_distribution.log_prob(action)

    def store_log_probabilities(self, log_probabilities):
        self.episode_log_probabilities.append(log_probabilities)

    def store_action(self, action):
        self.action = action

    def store_reward(self):
        self.episode_rewards.append(self.reward)

    def actor_learn(self):
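        """Runs a learning iteration for the policy using the rewards and log probabilities stored over the episode"""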
        total_discounted_reward = self.calculate_episode_discounted_reward()
        policy_loss = self.calculate_policy_loss_on_episode(
            total_discounted_reward)
        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

    def calculate_episode_discounted_reward(self):
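        """Calculates the total discounted reward collected over the episode"""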
        discounts = self.hyperparameters["discount_rate"]**np.arange(
            len(self.episode_rewards))
        total_discounted_reward = np.dot(discounts, self.episode_rewards)
        return total_discounted_reward

    def calculate_policy_loss_on_episode(self, total_discounted_reward):
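        """Calculates the REINFORCE loss: the sum over the episode of -log_prob * total discounted reward"""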
        policy_loss = []
        for log_prob in self.episode_log_probabilities:
            policy_loss.append(-log_prob * total_discounted_reward)
        # We need to add up the losses across the episode to get 1 overall loss
        policy_loss = torch.cat(policy_loss).sum()
        # policy_loss = Variable(policy_loss, requires_grad = True)
        return policy_loss

    def time_to_learn(self):
        """Tells us whether it is time for the algorithm to learn. With REINFORCE we only learn at the end of every
        episode so this just returns whether the episode is over"""
        return self.done
Code example #3
File: PPO_Agent.py  Project: johede3/AI
class PPO_Agent(Base_Agent):
    agent_name = "PPO"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.policy_output_size = self.calculate_policy_output_size()
        self.policy_new = Neural_Network(self.state_size,
                                         self.policy_output_size,
                                         self.random_seed,
                                         self.hyperparameters,
                                         "VANILLA_NN").to(self.device)
        self.policy_old = Neural_Network(self.state_size,
                                         self.policy_output_size,
                                         self.random_seed,
                                         self.hyperparameters,
                                         "VANILLA_NN").to(self.device)
        self.max_steps_per_episode = config.environment.get_max_steps_per_episode()
        self.policy_new_optimizer = optim.Adam(
            self.policy_new.parameters(),
            lr=self.hyperparameters["learning_rate"])
        self.episode_number = 0
        self.many_episode_states = []
        self.many_episode_actions = []
        self.many_episode_rewards = []
        self.experience_generator = Parallel_Experience_Generator(
            self.environment, self.policy_new, self.random_seed,
            self.hyperparameters)

    def calculate_policy_output_size(self):
        """Initialises the policies"""
        if self.action_types == "DISCRETE":
            return self.action_size
        elif self.action_types == "CONTINUOUS":
            return self.action_size * 2  #Because we need 1 parameter for mean and 1 for std of distribution

    def step(self):
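        """Runs a learning round: plays a batch of episodes, updates the new policy, and then copies it into the old policy"""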
        self.many_episode_states, self.many_episode_actions, self.many_episode_rewards = self.experience_generator.play_n_episodes(
            self.hyperparameters["episodes_per_learning_round"])
        self.episode_number += self.hyperparameters["episodes_per_learning_round"]
        self.policy_learn()
        self.update_learning_rate(self.hyperparameters["learning_rate"],
                                  self.policy_new_optimizer)
        self.equalise_policies()

    def policy_learn(self):
        """A learning round for the policy"""
        all_discounted_returns = self.calculate_all_discounted_returns()
        if self.hyperparameters["normalise_rewards"]:
            all_discounted_returns = normalise_rewards(all_discounted_returns)
        for _ in range(self.hyperparameters["learning_iterations_per_round"]):
            all_ratio_of_policy_probabilities = self.calculate_all_ratio_of_policy_probabilities()
            loss = self.calculate_loss([all_ratio_of_policy_probabilities],
                                       all_discounted_returns)
            self.take_policy_new_optimisation_step(loss)

    def calculate_all_discounted_returns(self):
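        """Calculates the discounted return for every timestep of every episode played in the last learning round"""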
        all_discounted_returns = []
        for episode in range(len(self.many_episode_states)):
            discounted_returns = [0]
            for ix in range(len(self.many_episode_states[episode])):
                return_value = (self.many_episode_rewards[episode][-(ix + 1)]
                                + self.hyperparameters["discount_rate"] * discounted_returns[-1])
                discounted_returns.append(return_value)
            discounted_returns = discounted_returns[1:]
            all_discounted_returns.extend(discounted_returns[::-1])
        return all_discounted_returns

    def calculate_all_ratio_of_policy_probabilities(self):
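        """Calculates the ratio of the new policy's action probabilities to the old policy's for every state-action pair"""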

        all_states = [
            state for states in self.many_episode_states for state in states
        ]
        all_actions = [
            action for actions in self.many_episode_actions
            for action in actions
        ]

        all_states = torch.stack([
            torch.Tensor(states).float().to(self.device)
            for states in all_states
        ])
        all_actions = torch.stack([
            torch.Tensor(actions).float().to(self.device)
            for actions in all_actions
        ])
        all_actions = all_actions.view(-1, len(all_states))

        new_policy_distribution_log_prob = self.calculate_log_probability_of_actions(
            self.policy_new, all_states, all_actions)
        old_policy_distribution_log_prob = self.calculate_log_probability_of_actions(
            self.policy_old, all_states, all_actions)
        ratio_of_policy_probabilities = (torch.exp(new_policy_distribution_log_prob) /
                                         (torch.exp(old_policy_distribution_log_prob) + 1e-8))
        return ratio_of_policy_probabilities

    def calculate_log_probability_of_actions(self, policy, states, actions):
        """Calculates the log probability of an action occuring given a policy and starting state"""
        policy_output = policy.forward(states).to(self.device)
        policy_distribution = create_actor_distribution(
            self.action_types, policy_output, self.action_size)
        actions_tensor = actions
        policy_distribution_log_prob = policy_distribution.log_prob(
            actions_tensor)
        return policy_distribution_log_prob

    def calculate_loss(self, all_ratio_of_policy_probabilities,
                       all_discounted_returns):
        """Calculates the PPO loss"""
        all_ratio_of_policy_probabilities = torch.squeeze(
            torch.stack(all_ratio_of_policy_probabilities))
        all_ratio_of_policy_probabilities = torch.clamp(
            input=all_ratio_of_policy_probabilities,
            min=-sys.maxsize,
            max=sys.maxsize)
        all_discounted_returns = torch.tensor(all_discounted_returns).to(
            all_ratio_of_policy_probabilities)
        potential_loss_value_1 = all_discounted_returns * all_ratio_of_policy_probabilities
        potential_loss_value_2 = all_discounted_returns * self.clamp_probability_ratio(
            all_ratio_of_policy_probabilities)
        loss = torch.min(potential_loss_value_1, potential_loss_value_2)
        loss = -torch.mean(loss)
        return loss

    def clamp_probability_ratio(self, value):
        """Clamps a value between a certain range determined by hyperparameter clip epsilon"""
        return torch.clamp(input=value,
                           min=1.0 - self.hyperparameters["clip_epsilon"],
                           max=1.0 + self.hyperparameters["clip_epsilon"])

    def take_policy_new_optimisation_step(self, loss):
        self.policy_new_optimizer.zero_grad()  # reset gradients to 0
        loss.backward()  # this calculates the gradients
        torch.nn.utils.clip_grad_norm_(
            self.policy_new.parameters(),
            self.hyperparameters["gradient_clipping_norm"]
        )  # clip gradients to help stabilise training
        self.policy_new_optimizer.step()  # this applies the gradients

    def equalise_policies(self):
        """Sets the old policy's parameters equal to the new policy's parameters"""
        for old_param, new_param in zip(self.policy_old.parameters(),
                                        self.policy_new.parameters()):
            old_param.data.copy_(new_param.data)

    def save_result(self):
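        """Appends each episode's total reward to the results and updates the rolling average scores"""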
        for ep in range(len(self.many_episode_rewards)):
            total_reward = np.sum(self.many_episode_rewards[ep])
            self.game_full_episode_scores.append(total_reward)
            self.rolling_results.append(
                np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]))
        self.save_max_result_seen()