Example #1
class Agent:
    def __init__(self, action_spec: dm_env.specs.DiscreteArray,
                 observation_spec: dm_env.specs.Array, device: torch.device,
                 settings: dict) -> None:
        """
        Initializes the agent,  constructs the qnet and the q_target, initializes the optimizer and ReplayMemory.
        Args:
            action_spec(dm_env.specs.DiscreteArray): description of the action space of the environment
            observation_spec(dm_env.specs.Array): description of observations form the environment
            device(str): "gpu" or "cpu"
            settings(dict): dictionary with settings
        """
        self.device = device
        action_size = action_spec.num_values
        state_size = np.prod(observation_spec.shape)
        self.action_size = action_size
        self.state_size = state_size
        self.batch_size = settings['batch_size']
        self.noisy_nets = settings['qnet_settings']['noisy_nets']
        self.distributional = settings["qnet_settings"]["distributional"]

        if self.distributional:
            # Currently the distributional agent always uses Dueling DQN
            self.qnet = DistributionalDuelDQN(state_size, action_size,
                                              settings['qnet_settings'],
                                              device).to(device)
            self.q_target = DistributionalDuelDQN(state_size, action_size,
                                                  settings['qnet_settings'],
                                                  device).to(device)
            vmin, vmax = settings["qnet_settings"]["vmin"], settings[
                "qnet_settings"]["vmax"]
            number_atoms = settings["qnet_settings"]["number_atoms"]
            self.distribution_updater = DistributionUpdater(
                vmin, vmax, number_atoms)
        else:
            if settings["duelling_dqn"]:
                self.qnet = DuelDQN(state_size, action_size,
                                    settings['qnet_settings']).to(device)
                self.q_target = DuelDQN(state_size, action_size,
                                        settings['qnet_settings']).to(device)
            else:
                self.qnet = Dqn(state_size, action_size,
                                settings['qnet_settings']).to(device)
                self.q_target = Dqn(state_size, action_size,
                                    settings['qnet_settings']).to(device)

        self.q_target.load_state_dict(self.qnet.state_dict())
        self.optimizer = optim.Adam(self.qnet.parameters(), lr=settings['lr'])

        self.epsilon = settings["epsilon_start"]
        self.decay = settings["epsilon_decay"]
        self.epsilon_min = settings["epsilon_min"]
        self.gamma = settings['gamma']

        self.start_optimization = settings["start_optimization"]
        self.update_qnet_every = settings["update_qnet_every"]
        self.update_target_every = settings["update_target_every"]
        self.number_steps = 0
        self.ddqn = settings["ddqn"]

        # Initialize replay memory
        self.prioritized_replay = settings["prioritized_buffer"]
        if self.prioritized_replay:
            self.memory = PrioritizedReplayMemory(
                device, settings["buffer_size"], self.gamma,
                settings["n_steps"], settings["alpha"], settings["beta0"],
                settings["beta_increment"])
        else:
            self.memory = ReplayMemory(device, settings["buffer_size"],
                                       self.gamma, settings["n_steps"])
        return

    def policy(self, timestep: dm_env.TimeStep) -> int:
        """
        Returns an action following an epsilon-greedy policy.
        Args:
            timestep(dm_env.TimeStep): An observation from the environment

        Returns:
            int: The chosen action.
        """
        observation = np.array(timestep.observation).flatten()
        observation = torch.from_numpy(observation).float().to(self.device)
        self.number_steps += 1

        if not self.noisy_nets:
            self.update_epsilon()

        if np.random.rand() < self.epsilon:
            return int(np.random.choice(self.action_size))
        else:
            return int(self.qnet.get_max_action(observation))

    def update_epsilon(self) -> None:
        """
        Decays epsilon until self.epsilon_min
        Returns:
            None
        """
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.decay

    @staticmethod
    def calc_loss(
            q_observed: torch.Tensor, q_target: torch.Tensor,
            weights: torch.Tensor) -> typing.Tuple[torch.Tensor, np.ndarray]:
        """
        Returns the mean weighted MSE loss and the loss for each individual sample.
        Args:
            q_observed(torch.Tensor): calculated q-value
            q_target(torch.Tensor):   target q-value
            weights(torch.Tensor): weights of the batch samples

        Returns:
            tuple(torch.Tensor, np.ndarray): mean weighted MSE loss, loss for each individual sample
        """
        losses = functional.mse_loss(q_observed, q_target, reduction='none')
        loss = (weights * losses).sum() / weights.sum()
        return loss, losses.cpu().detach().numpy() + 1e-8

    @staticmethod
    def calc_distributional_loss(
        dist: torch.Tensor,
        proj_dist: torch.Tensor,
        weights: torch.Tensor,
    ) -> typing.Tuple[torch.Tensor, np.ndarray]:
        """
        Calculates the distributional (cross-entropy) loss.
        Args:
            dist(torch.Tensor): The observed distribution (logits)
            proj_dist(torch.Tensor): The projected target distribution
            weights(torch.Tensor): weights of the batch samples

        Returns:
            tuple(torch.Tensor, np.ndarray): mean weighted cross-entropy loss, loss for each individual sample
        """
        losses = -functional.log_softmax(dist, dim=1) * proj_dist
        losses = weights * losses.sum(dim=1)
        return losses.mean(), losses.cpu().detach().numpy() + 1e-8

    def update(self, step: dm_env.TimeStep, action: int,
               next_step: dm_env.TimeStep) -> None:
        """
        Adds experience to the replay memory, performs an optimization_step and updates the q_target neural network.
        Args:
            step(dm_env.TimeStep): Current observation from the environment
            action(int): The action that was performed by the agent.
            next_step(dm_env.TimeStep): Next observation from the environment
        Returns:
            None
        """

        observation = np.array(step.observation).flatten()
        next_observation = np.array(next_step.observation).flatten()
        done = next_step.last()
        exp = Experience(observation, action, next_step.reward,
                         next_step.discount, next_observation, 0, done)
        self.memory.add(exp)

        if self.memory.number_samples() < self.start_optimization:
            return

        if self.number_steps % self.update_qnet_every == 0:
            s0, a0, n_step_reward, discount, s1, _, dones, indices, weights = self.memory.sample_batch(
                self.batch_size)
            if not self.distributional:
                self.optimization_step(s0, a0, n_step_reward, discount, s1,
                                       indices, weights)
            else:
                self.distributional_optimization_step(s0, a0, n_step_reward,
                                                      discount, s1, dones,
                                                      indices, weights)

        if self.number_steps % self.update_target_every == 0:
            self.q_target.load_state_dict(self.qnet.state_dict())
        return

    def optimization_step(self, s0: torch.Tensor, a0: torch.Tensor,
                          n_step_reward: torch.Tensor, discount: torch.Tensor,
                          s1: torch.Tensor,
                          indices: typing.Optional[torch.Tensor],
                          weights: typing.Optional[torch.Tensor]) -> None:
        """
        Calculates the Bellmann update and updates the qnet.
        Args:
            s0(torch.Tensor): current state
            a0(torch.Tensor): current action
            n_step_reward(torch.Tensor): n-step reward
            discount(torch.Tensor): discount factor
            s1(torch.Tensor): next state
            indices(torch.Tensor): batch indices, needed for prioritized replay. Not used yet.
            weights(torch.Tensor): weights needed for prioritized replay

        Returns:
            None
        """

        with torch.no_grad():
            if self.noisy_nets:
                self.q_target.reset_noise()
                self.qnet.reset_noise()

            # Calculating the target values
            next_q_vals = self.q_target(s1)
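            # Double DQN: select the next action with the online network, evaluate it with the target network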
            if self.ddqn:
                a1 = torch.argmax(self.qnet(s1), dim=1).unsqueeze(-1)
                next_q_val = next_q_vals.gather(1, a1).squeeze()
            else:
                next_q_val = torch.max(next_q_vals, dim=1).values
            q_target = (n_step_reward.squeeze()
                        + self.gamma * discount.squeeze() * next_q_val)

        # Getting the observed q-values
        if self.noisy_nets:
            self.qnet.reset_noise()
        q_observed = self.qnet(s0).gather(1, a0.long()).squeeze()

        # Calculating the losses
        if not self.prioritized_replay:
            weights = torch.ones(self.batch_size, device=self.device)
        critic_loss, batch_loss = self.calc_loss(q_observed, q_target, weights)

        # Backpropagation of the gradients
        self.optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnet.parameters(), 5)
        self.optimizer.step()

        # Update replay memory
        self.memory.update_priorities(indices, batch_loss)
        return

    def distributional_optimization_step(
            self, s0: torch.Tensor, a0: torch.Tensor,
            n_step_reward: torch.Tensor, discount: torch.Tensor,
            s1: torch.Tensor, dones: torch.Tensor,
            indices: typing.Optional[torch.Tensor],
            weights: typing.Optional[torch.Tensor]) -> None:
        """
        Calculates the Bellmann update and updates the qnet for the distributional agent.
        Args:
            s0(torch.Tensor): current state
            a0(torch.Tensor): current action
            n_step_reward(torch.Tensor): n-step reward
            discount(torch.Tensor): discount factor
            s1(torch.Tensor): next state
            dones(torch.Tensor): done
            indices(torch.Tensor): batch indices, needed for prioritized replay. Not used yet.
            weights(torch.Tensor): weights needed for prioritized replay

        Returns:
            None
        """

        with torch.no_grad():
            gamma = self.gamma * discount
            if self.noisy_nets:
                self.q_target.reset_noise()
                self.qnet.reset_noise()

            # Calculating the target distributions
            next_dists, next_q_vals = self.q_target.calc(s1)
            if self.ddqn:
                a1 = self.qnet.get_max_action(s1)
            else:
                a1 = torch.argmax(next_q_vals, dim=1)
            distributions = next_dists[range(self.batch_size), a1]
            distributions = functional.softmax(distributions, dim=1)
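            # Project the shifted target distribution back onto the fixed support (vmin..vmax with number_atoms bins)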
            q_target = self.distribution_updater.update_distribution(
                distributions.cpu().detach().numpy(),
                n_step_reward.cpu().detach().numpy(),
                dones.cpu().detach().numpy(),
                gamma.cpu().detach().numpy())
            q_target = torch.tensor(q_target).to(self.device)

        # Getting the observed q-value distributions
        if self.noisy_nets:
            self.qnet.reset_noise()
        q_observed = self.qnet(s0)
        q_observed = q_observed[range(self.batch_size), a0.squeeze().long()]

        # Calculating the losses
        if not self.prioritized_replay:
            weights = torch.ones(self.batch_size, device=self.device)
        critic_loss, batch_loss = self.calc_distributional_loss(
            q_observed, q_target, weights)

        # Backpropagation of the gradients
        self.optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnet.parameters(), 5)
        self.optimizer.step()

        # Update replay memory
        self.memory.update_priorities(indices, batch_loss)
        return
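
Below is a minimal, hypothetical usage sketch for the Agent above. It assumes a dm_env-compatible environment factory (make_env) and a settings dictionary containing the keys read in __init__; both names and values are placeholders, not part of the original code.

import torch


def run_training(make_env, settings, num_episodes=100):
    # make_env and settings are assumed placeholders; settings must provide the keys the
    # Agent reads in __init__ (batch_size, qnet_settings, lr, gamma, buffer_size, ...).
    env = make_env()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = Agent(env.action_spec(), env.observation_spec(), device, settings)

    for _ in range(num_episodes):
        timestep = env.reset()
        while not timestep.last():
            action = agent.policy(timestep)
            next_timestep = env.step(action)
            agent.update(timestep, action, next_timestep)
            timestep = next_timestep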
Example #2
class T3DAgent:
    def __init__(self, env, brain, brain_name, device, settings):
        self.env = env
        self.brain_name = brain_name
        self.device = device
        action_size = brain.vector_action_space_size
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        state_size = states.shape[1]
        self.action_size = action_size
        self.state_size = state_size
        self.batch_size = settings['batch_size']

        # Initialize actor local and target networks
        self.actor_local = Actor(state_size, action_size, settings['actor_settings']).to(device)
        self.actor_target = Actor(state_size, action_size, settings['actor_settings']).to(device)
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=settings['lr_actor'])

        # Initialize critic networks
        self.critic_local = Critic(state_size, action_size, settings['critic_settings']).to(device)
        self.critic_target = Critic(state_size, action_size, settings['critic_settings']).to(device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=settings['lr_critic'])

        # Save some of the settings into class member variables
        self.pretrain_steps = settings['pretrain_steps']
        self.gamma = settings['gamma']
        self.tau = settings['tau']

        self.action_noise = settings['action_noise']
        self.action_clip = settings['action_clip']
        self.target_action_noise = settings['target_action_noise']
        self.target_noise_clip = settings['target_noise_clip']
        self.optimize_every = settings['optimize_critic_every']

        # Initialize replay memory and episode generator
        self.memory = ReplayMemory(device, settings['buffer_size'])
        self.generator = self.play_episode()

        self.number_steps = 0
        return

    def get_action_noise(self):
        return self.action_noise

    def set_action_noise(self, std):
        self.action_noise = std
        return

    def pretrain(self):
        # The idea of using a pretrain phase before starting regular episodes
        # is from https://github.com/whiterabbitobj/Continuous_Control/
        print("Random sampling of " + str(self.pretrain_steps) + " steps")
        env = self.env
        brain_name = self.brain_name
        env_info = env.reset(train_mode=True)[brain_name]
        number_agents = env_info.vector_observations.shape[0]
        for _ in range(self.pretrain_steps):
            actions = []
            states = env_info.vector_observations
            for _ in range(number_agents):
                actions.append(np.random.uniform(-1, 1, self.action_size))
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                self.memory.add(Experience(state, action, reward, next_state, done))
            if np.any(dones):
                env_info = env.reset(train_mode=True)[brain_name]

    def play_episode(self, train_mode=True):
        # The idea of generating episodes in an "experience generator" is from
        # "Deep Reinforcement Learning Hands-On" by Maxim Lapan

        print("Starting episode generator")
        # Initialize the environment
        env = self.env
        brain_name = self.brain_name
        env_info = env.reset(train_mode=train_mode)[brain_name]
        # Initialize episode_rewards and get the first state
        episode_rewards = []
        # Run episode step by step
        while True:
            states = env_info.vector_observations
            with torch.no_grad():
                states_tensor = torch.from_numpy(states).float().to(self.device)
                actions = self.actor_local(states_tensor).cpu().detach().numpy()
                actions += self.action_noise * np.random.normal(size=actions.shape)
                actions = np.clip(actions, -self.action_clip, self.action_clip)

            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            episode_rewards.append(rewards)

            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                self.memory.add(Experience(state, action, reward, next_state, done))
            if np.any(dones):
                agent_reward = np.sum(episode_rewards, axis=0)
                std_reward = np.std(agent_reward)
                mean_reward = np.mean(agent_reward)
                episode_rewards = []
                env_info = env.reset(train_mode=True)[brain_name]
                yield mean_reward, std_reward
            else:
                yield -1, -1

    def take_step(self):
        # The episode generator never terminates, so next() needs no default value
        return next(self.generator)

    def learn(self):
        self.number_steps += 1
        if self.memory.number_samples() <= self.batch_size:
            return
        # states, actions, rewards, next states, done
        s0, a0, r, s1, d = self.memory.sample_batch(self.batch_size)
        critic_loss_a, critic_loss_b = self.optimize_critic(s0, a0, r, s1, d)
        actor_loss = self.optimize_actor(s0)

        return actor_loss, critic_loss_a, critic_loss_b

    def optimize_actor(self, s0):
        # Calc policy loss
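        # Delayed policy updates (TD3): the actor and the target networks are updated
        # only every self.optimize_every learning steps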
        if self.number_steps % self.optimize_every == 0:
            a0_pred = self.actor_local(s0)
            actor_loss = -self.critic_local.get_qa(s0, a0_pred).mean()
            # Update actor nn
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            # slow update
            self.slow_update(self.tau)
            return -actor_loss.cpu().detach().numpy()
        return 0

    def optimize_critic(self, s0, a0, r, s1, d):
        # The ideas of adding clipped noise to the target action a1 and of a critic loss that takes
        # qa_expected and qb_expected into account at the same time are from the reference
        # implementation by the TD3 authors at https://github.com/sfujim/TD3/
        with torch.no_grad():
            # calc critic loss
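            # Target policy smoothing: perturb the target action with clipped Gaussian noise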
            noise = torch.randn_like(a0).to(self.device)
            noise = noise * torch.tensor(self.target_action_noise).expand_as(noise).to(self.device)
            noise = noise.clamp(-self.target_noise_clip, self.target_noise_clip)
            a1 = (self.actor_target(s1) + noise).clamp(-self.action_clip, self.action_clip)
            qa_target, qb_target = self.critic_target(s1, a1)
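            # Clipped double-Q learning: use the smaller of the two target critic estimates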
            q_target = torch.min(qa_target, qb_target)
            q_target = r + self.gamma * (1.0 - d) * q_target
        qa_expected, qb_expected = self.critic_local(s0, a0)
        critic_loss_a = functional.mse_loss(qa_expected, q_target)
        critic_loss_b = functional.mse_loss(qb_expected, q_target)
        critic_loss = critic_loss_a + critic_loss_b
        # Update critic nn
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        return critic_loss_a.cpu().detach().numpy(), critic_loss_b.cpu().detach().numpy()

    def slow_update(self, tau):
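        # Polyak averaging: target = tau * local + (1 - tau) * target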
        for target_par, local_par in zip(self.actor_target.parameters(), self.actor_local.parameters()):
            target_par.data.copy_(tau * local_par.data + (1.0 - tau) * target_par.data)
        for target_par, local_par in zip(self.critic_target.parameters(), self.critic_local.parameters()):
            target_par.data.copy_(tau * local_par.data + (1.0 - tau) * target_par.data)
        return

    def load_nets(self, actor_file_path, critic_file_path):
        self.actor_local.load_state_dict(torch.load(actor_file_path, map_location=self.device))
        self.actor_local.eval()
        self.critic_local.load_state_dict(torch.load(critic_file_path, map_location=self.device))
        self.critic_local.eval()
        return

    def save_nets(self, model_save_path):
        actor_path = model_save_path + "_actor_net.pt"
        torch.save(self.actor_local.state_dict(), actor_path)
        critic_path = model_save_path + "_critic_net.pt"
        torch.save(self.critic_local.state_dict(), critic_path)
        return