Example #1
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, n_agents, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            n_agents (int): number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.n_agents = n_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed
        np.random.seed(random_seed)
        random.seed(random_seed)

        # Actor Network (w/ Target Network)
        # self.actor_local = Actor(state_size, action_size, seed=random_seed, leak=LEAKINESS).to(device)
        # self.actor_target = Actor(state_size, action_size, seed=random_seed, leak=LEAKINESS).to(device)
        self.actor_local = Actor(state_size, action_size,
                                 seed=random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  seed=random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        # self.critic_local = Critic(state_size, action_size, seed=random_seed, leak=LEAKINESS).to(device)
        # self.critic_target = Critic(state_size, action_size, seed=random_seed, leak=LEAKINESS).to(device)
        self.critic_local = Critic(state_size, action_size,
                                   seed=random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)
        # self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

        # Initialize target networks with the same weights as the local networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        # Noise process
        # self.noise = OUNoise((n_agents, action_size), random_seed)
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)
        # self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        self.t_step = self.t_step + 1

        if (len(self.memory) > BATCH_SIZE) and (self.t_step % UPDATE_EVERY
                                                == 0):
            # experiences = self.memory.sample()
            # self.learn(experiences, GAMMA)
            # Sample and learn 10 times per update step
            for _ in range(10):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute expected Q values from the local critic
        Q_expected = self.critic_local(states, actions)

        # Compute critic loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()

        torch.nn.utils.clip_grad_norm_(
            self.critic_local.parameters(),
            1)  # Added because I ran into convergence problems;
        # I later found that adaptationio's solution (https://github.com/adaptationio/DDPG-Continuous-Control) does the same.
        # See https://discuss.pytorch.org/t/about-torch-nn-utils-clip-grad-norm/13873 for why gradient clipping helps.
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(
            self.actor_local.parameters(),
            1)  # Added because I ran into convergence problems
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        """
        Copy network parameters from source to target
        Inputs:
            target (torch.nn.Module): Net to copy parameters to
            source (torch.nn.Module): Net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
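
The Agent listing above depends on module-level names that the snippet does not show (imports, hyperparameters, the torch device, and the Actor/Critic/OUNoise/ReplayBuffer helpers). Below is a minimal sketch of the assumed setup; the hyperparameter values are common DDPG defaults used for illustration, not necessarily the values from the original repository.

import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# Actor, Critic, OUNoise and ReplayBuffer are assumed to be defined elsewhere
# in the repository, e.g. `from model import Actor, Critic`.

BUFFER_SIZE = int(1e6)   # replay buffer size (assumed)
BATCH_SIZE = 128         # minibatch size (assumed)
GAMMA = 0.99             # discount factor (assumed)
TAU = 1e-3               # soft-update interpolation factor (assumed)
LR_ACTOR = 1e-4          # actor learning rate (assumed)
LR_CRITIC = 1e-3         # critic learning rate (assumed)
WEIGHT_DECAY = 0         # L2 weight decay for the critic optimizer (assumed)
UPDATE_EVERY = 20        # learn every N environment steps (assumed)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")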
Example #2
class MADDPGAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 n_agents,
                 current_agent,
                 random_seed=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            n_agents (int): number of agents
            current_agent (int): index of current agent
            random_seed (int): random seed
        """
        self.n_agents = n_agents
        self.current_agent = current_agent
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed
        np.random.seed(random_seed)
        random.seed(random_seed)

        # Actor Network (w/ Target Network)
        # self.actor_local = Actor(state_size, action_size, seed=random_seed, leak=LEAKINESS, use_bn=False).to(device)
        # self.actor_target = Actor(state_size, action_size, seed=random_seed, leak=LEAKINESS, use_bn=False).to(device)
        self.actor_local = Actor(state_size,
                                 action_size,
                                 seed=random_seed,
                                 use_bn=False).to(device)
        self.actor_target = Actor(state_size,
                                  action_size,
                                  seed=random_seed,
                                  use_bn=False).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        # self.critic_local = Critic(state_size * n_agents, action_size * n_agents,
        #                            seed=random_seed, leak=LEAKINESS, use_bn=False).to(device)
        # self.critic_target = Critic(self.state_size * n_agents, self.action_size * n_agents,
        #                             seed=random_seed, leak=LEAKINESS, use_bn=False).to(device)
        self.critic_local = Critic(state_size * n_agents,
                                   action_size * n_agents,
                                   seed=random_seed,
                                   use_bn=False).to(device)
        self.critic_target = Critic(self.state_size * n_agents,
                                    self.action_size * n_agents,
                                    seed=random_seed,
                                    use_bn=False).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)
        # self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        # self.noise = OUNoise((n_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        # Run inference in eval mode
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        # Add exploration noise if requested
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, agents, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            agents (list of MADDPGAgent): all agent objects in the environment
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = torch.zeros(
            (len(states), self.n_agents, self.action_size)).to(device)
        for i, agent in enumerate(agents):
            actions_next[:, i] = agent.actor_target(states[:, i, :])

        # Flatten states and actions for the centralized critic,
        # e.g. a (100, 2, 24) state tensor becomes (100, 48)
        critic_states = flatten(next_states)
        actions_next = flatten(actions_next)

        # Calculate target and expected Q values
        Q_targets_next = self.critic_target(critic_states, actions_next)

        Q_targets = rewards[:, self.current_agent, :] + (
            gamma * Q_targets_next * (1 - dones[:, self.current_agent, :]))

        Q_expected = self.critic_local(flatten(states), flatten(actions))

        # Compute critic loss (MSE)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()

        torch.nn.utils.clip_grad_norm_(
            self.critic_local.parameters(),
            1)  # Added because I ran into convergence problems;
        # I later found that adaptationio's solution (https://github.com/adaptationio/DDPG-Continuous-Control) does the same.
        # See https://discuss.pytorch.org/t/about-torch-nn-utils-clip-grad-norm/13873 for why gradient clipping helps.

        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Update the predicted action of current agent
        actions_pred = torch.zeros(
            (len(states), self.n_agents, self.action_size)).to(device)
        actions_pred.data.copy_(actions.data)
        actions_pred[:, self.current_agent] = self.actor_local(
            states[:, self.current_agent])
        actor_loss = -self.critic_local(flatten(states),
                                        flatten(actions_pred)).mean()
        # actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(
            self.actor_local.parameters(),
            1)  # Added because I ran into convergence problems
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        if self.t_step == 0:
            # One time only, start local and target with same parameters
            self._copy_weights(self.critic_local, self.critic_target)
            self._copy_weights(self.actor_local, self.actor_target)
        else:
            self.soft_update(self.critic_local, self.critic_target, TAU)
            self.soft_update(self.actor_local, self.actor_target, TAU)

        self.t_step += 1

    def _copy_weights(self, source_network, target_network):
        """Copy source network weights to target"""
        for target_param, source_param in zip(target_network.parameters(),
                                              source_network.parameters()):
            target_param.data.copy_(source_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
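
MADDPGAgent.learn() calls a flatten() helper that does not appear in the listing. Judging from the inline comment about reshaping a (100, 2, 24) state batch into (100, 48), a minimal sketch would be:

def flatten(tensor):
    """Reshape a (batch, n_agents, dim) tensor to (batch, n_agents * dim)."""
    return tensor.reshape(tensor.shape[0], -1)
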
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 n_agents,
                 state_size,
                 action_size,
                 random_seed,
                 prioritized_reply=False):
        """Initialize an Agent object.

        Params
        ======
            n_agents (int): number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            prioritized_reply (bool): whether to use a prioritized replay buffer (default: False)
        """
        self.n_agents = n_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed
        np.random.seed(random_seed)
        random.seed(random_seed)
        self.prioritized_reply = prioritized_reply

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size,
                                 action_size,
                                 seed=random_seed,
                                 leak=LEAKINESS).to(device)
        self.actor_target = Actor(state_size,
                                  action_size,
                                  seed=random_seed,
                                  leak=LEAKINESS).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   seed=random_seed,
                                   leak=LEAKINESS).to(device)
        self.critic_target = Critic(state_size,
                                    action_size,
                                    seed=random_seed,
                                    leak=LEAKINESS).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        # Noise process
        self.noise = OUNoise((n_agents, action_size), random_seed)

        # Replay memory
        if self.prioritized_reply:
            self.memory = PreReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                          random_seed, ALPHA)
            # Initialize learning step for updating beta
            self.learn_step = 0
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       random_seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.t_step = self.t_step + 1
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps
        # If enough samples are available in memory, get random subset and learn
        if (len(self.memory) > BATCH_SIZE) and (self.t_step % UPDATE_EVERY
                                                == 0):
            if self.prioritized_reply:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, BETA)
            else:
                # Sample and learn 10 times per update step
                for _ in range(10):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, beta=None):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            beta (float): reliance of importance sampling weight on prioritization
        """
        if self.prioritized_reply:
            # Beta will reach 1 after 25,000 training steps (~325 episodes)
            b = min(1.0, beta + self.learn_step * (1.0 - beta) / 25000)
            self.learn_step += 1
            states, actions, rewards, next_states, dones, probabilities, indices = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute expected Q values from the local critic
        Q_expected = self.critic_local(states, actions)

        if self.prioritized_reply:
            # Compute and update new priorities
            new_priorities = (abs(Q_expected - Q_targets) + 0.2).detach()
            self.memory.update_priority(new_priorities, indices)

            # Compute and apply importance sampling weights to TD Errors
            ISweights = (((1 / len(self.memory)) * (1 / probabilities))**b)
            max_ISweight = torch.max(ISweights)
            ISweights /= max_ISweight
            Q_targets *= ISweights
            Q_expected *= ISweights

        # Compute critic loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()

        torch.nn.utils.clip_grad_norm_(
            self.critic_local.parameters(),
            1)  # Added because I ran into convergence problems;
        # I later found that adaptationio's solution (https://github.com/adaptationio/DDPG-Continuous-Control) does the same.
        # See https://discuss.pytorch.org/t/about-torch-nn-utils-clip-grad-norm/13873 for why gradient clipping helps.
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
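
All three agents also rely on an OUNoise process and a uniform ReplayBuffer that are not included above. The sketch below is one plausible implementation with the usual Ornstein-Uhlenbeck defaults (mu=0, theta=0.15, sigma=0.2); it reuses the `device` defined in the sketch after Example #1, and the authors' actual classes (including the prioritized PreReplayBuffer) may differ.

import copy
import random
from collections import deque, namedtuple

import numpy as np
import torch


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        # `size` may be an int or a (n_agents, action_size) tuple
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) \
            + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state


class ReplayBuffer:
    """Fixed-size buffer storing experience tuples, sampled uniformly at random."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and move them to `device`.

        Note: np.vstack assumes flat per-experience arrays; a MADDPG variant
        would need to keep the per-agent dimension instead.
        """
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current number of stored experiences."""
        return len(self.memory)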