Example #1
import random

import numpy as np
import torch
import torch.nn.functional as F
from torch import optim

# Actor, Critic, ReplayBuffer and PrioritizedReplayBuffer are assumed to be
# defined elsewhere in the project; they are not part of this example.


class MADDPG():

    actors = None
    actors_targets = None
    actors_optimizers = None
    critics = None
    critics_targets = None
    critics_optimizers = None
    env = None
    gamma = None

    def __init__(self,
                 env,
                 env_obs,
                 gamma=0.99,
                 tau=0.001,
                 lr_actor=1e-3,
                 lr_critic=1e-3,
                 weight_decay=0.1,
                 batch_size=64,
                 subpolicies=1,
                 action_shape=2,
                 replay_buffer_size=5000,
                 replay_buffer_type="rb",
                 noise=0.1,
                 noise_decay=0.999,
                 max_action=1,
                 min_action=-1,
                 teacher=False,
                 bc=None):

        self.env = env
        self.subpolicies = subpolicies
        self.total_obs = np.sum(env_obs)
        self.weight_decay = weight_decay
        self.max_action = max_action
        self.min_action = min_action
        self.action_shape = action_shape
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.replay_buffer_type = replay_buffer_type
        self.replay_buffer_size = replay_buffer_size
        self.init_noise = noise
        self.noise = noise
        self.noise_decay = noise_decay
        self.teacher = teacher

        # Observation width is doubled when a teacher signal is used
        self.mul = 1 if self.teacher is False else 2

        self.actors = [[
            Actor(self.mul * env_obs[agent], action_shape)
            for i in range(self.subpolicies)
        ] for agent in range(env.n)]
        self.actors_targets = [[
            Actor(self.mul * env_obs[agent], action_shape)
            for i in range(self.subpolicies)
        ] for agent in range(env.n)]
        self.critics = [
            Critic(self.mul * self.total_obs + action_shape * len(env.agents))
            for _ in env.agents
        ]
        self.critics_targets = [
            Critic(self.mul * self.total_obs + action_shape * len(env.agents))
            for _ in env.agents
        ]

        self.actors_optimizers = [[
            torch.optim.RMSprop(self.actors[agent][i].parameters(),
                                lr=lr_actor,
                                weight_decay=weight_decay)
            for i in range(self.subpolicies)
        ] for agent in range(len(env.agents))]
        self.critics_optimizers = [
            torch.optim.RMSprop(self.critics[agent].parameters(),
                                lr=lr_critic,
                                weight_decay=weight_decay)
            for agent in range(len(env.agents))
        ]

        if self.subpolicies > 1:
            if self.replay_buffer_type == "rb":
                self.replay_buffers = [[
                    ReplayBuffer(self.replay_buffer_size)
                    for _ in range(self.subpolicies)
                ] for _ in range(env.n)]
            else:
                self.replay_buffers = [[
                    PrioritizedReplayBuffer(self.replay_buffer_size)
                    for _ in range(self.subpolicies)
                ] for _ in range(env.n)]
        else:
            if self.replay_buffer_type == "rb":
                self.replay_buffers = ReplayBuffer(self.replay_buffer_size)
            else:
                self.replay_buffers = PrioritizedReplayBuffer(
                    self.replay_buffer_size)

    def save(self, path):
        for agent in range(self.env.n):
            for sub in range(self.subpolicies):
                torch.save(
                    self.actors[agent][sub].state_dict(),
                    path + '/actor_{}_subpolicy_{}.pt'.format(agent, sub))

    def load(self, path):
        for agent in range(self.env.n):
            for sub in range(self.subpolicies):
                self.actors[agent][sub].load_state_dict(
                    torch.load(path +
                               "/actor_{}_subpolicy_{}.pt".format(agent, sub)))

    def push_sample(self, s, a, r, d, s_t, subs):

        if self.replay_buffer_type == "rb":
            if self.subpolicies > 1:
                for agent in range(self.env.n):
                    self.replay_buffers[agent][subs[agent]].push(
                        s, a, r, d, s_t, subs)
            else:
                self.replay_buffers.push(s, a, r, d, s_t, subs)

        else:
            errors = self.get_errors(s, a, r, d, s_t, subs)

            if self.subpolicies > 1:
                for agent in range(self.env.n):
                    self.replay_buffers[agent][subs[agent]].push(
                        s, a, r, d, s_t, subs, errors[agent])
            else:
                self.replay_buffers.push(s, a, r, d, s_t, subs,
                                         np.mean(errors))

    def get_errors(self, s, a, r, d, s_t, subs):
        errors = []

        per_agent_obs = [s[agent] for agent in range(self.env.n)]

        t_agent_obs = [
            torch.Tensor(list(per_agent_obs[agent][:])).view(1, -1)
            for agent in range(self.env.n)
        ]

        obs = torch.cat(t_agent_obs, 1)

        per_agent_new_obs = [s_t[agent] for agent in range(self.env.n)]
        t_agent_new_obs = [
            torch.Tensor(list(per_agent_new_obs[agent][:])).view(1, -1)
            for agent in range(self.env.n)
        ]
        new_obs = torch.cat(t_agent_new_obs, 1)

        action = torch.Tensor(a)
        reward = torch.Tensor(r)

        new_actions = []

        for agent in range(self.env.n):
            current_new_action = self.actors_targets[agent][subs[agent]](
                t_agent_new_obs[agent]).clamp(self.min_action, self.max_action)
            new_actions.append(current_new_action)

        new_actions = torch.stack(new_actions, 1)

        for i in range(len(self.env.agents)):

            with torch.no_grad():

                target_Q_input = torch.cat((new_obs, new_actions.view(1, -1)),
                                           1)
                target_Q = reward[i] + (self.gamma * self.critics_targets[i]
                                        (target_Q_input) * (1 - d))
                current_Q_input = torch.cat((obs, action.view(1, -1)), 1)
                current_Q = self.critics[i](current_Q_input)

                critic_loss = F.mse_loss(current_Q, target_Q)
                errors.append(critic_loss.item())

        return errors

    def apply_noise_decay(self):
        self.noise = self.noise * self.noise_decay

    def reset_noise(self, noise_value=None):

        if noise_value is None:
            self.noise = self.init_noise

        else:
            self.noise = noise_value

    def random_act(self):
        return np.array([
            random.uniform(self.min_action, self.max_action)
            for _ in range(self.env.n * self.action_shape)
        ]).reshape(self.env.n, self.action_shape)

    def act(self, s, subs, noise=True):

        actions = []
        obs_t = torch.Tensor(s)

        for agent in range(self.env.n):
            action = self.actors[agent][subs[agent]](obs_t[agent])

            if noise is True:
                action = action + torch.FloatTensor(action.shape).uniform_(
                    -self.noise, self.noise)

            action = action.clamp(self.min_action, self.max_action)
            actions.append(action)

        return np.array([action.detach().numpy() for action in actions
                         ]).reshape(self.env.n, self.action_shape)

    def train(self, subs):

        for i in range(len(self.env.agents)):
            self.critics_optimizers[i].zero_grad()

            if self.subpolicies > 1:
                minibatch = self.replay_buffers[i][subs[i]].sample(
                    self.batch_size)
            else:
                minibatch = self.replay_buffers.sample(self.batch_size)

            if self.replay_buffer_type == "rb":
                obs, actions, rewards, dones, new_obs, _ = minibatch
            else:
                (batch, index, is_weight) = minibatch
                is_weight = torch.Tensor(is_weight)
                obs, actions, rewards, dones, new_obs, _ = batch
            """ Handle heterogeneous observation sizes by splitting agent observations """
            per_agent_obs = [obs[:, agent] for agent in range(self.env.n)]
            t_agent_obs = [
                torch.Tensor(list(per_agent_obs[agent][:]))
                for agent in range(self.env.n)
            ]
            obs = torch.cat(t_agent_obs, 1)

            per_agent_new_obs = [
                new_obs[:, agent] for agent in range(self.env.n)
            ]
            t_agent_new_obs = [
                torch.Tensor(list(per_agent_new_obs[agent][:]))
                for agent in range(self.env.n)
            ]
            new_obs = torch.cat(t_agent_new_obs, 1)

            actions = torch.Tensor(actions).clone()
            rewards = torch.Tensor(rewards)
            dones = torch.Tensor(dones).view(self.batch_size, -1)
            """ Compute a(st + 1) for each agent """
            new_actions = []

            for agent in range(self.env.n):
                current_new_action = self.actors_targets[agent][subs[agent]](
                    t_agent_new_obs[agent]).clamp(self.min_action,
                                                  self.max_action)
                new_actions.append(current_new_action)

            new_actions = torch.stack(new_actions, 1)

            target_Q_input = torch.cat(
                (new_obs, new_actions.view(self.batch_size, -1)), 1)
            current_Q_input = torch.cat(
                (obs, actions.view(self.batch_size, -1)), 1)

            with torch.no_grad():
                target_Q = rewards[:, i].view(self.batch_size, -1) + (
                    self.gamma * self.critics_targets[i](target_Q_input) *
                    (1 - dones))

            current_Q = self.critics[i](current_Q_input)

            if self.replay_buffer_type == "per":
                error = ((current_Q - target_Q)**2).reshape(
                    is_weight.shape) * is_weight
                error = error.detach().numpy()

                for sample_index in range(self.batch_size):
                    if self.subpolicies > 1:
                        self.replay_buffers[i][subs[i]].update(
                            index[sample_index], error[sample_index])
                    else:
                        self.replay_buffers.update(index[sample_index],
                                                   error[sample_index])

            critic_loss = F.mse_loss(current_Q, target_Q)

            critic_loss.backward()

            self.critics_optimizers[i].step()

            self.actors_optimizers[i][subs[i]].zero_grad()

            t_agent_obs[i].requires_grad = True
            actions[:, i] = self.actors[i][subs[i]](t_agent_obs[i])

            _input = torch.cat((obs.view(
                self.batch_size, -1), actions.view(self.batch_size, -1)), 1)

            actor_loss = -self.critics[i](_input).mean()

            actor_loss.backward()

            self.actors_optimizers[i][subs[i]].step()

            # Update the frozen target models
            for param, target_param in zip(
                    self.critics[i].parameters(),
                    self.critics_targets[i].parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(
                    self.actors[i][subs[i]].parameters(),
                    self.actors_targets[i][subs[i]].parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
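

# A minimal usage sketch (an assumption, not part of the original example) of
# how the MADDPG class above could be driven by a training loop.  The
# multi-agent environment API assumed here (reset() returning per-agent
# observations, step(actions) returning (next_obs, rewards, dones, info) and
# an `n` attribute with the number of agents) follows the multi-agent particle
# environments; the episode/step counts and update frequency are illustrative.
def run_maddpg_training(env, env_obs, episodes=1000, max_steps=100):
    maddpg = MADDPG(env, env_obs)
    subs = [0] * env.n          # a single sub-policy per agent (subpolicies=1)
    total_steps = 0

    for episode in range(episodes):
        obs = env.reset()
        for step in range(max_steps):
            actions = maddpg.act(obs, subs, noise=True)
            new_obs, rewards, dones, _ = env.step(actions)
            maddpg.push_sample(obs, actions, rewards, dones, new_obs, subs)
            total_steps += 1

            # Only update once the replay buffer can fill a mini-batch
            if total_steps > maddpg.batch_size and step % 4 == 0:
                maddpg.train(subs)

            obs = new_obs
            if all(dones):
                break
        maddpg.apply_noise_decay()
    return maddpg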


# BUFFER_SIZE, BATCH_SIZE, LR, GAMMA, TAU, UPDATE_EVERY, TRANSFER_EVERY,
# UPDATE_PRIORITY_EVERY, DEFAULT_PRIORITY, PRIORITY_FACTOR and the torch
# `device` are module-level constants assumed to be defined elsewhere.
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, QNetwork):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            QNetwork: a class inheriting from torch.nn.Module that defines the structure of the neural network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        # When using dropout, the target network should be kept in eval mode
        self.qnetwork_target.eval()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = PrioritizedReplayBuffer(seed, device, action_size, BUFFER_SIZE, BATCH_SIZE, DEFAULT_PRIORITY, PRIORITY_FACTOR)
        # Step counters controlling how often the agent learns (UPDATE_EVERY),
        # transfers weights to the target network (TRANSFER_EVERY) and
        # refreshes priorities (UPDATE_PRIORITY_EVERY)
        self.u_step = 0
        self.t_step = 0
        self.up_step = 0
        # Controls the importance-sampling weight; as the network converges, b should move toward 1
        self.b = torch.tensor(1., device=device, requires_grad=False)
        self.b_decay = torch.tensor(0.00015, device=device, requires_grad=False)
    
    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(np.int_)
        else:
            return random.choice(np.arange(self.action_size)).astype(np.int_)

    def step(self, state, action, reward, next_state, done):
        """Store a transition and trigger learning / target updates on schedule."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        self.t_step = (self.t_step + 1) % TRANSFER_EVERY
        self.u_step = (self.u_step + 1) % UPDATE_EVERY
        self.up_step = (self.up_step + 1) % UPDATE_PRIORITY_EVERY
        
        # Learn from experiences
        if len(self.memory) > BATCH_SIZE and self.u_step == 0:
            # sample the experiences from the memory based on their priority
            experiences = self.memory.sample()
            self.learn(experiences)
        # Transfer the knowledge from the local network to the fixed target network
        if len(self.memory) > BATCH_SIZE and self.t_step == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)
        # Update the priorities in the memory to alter future sampling.
        # Ideally this would be done right before sampling takes place, but for
        # the sake of performance it is cheaper to recalculate them less often.
        if len(self.memory) > 1 and self.up_step == 0:
            for experiences in self.memory.get_all_experiences(512):
                with torch.no_grad():
                    self.qnetwork_local.eval()
                    current_estimate, from_env = self.get_target_estimate(experiences)
                    # update the priorities based on newly learned errors
                    self.memory.update(experiences[-1], (from_env - current_estimate).squeeze())
            
    def get_target_estimate(self, experiences):
        states, actions, rewards, next_states, dones, probabilities, selected = experiences
        with torch.no_grad():
            # Double-DQN target: the local network selects the best next
            # action, the target network evaluates it
            best_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            evaluations = self.qnetwork_target(next_states).gather(1, best_actions)
            from_env = rewards + GAMMA * evaluations * (1 - dones)
        return self.qnetwork_local(states).gather(1, actions), from_env
        
    def learn(self, experiences):
        self.qnetwork_local.train()
        current_estimate, from_env = self.get_target_estimate(experiences)
        probabilities = experiences[-2]
        errors = (from_env - current_estimate)
        # Since the experiences were sampled according to their priorities, the
        # resulting batch is biased; an importance-sampling weight corrects for it.
        loss = (errors * errors / (len(self.memory) * probabilities) * self.b).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.b = min(self.b + self.b_decay, 1)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        θ_target = θ_target + τ*(θ_local - θ_target)

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
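

# A small, self-contained sketch (not part of the original example) showing a
# minimal QNetwork of the kind the Agent constructor expects, together with a
# standalone version of the soft-update rule documented in Agent.soft_update:
# theta_target <- tau * theta_local + (1 - tau) * theta_target.
# The layer sizes and the tau value below are illustrative assumptions.
import torch.nn as nn


class ToyQNetwork(nn.Module):
    """Minimal stand-in for the QNetwork class passed to Agent."""

    def __init__(self, state_size, action_size, seed):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, action_size)

    def forward(self, state):
        return self.fc2(torch.relu(self.fc1(state)))


def soft_update_demo(tau=0.001):
    local = ToyQNetwork(state_size=8, action_size=4, seed=0)
    target = ToyQNetwork(state_size=8, action_size=4, seed=1)

    # Each target parameter moves a fraction tau of the way toward the
    # corresponding local parameter, exactly as in Agent.soft_update.
    for target_param, local_param in zip(target.parameters(),
                                         local.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)
    return local, target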