Example no. 1
class DDPG_single():
    def __init__(self,
                 state_dim,
                 action_dim,
                 max_action,
                 num_agents,
                 learning_rate,
                 discrete_action=True,
                 grid_per_action=20,
                 hidden_dim=32):
        self.max_action = max_action

        self.actor = Actor_DDPG(state_dim, action_dim, max_action, hidden_dim)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=learning_rate)

        self.critic = Critic_DDPG(state_dim, action_dim, num_agents,
                                  hidden_dim)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=learning_rate)
        self.exploration = OUNoise(action_dim)

        self.iter = 0

    def scale_noise(self, scale):
        self.exploration.scale = scale

    def reset_noise(self):
        self.exploration.reset()

    def select_action(self, obs, explore=False):
        self.actor.eval()
        action = self.actor(obs)
        self.actor.train()
        if explore:
            device = action.device
            action += torch.Tensor(self.exploration.noise()).to(device)
        action = action.clamp(-self.max_action, self.max_action)

        return action

    def get_params(self):
        return {
            'actor': self.actor.state_dict(),
            'actor_target': self.actor_target.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'actor_optimizer': self.actor_optimizer.state_dict(),
            'critic_optimizer': self.critic_optimizer.state_dict()
        }

    def load_params(self, params):
        self.actor.load_state_dict(params['actor'])
        self.actor_target.load_state_dict(params['actor_target'])
        self.actor_optimizer.load_state_dict(params['actor_optimizer'])

        self.critic.load_state_dict(params['critic'])
        self.critic_target.load_state_dict(params['critic_target'])
        self.critic_optimizer.load_state_dict(params['critic_optimizer'])
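All three examples call an OUNoise exploration helper that is not included in the listing. A minimal Ornstein-Uhlenbeck noise sketch matching the interface used here (a scale attribute, reset(), and noise()) might look like the following; the mu, theta, and sigma defaults are assumptions, and the noise_dist option from Example no. 3 is omitted. It returns a torch tensor so that callers such as Example no. 2 can move the sample to a device.

import numpy as np
import torch

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, action_dim, scale=0.1, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Restart the process at its mean value.
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) \
            + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        # Return the scaled sample as a tensor.
        return torch.tensor(self.state * self.scale, dtype=torch.float)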
Example no. 2
class MADDPGAgent:
    """
    Defines a Multi-Agent Deep Deterministic Policy Gradient (MADDPG) agent
    """
    def __init__(self,
                 num_agents=2,
                 obs_size=24,
                 act_size=2,
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3,
                 weight_decay_actor=1e-5,
                 weight_decay_critic=1e-4,
                 clip_grad=1.0):
        super(MADDPGAgent, self).__init__()

        # Store hyperparameters
        self.num_agents = num_agents
        self.gamma = gamma
        self.tau = tau
        self.clip_grad = clip_grad

        # Create all the networks
        self.actor = ActorNetwork(obs_size, act_size).to(device)
        self.critic = CriticNetwork(num_agents, obs_size, act_size).to(device)
        self.target_actor = ActorNetwork(obs_size, act_size).to(device)
        self.target_critic = CriticNetwork(num_agents, obs_size,
                                           act_size).to(device)

        # Copy initial network parameters to target networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Initialize training optimizers and OU noise
        self.noise = OUNoise(act_size, scale=1.0)
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=lr_actor,
                                    weight_decay=weight_decay_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=weight_decay_critic)

    def act(self, obs, noise=0.0):
        """ Act using the online actor network """
        obs = obs.to(device)
        action = self.actor(obs) + (noise * self.noise.noise()).to(device)
        action = torch.clamp(action, -1, 1)
        return action

    def target_act(self, obs, noise=0.0):
        """ Act using the target actor network (used for training) """
        obs = obs.to(device)
        action = self.target_actor(obs) + (noise *
                                           self.noise.noise()).to(device)
        action = torch.clamp(action, -1, 1)
        return action

    def update_targets(self):
        """
        Perform soft update of target network parameters based on latest actor/critic parameters
        """
        soft_update(self.target_critic, self.critic, self.tau)
        soft_update(self.target_actor, self.actor, self.tau)

    def train(self, samples):
        """
        Perform a training step for critic and actor networks with soft update
        """

        # Unpack data from replay buffer and convert to tensors
        obs = torch.tensor([exp[0] for exp in samples],
                           dtype=torch.float,
                           device=device)
        act = torch.tensor([exp[1] for exp in samples],
                           dtype=torch.float,
                           device=device)
        reward = torch.tensor([exp[2] for exp in samples],
                              dtype=torch.float,
                              device=device)
        next_obs = torch.tensor([exp[3] for exp in samples],
                                dtype=torch.float,
                                device=device)
        done = torch.tensor([exp[4] for exp in samples],
                            dtype=torch.float,
                            device=device)
        obs_full = torch.tensor([exp[5] for exp in samples],
                                dtype=torch.float,
                                device=device)
        next_obs_full = torch.tensor([exp[6] for exp in samples],
                                     dtype=torch.float,
                                     device=device)
        act_full = torch.tensor([exp[7] for exp in samples],
                                dtype=torch.float,
                                device=device)

        # Critic update
        self.critic_optimizer.zero_grad()
        target_critic_obs = [next_obs_full[:,i,:].squeeze() \
                        for i in range(self.num_agents)]
        target_critic_obs = torch.cat(target_critic_obs, dim=1)
        target_act = [self.target_act(next_obs_full[:,i,:].squeeze()) \
                        for i in range(self.num_agents)]
        target_act = torch.cat(target_act, dim=1)
        with torch.no_grad():
            q_next = self.target_critic(target_critic_obs, target_act)
        q_target = reward + self.gamma * q_next * (1 - done)

        critic_obs = [obs_full[:,i,:].squeeze() \
                        for i in range(self.num_agents)]
        critic_obs = torch.cat(critic_obs, dim=1)
        critic_act = [act_full[:,i,:].squeeze() \
                        for i in range(self.num_agents)]
        critic_act = torch.cat(critic_act, dim=1)
        q = self.critic(critic_obs, critic_act)

        critic_loss = torch.nn.functional.mse_loss(q, q_target.detach())
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                       self.clip_grad)
        self.critic_optimizer.step()

        # Actor update using policy gradient
        self.actor_optimizer.zero_grad()
        actor_act = [self.act(obs_full[:,i,:].squeeze()) \
                     for i in range(self.num_agents)]
        actor_act = torch.cat(actor_act, dim=1)
        actor_loss = -self.critic(critic_obs, actor_act).mean()
        actor_loss.backward()
        # Clip gradients after backward, before the optimizer step
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.clip_grad)
        self.actor_optimizer.step()

        # Update target networks
        self.update_targets()
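MADDPGAgent above and DDPGAgent below also rely on hard_update and soft_update helpers that the listing does not include. A minimal sketch, assuming the (target, source) argument order used in the examples:

def hard_update(target, source):
    # Copy every source parameter into the target network verbatim.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data +
                                (1.0 - tau) * target_param.data)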
Example no. 3
class DDPGAgent:
    def __init__(self,
                 in_actor,
                 hidden_in_actor,
                 hidden_out_actor,
                 out_actor,
                 in_critic,
                 hidden_in_critic,
                 hidden_out_critic,
                 lr_actor=1.0e-3,
                 lr_critic=1.0e-3,
                 noise_dist: str = 'normal',
                 checkpoint_path=None) -> None:
        super(DDPGAgent, self).__init__()

        self.actor = Network(in_actor,
                             hidden_in_actor,
                             hidden_out_actor,
                             out_actor,
                             actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic,
                              1).to(device)
        self.target_actor = Network(in_actor,
                                    hidden_in_actor,
                                    hidden_out_actor,
                                    out_actor,
                                    actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic,
                                     hidden_out_critic, 1).to(device)

        self.noise = OUNoise(out_actor, scale=1.0, noise_dist=noise_dist)
        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=1.e-5)
        if checkpoint_path:
            checkpoint = torch.load(checkpoint_path)
            self.actor.load_state_dict(checkpoint[0]['actor_params'])
            self.target_actor.load_state_dict(checkpoint[0]['actor_params'])
            self.critic.load_state_dict(checkpoint[0]['critic_params'])
            self.target_critic.load_state_dict(checkpoint[0]['critic_params'])

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor(obs) + noise * self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action

    def update(self,
               buffer: ReplayBuffer,
               batchsize: int = 1000,
               tau: float = 0.005,
               discount: float = 0.98):

        states, actions, rewards, states_next, dones = buffer.sample(
            batchsize=batchsize)

        # TD target: r + discount * Q_target(s', a') * (1 - done)
        with torch.no_grad():
            actions_next = self.target_actor(torch.stack(states_next).float())
            input_target_critic = torch.cat(
                [torch.stack(states_next).float(),
                 actions_next.float()], axis=1)
            q_next = self.target_critic(input_target_critic)
            rewards_t = torch.tensor(rewards).float().unsqueeze(1)
            dones_t = torch.tensor(dones).float().unsqueeze(1)
            state_value = rewards_t + discount * q_next * (1 - dones_t)

        input_critic = torch.cat(
            [torch.stack(states).float(),
             torch.stack(actions).float()],
            axis=1)
        state_value_local = self.critic(input_critic)

        critic_loss = (state_value -
                       state_value_local).pow(2).mul(0.5).sum(-1).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # update actor
        actions_new = self.actor(torch.stack(states).float())
        value_critic = self.critic(
            torch.cat([torch.stack(states).float(), actions_new], axis=1))
        loss_actor = -value_critic.mean()

        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()
        soft_update(self.target_actor, self.actor, tau)
        soft_update(self.target_critic, self.critic, tau)

    def update_targets(self, tau=0.005):
        soft_update(self.target_actor, self.actor, tau)
        soft_update(self.target_critic, self.critic, tau)
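A rough usage sketch for DDPGAgent above. The layer sizes, the ReplayBuffer constructor, and the interaction loop are illustrative assumptions; the only constraints taken from the code are that in_critic should match the concatenated observation-plus-action width and that buffer.sample() returns (states, actions, rewards, states_next, dones).

import torch

obs_size, act_size = 24, 2  # hypothetical dimensions

agent = DDPGAgent(in_actor=obs_size, hidden_in_actor=128, hidden_out_actor=64,
                  out_actor=act_size,
                  in_critic=obs_size + act_size, hidden_in_critic=128,
                  hidden_out_critic=64)

buffer = ReplayBuffer(int(1e6))  # assumed constructor; not shown in the listing

# Placeholder loop; a real environment would supply obs, reward, and done.
obs = torch.zeros(obs_size)
for step in range(10000):
    action = agent.act(obs, noise=0.1).detach()
    # ... step the environment and add the transition to the buffer here ...
    # Once the buffer holds at least `batchsize` transitions:
    agent.update(buffer, batchsize=1000, tau=0.005, discount=0.98)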