Example #1
class DDQN:
    def __init__(self,
                 observation_space,
                 action_space,
                 lr=7e-4,
                 gamma=0.99,
                 tau=0.01):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.gamma = gamma
        self.memory = NumpyReplay(100000, observation_space.shape[0],
                                  self.device)
        self.action_space = action_space

        self.epsilon = 0.5
        self.epsilon_decay = 0.9995
        self.min_epsilon = 0.02

        self.tau = tau
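        # online Q-network and a target copy used for stable bootstrapped targets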
        self.dqn = Network(observation_space.shape[0],
                           action_space.n).to(self.device)
        self.dqn_target = Network(observation_space.shape[0],
                                  action_space.n).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        self.optimizer = optim.Adam(self.dqn.parameters(), lr=lr)

    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon, self.min_epsilon)

        if np.random.random() < self.epsilon:
            # epsilon-greedy exploration: one random action per parallel env
            action = [self.action_space.sample() for _ in range(len(state))]
            return action

        state = torch.FloatTensor(state).to(self.device)
        action = self.dqn.forward(state).argmax(dim=-1)
        action = action.cpu().detach().numpy()

        return action

    def remember(self, state, action, reward, new_state, done):
        for i in range(len(state)):
            self.memory.update(state[i], action[i], reward[i], new_state[i],
                               done[i])

    def train(self, batch_size=128, epochs=1):
        if batch_size > self.memory.size:
            return

        for epoch in range(epochs):
            (states, actions, rewards, next_states,
             dones) = self.memory.sample(batch_size)

            actions = actions.unsqueeze(-1)
            rewards = rewards.unsqueeze(-1)
            dones = dones.unsqueeze(-1)

            q = self.dqn.forward(states).gather(-1, actions.long())

            with torch.no_grad():
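                # Double DQN: the online network selects the greedy action,
                # the target network evaluates it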
                a2 = self.dqn.forward(next_states).argmax(dim=-1, keepdim=True)
                q2 = self.dqn_target.forward(next_states).gather(-1, a2)

                target = (rewards + (1 - dones) * self.gamma * q2)

            loss = F.mse_loss(q, target)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            self.update_target()

    def update_target(self):
        # Polyak soft update: target <- (1 - tau) * target + tau * online
        with torch.no_grad():
            for target_param, param in zip(self.dqn_target.parameters(),
                                           self.dqn.parameters()):
                target_param.data.mul_(1 - self.tau)
                target_param.data.add_(param.data, alpha=self.tau)
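
A minimal driver loop for this agent might look like the sketch below, assuming a classic Gym vector environment (pre-0.26 step API) and that Network and NumpyReplay are importable from the surrounding project; the environment id, worker count and step budget are placeholders.

# Hypothetical training loop for the DDQN agent defined above.
import gym

envs = gym.vector.make("CartPole-v1", num_envs=8)
agent = DDQN(envs.single_observation_space, envs.single_action_space)

state = envs.reset()
for step in range(100000):
    action = agent.act(state)
    next_state, reward, done, info = envs.step(action)
    agent.remember(state, action, reward, next_state, done)
    agent.train(batch_size=128)  # no-op until the replay holds a full batch
    state = next_state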
Example #2
class SAC:
    def __init__(self,
                 observation_space,
                 action_space,
                 alpha=0.2,
                 gamma=0.99,
                 tau=0.01,
                 p_lr=7e-4,
                 q_lr=7e-4,
                 a_lr=3e-4,
                 policy_freq=1):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.state_shape = observation_space.shape[0]
        self.action_shape = action_space.shape[0]
        self.action_range = [action_space.low, action_space.high]

        self.alpha = alpha
        self.gamma = gamma
        self.tau = tau
        self.memory = NumpyReplay(1000000, observation_space.shape[0],
                                  self.device)
        self.count = 0
        self.policy_freq = policy_freq

        self.actor = Actor(self.state_shape, self.action_shape).to(self.device)

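        # twin critics with target copies (clipped double-Q learning)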
        self.critic1 = Critic(self.state_shape,
                              self.action_shape).to(self.device)
        self.target_critic1 = Critic(self.state_shape,
                                     self.action_shape).to(self.device)
        self.critic2 = Critic(self.state_shape,
                              self.action_shape).to(self.device)
        self.target_critic2 = Critic(self.state_shape,
                                     self.action_shape).to(self.device)

        for target_param, param in zip(self.target_critic1.parameters(),
                                       self.critic1.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_critic2.parameters(),
                                       self.critic2.parameters()):
            target_param.data.copy_(param.data)

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=p_lr)
        self.critic_optimizer1 = optim.Adam(self.critic1.parameters(), lr=q_lr)
        self.critic_optimizer2 = optim.Adam(self.critic2.parameters(), lr=q_lr)

        # heuristic target entropy: -dim(action space)
        self.target_entropy = -torch.prod(
            torch.Tensor(action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)

    def act(self, state, noise=True):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action, _ = self.actor.sample(state)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0

    def remember(self, state, action, reward, new_state, done):
        for i in range(len(state)):
            self.memory.update(state[i], action[i], reward[i], new_state[i],
                               done[i])

    def train(self, batch_size=64):
        if batch_size > self.memory.size:
            return

        self.count += 1

        (states, actions, rewards, next_states,
         dones) = self.memory.sample(batch_size)

        actions = actions.unsqueeze(-1)
        rewards = rewards.unsqueeze(-1)
        dones = dones.unsqueeze(-1)

        self._train_critics(states, actions, rewards, next_states, dones)
        if self.count % self.policy_freq == 0:
            self._train_actor(states, actions, rewards, next_states, dones)
            self.update_target()

    def _train_critics(self, states, actions, rewards, next_states, dones):
        with torch.no_grad():
            # clipped double-Q target with the entropy bonus subtracted
            next_actions, next_log_pi = self.actor.sample(next_states)
            target_Q1 = self.target_critic1.forward(next_states, next_actions)
            target_Q2 = self.target_critic2.forward(next_states, next_actions)
            target_Q = torch.min(target_Q1,
                                 target_Q2) - self.alpha * next_log_pi
            target_Q = rewards + ((1 - dones) * self.gamma * target_Q)

        current_Q1 = self.critic1(states, actions)
        loss_Q1 = F.mse_loss(current_Q1, target_Q)
        self.critic_optimizer1.zero_grad()
        loss_Q1.backward()
        self.critic_optimizer1.step()

        current_Q2 = self.critic2(states, actions)
        loss_Q2 = F.mse_loss(current_Q2, target_Q)
        self.critic_optimizer2.zero_grad()
        loss_Q2.backward()
        self.critic_optimizer2.step()

    def _train_actor(self, states, actions, rewards, next_states, dones):
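        # policy loss: E[alpha * log_pi - min(Q1, Q2)], i.e. maximise the soft value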
        new_actions, log_pi = self.actor.sample(states)
        min_q = torch.min(self.critic1.forward(states, new_actions),
                          self.critic2.forward(states, new_actions))

        policy_loss = (self.alpha * log_pi - min_q).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

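        # automatic temperature (alpha) tuning toward the target entropy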
        alpha_loss = (self.log_alpha *
                      (-log_pi - self.target_entropy).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.alpha = self.log_alpha.exp().detach()

    def update_target(self):
        # Polyak soft update of both target critics:
        # target <- (1 - tau) * target + tau * online
        with torch.no_grad():
            for target_param, param in zip(self.target_critic1.parameters(),
                                           self.critic1.parameters()):
                target_param.data.mul_(1 - self.tau)
                target_param.data.add_(param.data, alpha=self.tau)

            for target_param, param in zip(self.target_critic2.parameters(),
                                           self.critic2.parameters()):
                target_param.data.mul_(1 - self.tau)
                target_param.data.add_(param.data, alpha=self.tau)
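
A minimal driver loop for this agent might look like the sketch below, assuming a single continuous-control Gym environment (pre-0.26 step API) and that Actor, Critic and NumpyReplay are importable from the surrounding project; the environment id and step budget are placeholders. Because act() expects a single observation while remember() iterates over a batch, single transitions are wrapped in one-element lists.

# Hypothetical training loop for the SAC agent defined above.
import gym

env = gym.make("Pendulum-v1")
agent = SAC(env.observation_space, env.action_space)

state = env.reset()
for step in range(100000):
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)
    agent.remember([state], [action], [reward], [next_state], [done])
    agent.train(batch_size=64)  # no-op until the replay holds a full batch
    state = env.reset() if done else next_state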
Example #3
class SharedPPO:
    def __init__(self,
                 env_num,
                 observation_space,
                 action_space,
                 lr=7e-4,
                 steps=64,
                 gamma=0.99,
                 lam=0.95,
                 entropy_coef=0.005,
                 clip=0.1):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.gamma = gamma
        self.lam = lam
        self.entropy_coef = entropy_coef
        self.clip = clip
        self.steps = steps

        self.memory = NumpyReplay(steps, env_num, observation_space.shape[0],
                                  self.device)

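        # learner network plus a copy used for acting; the copy is re-synced
        # after each update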
        self.actorcritic = ActorCritic(observation_space.shape[0],
                                       action_space.n).to(self.device)
        self.actorcritic_optimizer = optim.Adam(self.actorcritic.parameters(),
                                                lr=lr,
                                                eps=1e-6)
        self.target_actorcritic = ActorCritic(observation_space.shape[0],
                                              action_space.n).to(self.device)
        self.target_actorcritic.load_state_dict(self.actorcritic.state_dict())

    def act(self, state):
        state = torch.FloatTensor(state).to(self.device)
        probs, _ = self.target_actorcritic.forward(state)
        action = probs.sample()
        return action.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        state_torch = torch.FloatTensor(state).to(self.device)
        probs, _ = self.actorcritic.forward(state_torch)
        action_torch = torch.LongTensor(action).to(self.device)
        log_probs = probs.log_prob(action_torch)
        self.memory.update(state, action, log_probs, reward, new_state, done)

    def compute_gae(self, values, dones, rewards):
        returns = torch.zeros_like(rewards).to(self.device)
        advantages = torch.zeros_like(rewards).to(self.device)
        deltas = torch.zeros_like(rewards).to(self.device)

        # bootstrap the final step with the critic's value estimate, since
        # V(s_{T+1}) is not stored
        returns[-1] = rewards[-1] + self.gamma * (1 - dones[-1]) * values[-1]
        advantages[-1] = returns[-1] - values[-1]

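        # backward GAE recursion: delta_t = r_t + gamma * (1 - d_t) * V(s_{t+1}) - V(s_t),
        # A_t = delta_t + gamma * lambda * (1 - d_t) * A_{t+1}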
        for i in reversed(range(len(rewards) - 1)):
            delta = rewards[i] + self.gamma * (
                1 - dones[i]) * values[i + 1] - values[i]
            advantages[i] = delta + self.gamma * self.lam * (
                1 - dones[i]) * advantages[i + 1]
            returns[i] = advantages[i] + values[i]

        return returns, (advantages - advantages.mean()) / (advantages.std() +
                                                            1e-10)

    def compute_loss(self, states, actions, logp, advantages, returns):
        new_probs, v = self.actorcritic.forward(states)

        new_logprobs = new_probs.log_prob(actions)
        entropy = new_probs.entropy().mean()
        ratios = torch.exp(
            new_logprobs.unsqueeze(-1) - logp.unsqueeze(-1).detach())

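        # PPO clipped surrogate objective on the probability ratios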
        surr1 = ratios * advantages.detach()
        surr2 = torch.clamp(ratios, 1 - self.clip,
                            1 + self.clip) * advantages.detach()

        policy_loss = -torch.min(surr1, surr2).mean()
        value_loss = 0.5 * F.mse_loss(v, returns.detach())
        entropy_loss = -self.entropy_coef * entropy

        return policy_loss, value_loss, entropy_loss

    def train(self, epochs=8):
        if self.memory.length < self.steps:
            return

        (states, actions, log_probs, rewards, next_states,
         dones) = self.memory.sample()

        rewards = rewards.unsqueeze(-1)
        dones = dones.unsqueeze(-1)
        log_probs = log_probs.detach()

        # value estimates for the GAE targets; no gradient tracking needed here
        with torch.no_grad():
            _, v = self.actorcritic.forward(states)
        returns, advantages = self.compute_gae(v, dones, rewards)

        for _ in range(epochs):
            self.actorcritic_optimizer.zero_grad()

            policy_loss, value_loss, entropy_loss = self.compute_loss(
                states, actions, log_probs, advantages, returns)
            total_loss = policy_loss + value_loss + entropy_loss

            total_loss.backward()
            self.actorcritic_optimizer.step()

        self.target_actorcritic.load_state_dict(self.actorcritic.state_dict())
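
A minimal rollout-and-update loop for this agent might look like the sketch below, assuming a classic Gym vector environment (pre-0.26 step API) and that ActorCritic and NumpyReplay are importable from the surrounding project; the environment id, worker count and step budget are placeholders.

# Hypothetical training loop for the SharedPPO agent defined above.
import gym

env_num = 8
envs = gym.vector.make("CartPole-v1", num_envs=env_num)
agent = SharedPPO(env_num, envs.single_observation_space,
                  envs.single_action_space, steps=64)

state = envs.reset()
for step in range(100000):
    action = agent.act(state)
    next_state, reward, done, info = envs.step(action)
    agent.remember(state, action, reward, next_state, done)
    agent.train(epochs=8)  # no-op until `steps` transitions are buffered
    state = next_state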