Example #1
import gym
import torch
import torch.nn as nn
import torch.optim as optim

# ActorNetwork, CriticNetwork, and index2onehot are project-local helpers
# assumed to be importable from the surrounding package.


class Learner:
    def __init__(self, opt, q_batch):
        self.opt = opt
        self.q_batch = q_batch

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed)
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)
        self.actor.share_memory()
        self.critic.share_memory()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=opt.lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=opt.lr)

    def learning(self):
        torch.manual_seed(self.opt.seed)

        while True:
            # block until the next batch of trajectories arrives from the actors
            states, actions, rewards = self.q_batch.get(block=True)

            onehot_actions = torch.FloatTensor(
                index2onehot(actions, self.n_act)).to(self.device)

            # update actor network
            self.actor_optimizer.zero_grad()
            action_log_probs = self.actor(states)
            action_log_probs = torch.sum(action_log_probs * onehot_actions, 1)
            values = self.critic(states)
            advantages = rewards - values.detach()
            pg_loss = -torch.sum(action_log_probs * advantages)
            actor_loss = pg_loss
            actor_loss.backward()
            self.actor_optimizer.step()

            # update critic network
            self.critic_optimizer.zero_grad()
            target_values = rewards
            critic_loss = nn.MSELoss()(values, target_values)
            critic_loss.backward()
            self.critic_optimizer.step()
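
The learner above only consumes pre-collected batches, so it is meant to run alongside actor processes that roll out the environment and push trajectories into q_batch; share_memory() is what lets those processes read the latest weights. A minimal sketch of that wiring, assuming a hypothetical actor_process worker and parse_args() helper (neither is part of the example above):

import torch.multiprocessing as mp

if __name__ == "__main__":
    opt = parse_args()                   # hypothetical: yields env, seed, lr, ...
    q_batch = mp.Queue(maxsize=128)      # bounded queue of training batches
    learner = Learner(opt, q_batch)

    # each worker runs its own environment copy and enqueues batches for the
    # learner; actor_process is a stand-in for the rollout loop
    workers = [mp.Process(target=actor_process, args=(opt, q_batch, learner.actor))
               for _ in range(4)]
    for w in workers:
        w.start()

    learner.learning()                   # blocks on q_batch.get(block=True)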
Example #2
# Requires the same imports as Example #1 (gym, torch, torch.nn, torch.optim);
# ActorNetwork, CriticNetwork, and idx2onehot are project-local helpers.
class Learner(object):
    def __init__(self, opt, q_batch):
        self.opt = opt
        self.q_batch = q_batch

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed)
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)
        self.actor.share_memory()
        self.critic.share_memory()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=opt.lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=opt.lr)

    def learning(self):
        torch.manual_seed(self.opt.seed)
        # truncation thresholds c_hat and rho_hat for the V-trace importance weights
        coef_hat = torch.full((self.opt.batch_size, self.opt.n_step),
                              float(self.opt.coef_hat)).to(self.device)
        rho_hat = torch.full((self.opt.batch_size, self.opt.n_step),
                             float(self.opt.rho_hat)).to(self.device)
        while True:
            # block until the next batch of trajectories arrives from the actors
            states, actions, rewards, dones, action_log_probs = self.q_batch.get(block=True)

            logit_log_probs = self.actor(states)
            V = self.critic(states).view(self.opt.batch_size, self.opt.n_step) * (1 - dones)

            action_probs = torch.exp(action_log_probs)
            logit_probs = torch.exp(logit_log_probs)

            # truncated importance-sampling ratios between the learner's policy
            # and the behaviour policy that generated the batch
            is_rate = torch.prod(logit_probs / (action_probs + 1e-6), dim=-1).detach()
            coef = torch.min(coef_hat, is_rate) * (1 - dones)
            rho = torch.min(rho_hat, is_rate) * (1 - dones)

            # V-trace targets, computed backwards through the trace:
            # v_s = V(x_s) + rho_s * delta_s + gamma * c_s * (v_{s+1} - V(x_{s+1}))
            v_trace = torch.zeros((self.opt.batch_size, self.opt.n_step)).to(self.device)
            target_V = V.detach()
            v_trace[:, -1] = target_V[:, -1]  # bootstrap the final step from the critic
            for rev_step in reversed(range(states.size(1) - 1)):
                v_trace[:, rev_step] = target_V[:, rev_step] \
                                       + rho[:, rev_step] * (rewards[:, rev_step] + self.opt.gamma*target_V[:, rev_step+1] - target_V[:, rev_step]) \
                                       + self.opt.gamma * coef[:, rev_step] * (v_trace[:, rev_step+1] - target_V[:, rev_step+1])

            # actor loss
            onehot_actions = torch.FloatTensor(
                idx2onehot(actions.cpu().numpy(), self.opt.batch_size, self.n_act)).to(self.device)
            logit_log_probs = torch.sum(logit_log_probs * onehot_actions, dim=-1)
            advantages = rewards + self.opt.gamma * v_trace - V
            pg_loss = -torch.sum(logit_log_probs * advantages.detach())
            actor_loss = pg_loss

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update critic network: regress V toward the V-trace targets
            critic_loss = torch.mean((v_trace.detach() - V)**2)

            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()
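
The core of Example #2 is the V-trace correction from IMPALA: the n-step targets are rebuilt backwards through the trace, with each temporal-difference term scaled by the truncated importance weights rho and c. A standalone sketch of that recursion, factored into a function so it can be checked in isolation (the (batch, n_step) shapes and the gamma argument mirror the learner above; this is an illustration, not the repository's code):

import torch

def vtrace_targets(rewards, values, rho, coef, gamma):
    # rewards, values, rho, coef: (batch, n_step) tensors; rho and coef are the
    # importance-sampling ratios already truncated at rho_hat / c_hat, and
    # values is expected to be detached (no gradient flows through the targets)
    v_trace = torch.zeros_like(values)
    v_trace[:, -1] = values[:, -1]  # bootstrap the final step from the critic
    for s in reversed(range(values.size(1) - 1)):
        delta = rho[:, s] * (rewards[:, s] + gamma * values[:, s + 1] - values[:, s])
        v_trace[:, s] = values[:, s] + delta \
            + gamma * coef[:, s] * (v_trace[:, s + 1] - values[:, s + 1])
    return v_trace

With rho = coef = 1 everywhere (the on-policy case), the targets reduce to plain n-step returns bootstrapped from the critic's value at the end of the trace, which is a quick sanity check for the recursion.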