import math
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class BQN(nn.Module):
    def __init__(self, state_space: int, action_num: int, action_scale: int,
                 learning_rate, device: str):
        super(BQN, self).__init__()

        self.q = QNetwork(state_space, action_num, action_scale).to(device)
        self.target_q = QNetwork(state_space, action_num,
                                 action_scale).to(device)
        self.target_q.load_state_dict(self.q.state_dict())

        # The shared trunk and value head use a smaller learning rate than the
        # per-branch action heads.
        self.optimizer = optim.Adam([
            {'params': self.q.linear_1.parameters(), 'lr': learning_rate / (action_num + 2)},
            {'params': self.q.linear_2.parameters(), 'lr': learning_rate / (action_num + 2)},
            {'params': self.q.value.parameters(), 'lr': learning_rate / (action_num + 2)},
            {'params': self.q.actions.parameters(), 'lr': learning_rate},
        ])
        self.update_freq = 1000
        self.update_count = 0

    def action(self, x):
        # Returns one Q-value tensor per action branch.
        return self.q(x)

    def train_mode(self, n_epi, memory, batch_size, gamma, use_tensorboard,
                   writer):
        state, actions, reward, next_state, done_mask = memory.sample(
            batch_size)
        actions = torch.stack(actions).transpose(0, 1).unsqueeze(-1)
        # Turn the done flag (1 when terminal) into a continuation mask
        # (0 when terminal) so terminal transitions get no bootstrap term.
        done_mask = torch.abs(done_mask - 1)

        # Q-values of the chosen sub-action in each branch: [batch, action_num].
        cur_actions = self.q(state)
        cur_actions = torch.stack(cur_actions).transpose(0, 1)
        cur_actions = cur_actions.gather(2, actions.long()).squeeze(-1)

        # Bootstrap target: per-branch max Q from the target network, averaged
        # over branches, then discounted, masked, and shifted by the reward.
        target_cur_actions = self.target_q(next_state)
        target_cur_actions = torch.stack(target_cur_actions).transpose(0, 1)
        target_cur_actions = target_cur_actions.max(-1, keepdim=True)[0]
        target_action = (done_mask * gamma * target_cur_actions.mean(1) +
                         reward)

        # The same target is regressed against every action branch.
        loss = F.mse_loss(cur_actions,
                          target_action.repeat(1, cur_actions.shape[1]))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update_count += 1
        # Periodically sync the target network with the online network.
        if self.update_count % self.update_freq == 0 and self.update_count > 0:
            self.update_count = 0
            self.target_q.load_state_dict(self.q.state_dict())

        if use_tensorboard:
            writer.add_scalar("Loss/loss", loss, n_epi)
        return loss
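

# The `QNetwork` used by BQN above is not shown in this snippet. The sketch
# below is one plausible definition, consistent with how BQN uses it: shared
# `linear_1`/`linear_2` layers, a single `value` head, an `actions` ModuleList
# of per-branch heads, and a forward pass that returns one Q-value tensor per
# action branch. The hidden width (256), the ReLU activations, and the dueling
# combination are assumptions rather than the original code. Note that the
# `Brain` example below expects a different, single-argument QNetwork.
class QNetwork(nn.Module):
    def __init__(self, state_space: int, action_num: int, action_scale: int):
        super().__init__()
        self.linear_1 = nn.Linear(state_space, 256)
        self.linear_2 = nn.Linear(256, 256)
        # One state-value head shared by all branches.
        self.value = nn.Linear(256, 1)
        # One advantage head per action branch, each scoring `action_scale` bins.
        self.actions = nn.ModuleList(
            [nn.Linear(256, action_scale) for _ in range(action_num)])

    def forward(self, x):
        x = F.relu(self.linear_1(x))
        x = F.relu(self.linear_2(x))
        value = self.value(x)
        # Dueling combination per branch: Q = V + (A - mean(A)).
        return [value + head(x) - head(x).mean(dim=-1, keepdim=True)
                for head in self.actions]
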
class Brain:
    def __init__(self, params):
        self.num_actions = params['num_actions']
        self.device = params['device']
        self.batch_size = params['batch_size']
        self.learning_rate = params['learning_rate']
        self.gamma = params['gamma']
        self.eps_start = params['eps_start']
        self.eps_end = params['eps_end']
        self.eps_decay = params['eps_decay']
        self.policy_net = QNetwork(self.num_actions).to(self.device)
        self.target_net = QNetwork(self.num_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.memory = ReplayMemory(params['replay_memory_size'])
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.learning_rate)
        self.steps_done = 0
        self.q_vals = [0] * self.num_actions
        self.loss = 0

    def decide_action(self, state):
        # Epsilon is annealed exponentially from eps_start toward eps_end.
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * math.exp(
            -1.0 * self.steps_done / self.eps_decay)
        self.steps_done += 1
        with torch.no_grad():
            self.q_vals = self.policy_net(
                torch.from_numpy(state).float().to(self.device).unsqueeze(0))
        sample = random.random()
        if sample > eps_threshold:
            # Exploit: greedy action from the Q-values computed above.
            return self.q_vals.max(1)[1].view(1, 1)
        else:
            # Explore: uniformly random action.
            return torch.tensor([[random.randrange(self.num_actions)]],
                                device=self.device,
                                dtype=torch.long)

    def optimize(self):
        # Skip the update until enough transitions have been collected
        # (assumes ReplayMemory supports len()).
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat([
            torch.tensor(s, device=self.device, dtype=torch.float)
            for s in batch.next_state if s is not None
        ])

        # Batch the remaining fields into tensors on the training device.
        state_batch = torch.tensor(batch.state, device=self.device,
                                   dtype=torch.float)
        action_batch = torch.tensor(batch.action, device=self.device,
                                    dtype=torch.long)
        reward_batch = torch.tensor(batch.reward, device=self.device,
                                    dtype=torch.int)

        # Q(s, a) for the actions that were actually taken.
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch.unsqueeze(1))

        # V(s') from the target network; terminal states keep a value of zero.
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states.unsqueeze(1)).max(1)[0].detach()

        # TD target: r + gamma * V(s').
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        self.loss.backward()
        # Clamp gradients element-wise to [-1, 1] for stability.
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())
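

# The `Transition` namedtuple and `ReplayMemory` buffer that `Brain` relies on
# are not shown here. The sketch below is a minimal version consistent with how
# they are used (memory.sample(batch_size) returns Transition tuples, and
# next_state is stored as None for terminal transitions); the Transition field
# order, the `push` signature, and the deque-based capacity handling are
# assumptions rather than the original definitions.
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        # Store one transition, discarding the oldest once capacity is reached.
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


# One possible way to drive Brain in a training loop (environment API assumed):
#
#   brain = Brain(params)
#   action = brain.decide_action(state)
#   brain.memory.push(state, action, next_state, reward)  # next_state=None if done
#   brain.optimize()
#   if episode % target_update_interval == 0:
#       brain.update_target_network()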