Example #1
File: trainDQN.py  Project: huestisk/chess

import numpy as np
import torch
from torch.autograd import Variable

# Trainer and ReplayBuffer are project-local classes defined elsewhere in the repository.


class DQN(Trainer):

    def __init__(self, parameters):
        super(DQN, self).__init__(parameters)
        self.replay_buffer = ReplayBuffer(self.buffersize)

    def push_to_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def compute_td_loss(self, batch_size, *args):
        state, action, reward, next_state, done = self.replay_buffer.sample(
            batch_size)

        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action))
        reward = Variable(torch.FloatTensor(reward))
        done = Variable(torch.FloatTensor(done))

        q_values = self.current_model(state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

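        # Double DQN target: the online network picks the argmax action, the target network evaluates it.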
        next_q_values = self.current_model(next_state)
        next_q_state_values = self.target_model(next_state)
        next_q_value = next_q_state_values.gather(
            1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)

        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

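        # Clipped TD loss: squared error where |TD error| <= 1, clamped to a constant 1 above that
        # (so transitions with large errors contribute no gradient).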
        loss = (q_value - Variable(expected_q_value.data)).abs()
        loss[loss.le(1)] = loss[loss.le(1)].pow(2)
        loss[loss.gt(1)] = 1 #(loss[loss.gt(1)] + 1) / 2
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss
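
The excerpt above leans on pieces that are not shown here: the Trainer base class supplies current_model, target_model, optimizer, gamma, and buffersize, and ReplayBuffer supplies the push/sample/__len__ interface the loss code calls. As a rough sketch of that interface (an assumption for illustration, not the project's actual implementation), a uniform replay buffer could look like this:

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Fixed-capacity FIFO store of (state, action, reward, next_state, done) transitions."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a batch and regroup it field by field.
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)
        return (np.stack(state), np.array(action),
                np.array(reward, dtype=np.float32),
                np.stack(next_state), np.array(done, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)
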
Example #2
    writer.flush()  # tail of a helper function that is cut off at the top of this excerpt

iterations = 1000000
batch_size = 32
gamma      = 0.98

losses = []
all_rewards = []
episode_reward = 0

state = env.reset()
for iteration in range(1, iterations + 1):
    action = current_model.act(state, epsilon_by_frame(iteration))
    
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    
    state = next_state
    episode_reward += reward
    
    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        
    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size)
        losses.append(loss.item())
        
    if iteration % 200 == 0:
        plot(iteration, all_rewards, losses, episode_reward)
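
In the loop above, epsilon_by_frame, plot, and writer come from earlier parts of the script and are not shown. A common choice for the exploration schedule, assumed here for illustration rather than taken from the original file, is an exponentially decaying epsilon:

import math

epsilon_start = 1.0    # act fully at random at the start of training
epsilon_final = 0.01   # residual exploration after the schedule has decayed
epsilon_decay = 30000  # decay time constant, in iterations

def epsilon_by_frame(frame_idx):
    # Exponentially interpolate from epsilon_start down to epsilon_final.
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)
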
Example #3
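# This excerpt relies on Trainer, ReplayBuffer, RainbowDQN, and the module-level constants
# num_atoms, Vmin, Vmax, plus a USE_CUDA flag, all defined elsewhere in its source file.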
class Rainbow(Trainer):
    def __init__(self, parameters):
        super(Rainbow, self).__init__(parameters)
        self.replay_buffer = ReplayBuffer(self.buffersize)

    def push_to_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def load_model(self):
        self.current_model = RainbowDQN(self.env.observation_space.shape[0],
                                        self.env.action_space.n, num_atoms,
                                        Vmin,
                                        Vmax)  # input:(1,84,84), output:6
        self.target_model = RainbowDQN(self.env.observation_space.shape[0],
                                       self.env.action_space.n, num_atoms,
                                       Vmin, Vmax)

        if USE_CUDA:
            self.current_model = self.current_model.cuda()
            self.target_model = self.target_model.cuda()

        self.update_target(self.current_model, self.target_model)  # sync nets

    def projection_distribution(self, next_state, rewards, dones):
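        # C51 / categorical DQN projection: map the Bellman-updated atoms back onto the fixed support.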
        batch_size = next_state.size(0)

        delta_z = float(Vmax - Vmin) / (num_atoms - 1)
        support = torch.linspace(Vmin, Vmax, num_atoms)

        # Choose the greedy next action by its expected value under the target net,
        # then keep that action's probability distribution over the atoms.
        next_dist = self.target_model(next_state).data.cpu()
        next_action = (next_dist * support).sum(2).max(1)[1]
        next_action = next_action.unsqueeze(1).unsqueeze(1).expand(
            next_dist.size(0), 1, next_dist.size(2))
        next_dist = next_dist.gather(1, next_action).squeeze(1)

        rewards = rewards.unsqueeze(1).expand_as(next_dist)
        dones = dones.unsqueeze(1).expand_as(next_dist)
        support = support.unsqueeze(0).expand_as(next_dist)

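        # Distributional Bellman update of the support (discount factor hardcoded to 0.99 here).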
        Tz = rewards + (1 - dones) * 0.99 * support
        Tz = Tz.clamp(min=Vmin, max=Vmax)
        b = (Tz - Vmin) / delta_z
        l = b.floor().long()
        u = b.ceil().long()

        offset = torch.linspace(0, (batch_size - 1) * num_atoms, batch_size).long()\
            .unsqueeze(1).expand(batch_size, num_atoms)

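        # Split each projected atom's probability mass between its two neighboring support bins l and u.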
        proj_dist = torch.zeros(next_dist.size())
        proj_dist.view(-1).index_add_(0, (l + offset).view(-1),
                                      (next_dist * (u.float() - b)).view(-1))
        proj_dist.view(-1).index_add_(0, (u + offset).view(-1),
                                      (next_dist * (b - l.float())).view(-1))

        return proj_dist

    def compute_td_loss(self, batch_size, *args):
        state, action, reward, next_state, done = self.replay_buffer.sample(
            batch_size)

        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action))
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(np.float32(done))

        proj_dist = self.projection_distribution(next_state, reward, done)

        dist = self.current_model(state)
        action = action.unsqueeze(1).unsqueeze(1).expand(
            batch_size, 1, num_atoms)
        dist = dist.gather(1, action).squeeze(1)
        dist.data.clamp_(0.01, 0.99)
        loss = -(Variable(proj_dist) * dist.log()).sum(1)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

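        # Resample the exploration noise in the NoisyNet layers after each gradient step.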
        self.current_model.reset_noise()
        self.target_model.reset_noise()

        return loss
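
The load_model method above calls self.update_target(self.current_model, self.target_model) to keep the target network in sync with the online network, but the helper itself is provided by the surrounding project and is not shown. A minimal sketch of the usual hard update, assuming plain PyTorch modules:

def update_target(current_model, target_model):
    # Hard update: copy the online network's weights into the target network.
    target_model.load_state_dict(current_model.state_dict())

In a training loop like the one in Example #2, this is typically called every fixed number of iterations, for example: if iteration % 1000 == 0: update_target(current_model, target_model).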