import numpy as np
import torch


class DQN(Trainer):
    def __init__(self, parameters):
        super(DQN, self).__init__(parameters)
        self.replay_buffer = ReplayBuffer(self.buffersize)

    def push_to_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def compute_td_loss(self, batch_size, *args):
        state, action, reward, next_state, done = self.replay_buffer.sample(
            batch_size)

        state = torch.FloatTensor(np.float32(state))
        next_state = torch.FloatTensor(np.float32(next_state))
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(done)

        # Q(s, a) for the actions actually taken.
        q_values = self.current_model(state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        # Double DQN: the online network selects the next action,
        # the target network evaluates it.
        next_q_values = self.current_model(next_state)
        next_q_state_values = self.target_model(next_state)
        next_q_value = next_q_state_values.gather(
            1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)

        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        # Clipped loss: quadratic for |TD error| <= 1, constant (and hence
        # gradient-free) beyond that, matching the original masked version.
        # A smoother variant kept from the source: (td_error + 1) / 2 for
        # large errors, which retains a gradient there.
        td_error = (q_value - expected_q_value.detach()).abs()
        loss = torch.where(td_error.le(1), td_error.pow(2),
                           torch.ones_like(td_error))
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss
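Both trainers assume a ReplayBuffer with push, sample, and __len__, which this section does not define. The sketch below is a minimal deque-based version consistent with how the buffer is used above (uniform sampling, one tuple of arrays per field); its internals are an assumption, not code from the source.

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Fixed-size FIFO buffer of (s, a, r, s', done) transitions."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniform sampling; returns one array per field, matching the
        # unpacking done in compute_td_loss above.
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)
        return (np.stack(state), np.array(action), np.array(reward),
                np.stack(next_state), np.array(done, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)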
writer.flush()  # assumes a SummaryWriter `writer` created earlier

iterations = 1000000
batch_size = 32
gamma = 0.98

losses = []
all_rewards = []
episode_reward = 0

state = env.reset()
for iteration in range(1, iterations + 1):
    # Epsilon-greedy action from the online network.
    action = current_model.act(state, epsilon_by_frame(iteration))

    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0

    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size)
        losses.append(loss.item())

    # Periodically sync the target network with the online network;
    # without this the target used in compute_td_loss never changes.
    # (The interval is a tunable choice, not a value from the source.)
    if iteration % 1000 == 0:
        update_target(current_model, target_model)

    if iteration % 200 == 0:
        plot(iteration, all_rewards, losses, episode_reward)
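The loop relies on two helpers that are referenced but not defined in this section: epsilon_by_frame, an exploration schedule, and update_target, which copies the online weights into the target network. A plausible sketch of both follows, assuming the usual exponential-decay schedule and a hard target copy; the decay constants are illustrative defaults, not values from the source.

import math


def epsilon_by_frame(frame_idx, epsilon_start=1.0, epsilon_final=0.01,
                     epsilon_decay=30000):
    # Exponential decay from epsilon_start towards epsilon_final.
    return epsilon_final + (epsilon_start - epsilon_final) * \
        math.exp(-frame_idx / epsilon_decay)


def update_target(current_model, target_model):
    # Hard update: overwrite the target network's weights in one step.
    target_model.load_state_dict(current_model.state_dict())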
class Rainbow(Trainer):
    def __init__(self, parameters):
        super(Rainbow, self).__init__(parameters)
        self.replay_buffer = ReplayBuffer(self.buffersize)

    def push_to_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def load_model(self):
        # num_atoms, Vmin and Vmax are module-level hyperparameters of the
        # categorical value distribution.
        self.current_model = RainbowDQN(self.env.observation_space.shape[0],
                                        self.env.action_space.n,
                                        num_atoms, Vmin, Vmax)  # input: (1, 84, 84), output: 6
        self.target_model = RainbowDQN(self.env.observation_space.shape[0],
                                       self.env.action_space.n,
                                       num_atoms, Vmin, Vmax)
        if USE_CUDA:
            self.current_model = self.current_model.cuda()
            self.target_model = self.target_model.cuda()
        self.update_target(self.current_model, self.target_model)  # sync nets

    def projection_distribution(self, next_state, rewards, dones):
        batch_size = next_state.size(0)

        delta_z = float(Vmax - Vmin) / (num_atoms - 1)
        support = torch.linspace(Vmin, Vmax, num_atoms)

        # Pick the greedy next action by expected value, then gather that
        # action's probability distribution (not the value-weighted product).
        next_dist = self.target_model(next_state).data.cpu()
        next_action = (next_dist * support).sum(2).max(1)[1]
        next_action = next_action.unsqueeze(1).unsqueeze(1).expand(
            next_dist.size(0), 1, next_dist.size(2))
        next_dist = next_dist.gather(1, next_action).squeeze(1)

        rewards = rewards.unsqueeze(1).expand_as(next_dist)
        dones = dones.unsqueeze(1).expand_as(next_dist)
        support = support.unsqueeze(0).expand_as(next_dist)

        # Bellman update of each atom, clamped to the support.
        Tz = rewards + (1 - dones) * self.gamma * support
        Tz = Tz.clamp(min=Vmin, max=Vmax)
        b = (Tz - Vmin) / delta_z
        l = b.floor().long()
        u = b.ceil().long()

        # Distribute each atom's probability mass onto its two neighbouring
        # atoms, proportionally to their distance.
        offset = torch.linspace(0, (batch_size - 1) * num_atoms, batch_size).long()\
            .unsqueeze(1).expand(batch_size, num_atoms)
        proj_dist = torch.zeros(next_dist.size())
        proj_dist.view(-1).index_add_(0, (l + offset).view(-1),
                                      (next_dist * (u.float() - b)).view(-1))
        proj_dist.view(-1).index_add_(0, (u + offset).view(-1),
                                      (next_dist * (b - l.float())).view(-1))

        return proj_dist

    def compute_td_loss(self, batch_size, *args):
        state, action, reward, next_state, done = self.replay_buffer.sample(
            batch_size)

        state = torch.FloatTensor(np.float32(state))
        next_state = torch.FloatTensor(np.float32(next_state))
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(np.float32(done))

        proj_dist = self.projection_distribution(next_state, reward, done)
        if USE_CUDA:
            # projection_distribution works on the CPU; move the target
            # back so it matches the device of the online network's output.
            proj_dist = proj_dist.cuda()

        # Cross-entropy between the projected target distribution and the
        # predicted distribution for the chosen actions.
        dist = self.current_model(state)
        action = action.unsqueeze(1).unsqueeze(1).expand(
            batch_size, 1, num_atoms)
        dist = dist.gather(1, action).squeeze(1)
        dist.data.clamp_(0.01, 0.99)
        loss = -(proj_dist * dist.log()).sum(1)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Resample the noise in the NoisyNet layers after each update.
        self.current_model.reset_noise()
        self.target_model.reset_noise()

        return loss
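For reference, projection_distribution implements the categorical (C51) projection of the distributional Bellman target back onto the fixed support z_1, ..., z_N. In the code's notation (p_j is next_dist, Delta z is delta_z, m is proj_dist):

\[
\mathcal{T}z_j = \operatorname{clamp}\bigl(r + \gamma (1 - d)\, z_j,\; V_{\min},\, V_{\max}\bigr),
\qquad
b_j = \frac{\mathcal{T}z_j - V_{\min}}{\Delta z},\quad
l = \lfloor b_j \rfloor,\quad u = \lceil b_j \rceil,
\]
\[
m_l \mathrel{+}= p_j\,(u - b_j),
\qquad
m_u \mathrel{+}= p_j\,(b_j - l).
\]

The two index_add_ calls compute exactly these scatter-adds, with offset flattening the batch dimension so every sample writes into its own slice of the projected distribution.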