def test_zero_step(self):
    self.memory = ReplayMemory(capacity=10, multi_step_n=0)
    for i in range(5):
        a = Transition([0, 1, 2, i], 0, [4, 5, 6, i*i], 1, False)
        self.memory.push(a)
    final = Transition([0, 1, 2, 10], 0, [4, 5, 6, 100], 10, True)
    self.memory.push(final)
    self.assertEqual(self.memory.memory[0].r, 1)
    self.assertEqual(self.memory.memory[3].r, 1)
    self.assertEqual(self.memory.memory[4].r, 1)
    self.assertEqual(self.memory.memory[5].r, 10)
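# The replay-memory tests in this section assume a Transition record with
# fields (s, a, s1, r, done) and a ReplayMemory that holds at most `capacity`
# entries and, when multi_step_n > 0, folds future rewards into r. Neither
# class is shown here; the sketch below is a minimal interface inferred from
# the assertions, not the actual implementation under test.
from collections import namedtuple

# Field names mirror the attribute accesses in the tests (e.g. `.r`).
Transition = namedtuple('Transition', ('s', 'a', 's1', 'r', 'done'))


class ReplayMemory:
    """Minimal fixed-capacity buffer sketch matching the test interface."""

    def __init__(self, capacity, multi_step_n=0):
        self.capacity = capacity
        self.multi_step_n = multi_step_n
        self.memory = []

    def push(self, transition):
        # Evict the oldest entry once the buffer is full, so that
        # len(self.memory) never exceeds capacity (as test_append expects).
        if len(self.memory) >= self.capacity:
            self.memory.pop(0)
        self.memory.append(transition)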
def run_episode(self):
    """
    Train an NEC agent for a single episode:
        Interact with environment
        Append (state, action, reward) transitions to transition queue
        Call update at the end of the episode
    """
    if self.epsilon > self.final_epsilon:
        self.epsilon = self.epsilon * self.epsilon_decay
    state = self.env.reset()
    if self.environment_type == 'fourrooms':
        fewest_steps = self.env.shortest_path_length(self.env.state)
    total_steps = 0
    total_reward = 0
    total_frames = 0
    done = False
    while not done:
        state_embedding = torch.tensor(state).permute(2, 0, 1)  # (C, H, W)
        state_embedding = state_embedding.unsqueeze(0).to(self.device)
        state_embedding = self.cnn(state_embedding)
        action = self.choose_action(state_embedding)
        next_state, reward, done, _ = self.env.step(action)
        self.transition_queue.append(Transition(state, action, reward))
        total_reward += reward
        total_frames += self.env.skip
        total_steps += 1
        state = next_state
    self.update()
    if self.environment_type == 'fourrooms':
        n_extra_steps = total_steps - fewest_steps
        return n_extra_steps, total_frames, total_reward
    else:
        return total_frames, total_reward
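# choose_action is not shown in this snippet. In NEC it is typically an
# epsilon-greedy choice over the per-action Q estimates produced by the DND
# lookups. The sketch below is a hedged guess under that assumption; the
# `lookup` method name on the DND is hypothetical and may differ from the
# actual implementation.
import random

import torch


def choose_action(self, state_embedding):
    """Hypothetical epsilon-greedy selection over DND Q estimates."""
    if random.random() < self.epsilon:
        return random.randint(0, self.env.action_space.n - 1)
    # One Q estimate per action, each produced by that action's DND.
    q_values = torch.cat([dnd.lookup(state_embedding) for dnd in self.dnd_list])
    return int(torch.argmax(q_values).item())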
def optimize_model(self):
    if len(self.memory) < self.batch_size:
        return
    transitions = self.memory.sample(self.batch_size)
    # This converts a batch-array of Transitions
    # to a Transition of batch-arrays.
    batch = Transition(*zip(*transitions))
    next_states_batch = torch.cat(batch.next_state).view(
        self.batch_size, -1).to(self.device)
    state_batch = torch.cat(batch.state).view(self.batch_size, -1).to(self.device)
    action_batch = torch.cat(batch.action).view(self.batch_size, -1).to(self.device)
    reward_batch = torch.cat(batch.reward).view(self.batch_size, -1).to(self.device)

    # Compute loss
    loss = self._compute_loss(state_batch, action_batch,
                              next_states_batch, reward_batch)

    # Optimize the model
    self.optimizer.zero_grad()
    loss.backward()
    # Clip gradients element-wise if configured
    if self.grad_clip is not None:
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-self.grad_clip, self.grad_clip)
    # Update policy net weights
    self.optimizer.step()
    # Update target net weights
    self._update_target()
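# _compute_loss and _update_target are internal helpers that are not shown
# here. The sketch below is one plausible implementation, not the actual one:
# a standard one-step DQN target with Huber loss, and a Polyak (soft) target
# update. The attribute names target_net and gamma, and the tau default, are
# assumptions.
import torch
import torch.nn.functional as F


def _compute_loss(self, states, actions, next_states, rewards):
    """One-step TD error against a target network (assumed DQN-style target)."""
    q = self.policy_net(states).gather(1, actions.long())
    with torch.no_grad():
        next_q = self.target_net(next_states).max(1, keepdim=True)[0]
    target = rewards + self.gamma * next_q
    return F.smooth_l1_loss(q, target)


def _update_target(self, tau=0.005):
    """Polyak (soft) update of the target network toward the policy network."""
    for t_param, p_param in zip(self.target_net.parameters(),
                                self.policy_net.parameters()):
        t_param.data.mul_(1.0 - tau).add_(tau * p_param.data)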
def test_update(self):
    for i in range(10):
        a = Transition([0, 1, 2, i], 0, [4, 5, 6, i*i], 0, True)
        self.memory.push(a)
    self.memory.update([1, 3], [2, 5])
    self.assertEqual(self.memory.errors[1], 2.1)
    self.assertEqual(self.memory.errors[3], 5.1)
def optimize_policy_model(self):
    """
    Performs a single step of optimization for the policy model.
    :return: the loss on the sampled batch
    """
    if self.memory.length() < self.batch_size:
        return

    # sample a batch
    transitions = self.memory.sample_batch(self.batch_size)
    one_batch = Transition(*zip(*transitions))

    # create a mask of non-final states
    non_final_mask = torch.tensor(
        tuple(map(lambda s: s is not None, one_batch.next_state)),
        device=self.device, dtype=torch.uint8)  # [128]
    non_final_next_states = torch.cat(
        [s for s in one_batch.next_state if s is not None])  # [< 128, 3, 40, 80]

    # concatenate all batch elements into one
    state_batch = torch.cat(one_batch.state)  # [128, 3, 40, 80]
    action_batch = torch.cat(one_batch.action)  # [128, 1]
    reward_batch = torch.cat(one_batch.reward)  # [128]

    state_batch = state_batch.to(self.device)
    non_final_next_states = non_final_next_states.to(self.device)

    curr_state_values = self.policy_model(state_batch)  # [128, 2]
    curr_state_action_values = curr_state_values.gather(1, action_batch)  # [128, 1]

    # Get V(s_{t+1}) for all next states. By definition we set V(s) = 0
    # if s is a terminal state.
    next_state_values = torch.zeros(self.batch_size, device=self.device)  # [128]
    next_state_values[non_final_mask] = self.target_model(
        non_final_next_states).max(1)[0].detach()  # [< 128]

    # Get the expected Q values
    expected_state_action_values = (
        next_state_values * self.config.gamma) + reward_batch  # [128]

    # compute loss: temporal difference error
    loss = self.loss(curr_state_action_values,
                     expected_state_action_values.unsqueeze(1))

    # optimizer step
    self.optim.zero_grad()
    loss.backward()
    for param in self.policy_model.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optim.step()

    return loss
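# The attributes used above (policy_model, target_model, loss, optim, config)
# are initialized elsewhere in the agent. The sketch below shows one plausible
# setup, assuming a Huber loss and an RMSprop optimizer; the class name,
# constructor arguments, and optimizer choice are assumptions, not the actual
# implementation.
import torch
import torch.nn as nn
import torch.optim as optim


class DQNAgentSketch:
    """Hypothetical container for the attributes optimize_policy_model uses."""

    def __init__(self, config, policy_model, target_model, memory):
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.batch_size = config.batch_size
        self.memory = memory
        self.policy_model = policy_model.to(self.device)
        self.target_model = target_model.to(self.device)
        # The target network starts as a frozen copy of the policy network.
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()
        self.loss = nn.SmoothL1Loss()  # Huber loss for the TD error
        self.optim = optim.RMSprop(self.policy_model.parameters())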
def test_sample(self):
    for i in range(10):
        a = Transition([0, 1, 2, i], 0, [4, 5, 6, i*i], 0, True)
        self.memory.push(a)
    s, a, s1, r, done = self.memory.sample(2)
    self.assertEqual(s.shape, (2, 4))
    self.assertEqual(a.shape, (2, 1))
    self.assertEqual(s1.shape, (2, 4))
    self.assertEqual(r.shape, (2, 1))
    self.assertEqual(done.shape, (2, 1))
def warmup(self):
    """
    Warmup the DND with values from an episode with a random policy
    """
    state = self.env.reset()
    total_reward = 0
    total_frames = 0
    done = False
    while not done:
        action = random.randint(0, self.env.action_space.n - 1)
        next_state, reward, done, _ = self.env.step(action)
        total_reward += reward
        total_frames += self.env.skip
        self.transition_queue.append(Transition(state, action, reward))
        state = next_state

    for t in range(len(self.transition_queue)):
        tr = self.transition_queue[t]
        state_embedding = torch.tensor(tr.state).permute(2, 0, 1)  # (C, H, W)
        state_embedding = state_embedding.unsqueeze(0).to(self.device)
        state_embedding = self.cnn(state_embedding)
        action = tr.action
        dnd = self.dnd_list[action]

        Q_N = self.Q_lookahead(t, True).to(self.device)
        if dnd.keys_to_be_inserted is None and dnd.keys is None:
            dnd.insert(state_embedding, Q_N.detach().unsqueeze(0))
        else:
            embedding_index = dnd.get_index(state_embedding)
            if embedding_index is None:
                state_embedding = state_embedding.detach()
                dnd.insert(state_embedding, Q_N.detach().unsqueeze(0))
            else:
                Q = self.Q_update(dnd.values[embedding_index], Q_N)
                dnd.update(Q.detach(), embedding_index)
        self.replay_memory.push(tr.state, action, Q_N.detach())

    for dnd in self.dnd_list:
        dnd.commit_insert()
    # Clear out transition queue
    self.transition_queue = []
    return total_frames, total_reward
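# Both warmup variants rely on Q_lookahead (the N-step bootstrapped return
# used by NEC) and Q_update (a tabular-style update for a key that already
# exists in the DND). Neither is shown in this section. The standalone
# functions below sketch what they compute; the names n_step_return and
# q_update and the alpha default are illustrative, not the actual methods.

def n_step_return(rewards, bootstrap_value, gamma):
    """Q^(N) target: discounted sum of the next N rewards plus a discounted
    bootstrap value (max_a' Q(s_{t+N}, a') from the DND, or 0 at episode end)."""
    g = 0.0
    for j, r in enumerate(rewards):
        g += (gamma ** j) * r
    return g + (gamma ** len(rewards)) * bootstrap_value


def q_update(q_old, q_n, alpha=0.1):
    """Move a stored DND value toward the new N-step estimate."""
    return q_old + alpha * (q_n - q_old)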
def train_policy_with_a_batch(replay_memory, policy, target, batch_size,
                              optimizer, gamma):
    if len(replay_memory) < batch_size * 10:
        return
    transitions = replay_memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor(
        list(map(lambda x: x is not None, batch.next_state)),
        device=device, dtype=torch.uint8)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action).unsqueeze(1)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = policy(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = torch.zeros(batch_size, device=device)
    next_state_values[non_final_mask] = target(
        non_final_next_states).max(1)[0].detach()

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
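# train_policy_with_a_batch depends on module-level names that are not shown
# here: a Transition namedtuple, a device, and F for the Huber loss. The lines
# below sketch assumed definitions in the style of the PyTorch DQN tutorial;
# treat them as one plausible setup rather than the actual module.
from collections import namedtuple

import torch
import torch.nn.functional as F

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Note that training only starts once the buffer holds ten batches' worth of
# transitions (batch_size * 10), a warm-up heuristic specific to this function.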
def episode(self):
    """
    Train an NEC agent for a single episode
    Interact with environment on-policy and append all (state, action, reward)
    transitions to transition queue
    Call update at the end of every episode
    """
    if self.epsilon > self.final_epsilon:
        self.epsilon = self.epsilon * self.epsilon_decay
    state = self.env.reset()
    total_reward = 0
    done = False
    while not done:
        state_embedding = self.embedding_network(
            Variable(Tensor(state)).unsqueeze(0))
        # action = self.choose_action(state_embedding)
        next_state, reward, done, action = self.env.step()
        self.transition_queue.append(Transition(state, action, reward))
        total_reward += reward
        state = next_state
    self.update()
    return total_reward
def warmup(self):
    """
    Warmup the DND with values from an episode with a random policy
    """
    state = self.env.reset()
    total_reward = 0
    done = False
    while not done:
        # action = random.randint(0, self.env.action_space_n - 1)
        next_state, reward, done, action = self.env.step()
        total_reward += reward
        self.transition_queue.append(Transition(state, action, reward))
        state = next_state

    for t in range(len(self.transition_queue)):
        transition = self.transition_queue[t]
        state = Variable(Tensor(transition.state)).unsqueeze(0)
        action = transition.action
        state_embedding = self.embedding_network(state)
        dnd = self.dnd_list[action]

        Q_N = self.Q_lookahead(t, True)
        if dnd.keys_to_be_inserted is None and dnd.keys is None:
            dnd.insert(state_embedding, Q_N.detach().unsqueeze(0))
        else:
            embedding_index = dnd.get_index(state_embedding)
            if embedding_index is None:
                dnd.insert(state_embedding.detach(),
                           Q_N.detach().unsqueeze(0))
            else:
                Q = self.Q_update(dnd.values[embedding_index], Q_N)
                dnd.update(Q.detach(), embedding_index)
        self.replay_memory.push(transition.state, action, Q_N)

    for dnd in self.dnd_list:
        dnd.commit_insert()
    # Clear out transition queue
    self.transition_queue = []
    return total_reward
# (inside the per-step interaction loop)
    mask = torch.Tensor([done]).to(device)
    reward = torch.Tensor([reward]).to(device)
    next_state = torch.Tensor([next_state]).to(device)

    memory.push(state, action, mask, next_state, reward)

    state = next_state

    epoch_value_loss = 0
    epoch_policy_loss = 0

    if len(memory) > args.batch_size:
        transitions = memory.sample(args.batch_size)
        # Transpose the batch
        # (see http://stackoverflow.com/a/19343/3343043 for a detailed explanation).
        batch = Transition(*zip(*transitions))

        # Update the actor and critic according to the batch
        value_loss, policy_loss = agent.update_params(batch)
        epoch_value_loss += value_loss
        epoch_policy_loss += policy_loss

    if done:
        break

# (after the episode loop)
rewards.append(epoch_return)
value_losses.append(epoch_value_loss)
policy_losses.append(epoch_policy_loss)
writer.add_scalar('epoch/return', epoch_return, epoch)
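# The push call above stores five fields per step, including a done mask, so
# the Transition used by this training loop differs from the test fixtures
# earlier in the section. A minimal assumed definition (field order inferred
# from the push call; the real module may name or order the fields differently):
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'mask', 'next_state', 'reward'))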
def run_loop(self, env, max_frames=0):
    """A run loop to have agents and an environment interact."""
    total_frames = 0
    start_time = time.time()

    action_spec = env.action_spec()
    observation_spec = env.observation_spec()
    self.setup(observation_spec, action_spec)

    try:
        while True:
            obs = env.reset()[0]
            # Remove unit selection from the equation by selecting the
            # friendly unit on every new game.
            select_friendly = self.select_friendly_action(obs)
            obs = env.step([select_friendly])[0]
            # distance = self.get_reward(obs.observation["screen"])
            self.reset()
            while True:
                total_frames += 1
                self._screen = obs.observation["screen"][5]
                s = np.expand_dims(obs.observation["screen"][5], 0)
                # plt.imshow(s[5])
                # plt.pause(0.00001)
                if max_frames and total_frames >= max_frames:
                    print("max frames reached")
                    return
                if obs.last():
                    print("total frames:", total_frames,
                          "Epsilon:", self._epsilon.value())
                    self._epsilon.increment()
                    break
                action = self.get_action(s)
                env_actions = self.get_env_action(action, obs)
                obs = env.step([env_actions])[0]

                r = obs.reward
                s1 = np.expand_dims(obs.observation["screen"][5], 0)
                done = r > 0
                if self._epsilon.isTraining:
                    transition = Transition(s, action, s1, r, done)
                    self._memory.push(transition)

                if (total_frames % self.train_q_per_step == 0
                        and total_frames > self.steps_before_training
                        and self._epsilon.isTraining):
                    self.train_q()

                if (total_frames % self.target_q_update_frequency == 0
                        and total_frames > self.steps_before_training
                        and self._epsilon.isTraining):
                    self._Qt = copy.deepcopy(self._Q)
                    self.show_chart()

                if (total_frames % 1000 == 0
                        and total_frames > self.steps_before_training
                        and self._epsilon.isTraining):
                    self.show_chart()

                if not self._epsilon.isTraining and total_frames % 3 == 0:
                    self.show_chart()
    except KeyboardInterrupt:
        pass
    finally:
        print("finished")
        elapsed_time = time.time() - start_time
        print("Took %.3f seconds for %s steps: %.3f fps" %
              (elapsed_time, total_frames, total_frames / elapsed_time))
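# The epsilon object used above exposes value(), increment(), and an
# isTraining flag, but its class is not shown in this section. The sketch
# below guesses at a linearly annealed schedule with those method names; the
# parameter names and defaults are assumptions, not the actual implementation.
class Epsilon:
    """Hypothetical annealed exploration schedule matching the calls above."""

    def __init__(self, start=1.0, end=0.1, update_increment=0.01):
        self._value = start
        self.end = end
        self.update_increment = update_increment
        self.isTraining = True

    def value(self):
        return max(self._value, self.end)

    def increment(self):
        # Called once per episode: anneal toward the final value.
        self._value = max(self._value - self.update_increment, self.end)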
def load_game_from_replay_memory():
    transitions = replay_memory.sample(batch_size)
    batch = Transition(*zip(*transitions))
    return batch
def test_append(self):
    for i in range(20):
        a = Transition([0, 1, 2, 3], 0, [4, 5, 6, 7], 0, True)
        self.memory.push(a)
    self.assertEqual(len(self.memory.memory), 10)