def _loop(self, is_train):
    # internal loop for both training and validation
    done = False
    state_before = self.env.reset() if is_train else self.val_env.reset()
    loss_list = []
    reward_list = []
    while not done:
        # use the agent's current epsilon during training, a fixed epsilon during validation
        epsilon = self.agent.epsilon if is_train else self._val_epsilon
        action = self.agent.policy(state_before, epsilon)
        state_after, reward, done, _ = (self.env.step(action) if is_train
                                        else self.val_env.step(action))
        if is_train:
            self._step += 1
            self.memory(Transition(state_before, action, reward, state_after, done))
            self.agent.parameter_scheduler(self._step)
            loss_list.append(self._train_nn())
        state_before = state_after
        reward_list.append(reward)
        # periodically sync the target network and run a validation pass (training only)
        if self._step % self.target_update_freq == 0 and is_train:
            self.agent.update_target_net()
        if self._step % self._val_freq == 0 and is_train:
            self.val()
    return loss_list, reward_list, state_after
def warm_up(self):
    # to populate replay memory with transitions from random actions
    state_before = self.env.reset()
    self._warmed = True
    for _ in tqdm(range(self.replay_start)):
        action = self.env.action_space.sample()
        state_after, reward, done, _ = self.env.step(action)
        self.memory(Transition(state_before, action, reward, state_after, done))
        state_before = self.env.reset() if done else state_after
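
# Both methods above push Transition tuples into self.memory, which they call like a
# function. A minimal sketch of what those helpers could look like is given below; the
# names Transition and ReplayMemory, the capacity and batch_size parameters, and the
# sample() method are illustrative assumptions, not part of the original code.
from collections import deque, namedtuple
import random

Transition = namedtuple(
    "Transition", ("state_before", "action", "reward", "state_after", "done"))

class ReplayMemory:
    # fixed-size FIFO buffer of transitions; calling the instance appends one transition
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def __call__(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        # uniform random minibatch, e.g. for the gradient step in _train_nn()
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)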