Example #1
    def _loop(self, is_train):
        # Internal loop shared by training and validation episodes.
        env = self.env if is_train else self.val_env
        done = False
        state_before = env.reset()
        loss_list = []
        reward_list = []
        while not done:
            # Use the agent's current exploration rate when training,
            # a fixed evaluation epsilon when validating.
            epsilon = self.agent.epsilon if is_train else self._val_epsilon
            action = self.agent.policy(state_before, epsilon)
            state_after, reward, done, _ = env.step(action)

            if is_train:
                self._step += 1
                # Store the transition in replay memory, update schedules,
                # and take one optimization step on the Q-network.
                self.memory(
                    Transition(state_before, action, reward, state_after,
                               done))
                self.agent.parameter_scheduler(self._step)
                loss_list.append(self._train_nn())

            state_before = state_after
            reward_list.append(reward)
            # Periodically sync the target network and run a validation episode.
            if is_train and self._step % self.target_update_freq == 0:
                self.agent.update_target_net()

            if is_train and self._step % self._val_freq == 0:
                self.val()

        return loss_list, reward_list, state_after
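In Example #1, self.agent.policy(state_before, epsilon) picks an action under an epsilon that differs between training and validation. The original policy implementation is not shown; the sketch below is only a plausible epsilon-greedy version. The function name, signature, and the use of PyTorch are my own assumptions, not taken from the source.

import random

import numpy as np
import torch


def epsilon_greedy_policy(q_net, state, epsilon, num_actions):
    # Hypothetical helper (not from the original code): with probability
    # epsilon take a uniformly random action, otherwise act greedily
    # with respect to the Q-network's estimates.
    if random.random() < epsilon:
        return random.randrange(num_actions)
    state_t = torch.as_tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        q_values = q_net(state_t)  # shape: (1, num_actions)
    return int(q_values.argmax(dim=1).item())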
Example #2
    def warm_up(self):
        # Populate the replay memory with transitions gathered under a
        # random policy before training starts.
        state_before = self.env.reset()
        self._warmed = True
        for _ in tqdm(range(self.replay_start)):
            action = self.env.action_space.sample()
            state_after, reward, done, _ = self.env.step(action)
            self.memory(Transition(state_before, action, reward, state_after, done))
            # Start a fresh episode when the current one ends.
            state_before = self.env.reset() if done else state_after
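Both examples record experience by calling self.memory(Transition(state_before, action, reward, state_after, done)), so the memory object is callable and Transition is a five-field record. Neither definition appears in the snippets; the sketch below is a minimal version under those assumptions. The field names and the ReplayMemory class are illustrative, not the original implementation.

import random
from collections import deque, namedtuple

# Field order matches the call sites in both examples.
Transition = namedtuple(
    "Transition", ("state_before", "action", "reward", "state_after", "done"))


class ReplayMemory:
    # Illustrative callable buffer: calling the instance stores a transition,
    # sample() draws a random minibatch for the training step.
    def __init__(self, capacity):
        self._buffer = deque(maxlen=capacity)

    def __call__(self, transition):
        self._buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(list(self._buffer), batch_size)

    def __len__(self):
        return len(self._buffer)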