Example #1
    def train(self,
              Q: Agent,
              task: Task,
              epsilon: Epsilon,
              alpha: LearningRate,
              episodes,
              cache_train=True,
              test_times=1):
        # reset the agent's value estimates and the exploration / learning-rate schedules
        Q.clear()
        epsilon.clear()
        alpha.clear()

        # per-episode history of the trial
        rewards_history = np.zeros(episodes, dtype=np.float32)
        steps_history = np.zeros(episodes, dtype=np.float32)
        episode_epsilon_history = np.zeros(episodes, dtype=np.float32)
        epsilon_history = []
        conseq_200 = 0
        self.episode = 0
        for e in range(episodes):
            # run one training episode
            steps, rewards, epsilons = self.run_episode(
                Q, task, epsilon, alpha)
            if cache_train:
                # fold the training rewards into a single discounted return
                returns = 0.0
                for r in rewards[::-1]:
                    returns = r + self.gamma * returns
            else:
                # otherwise average return and episode length over test_times evaluation runs
                returns, steps = 0.0, 0.0
                for _ in range(test_times):
                    returns_, steps_ = self.evaluate(Q, task)
                    returns += returns_ / test_times
                    steps += steps_ / test_times

            # record per-episode statistics
            rewards_history[e] = returns
            steps_history[e] = steps
            episode_epsilon_history[e] = np.mean(epsilons)
            epsilon_history.append(epsilons)
            # log progress every 10 episodes
            if e % 10 == 0:
                print('{} {} {}'.format(episode_epsilon_history[e], returns,
                                        steps))

            # advance the exploration and learning-rate schedules at the end of the episode
            epsilon.update_end_of_episode(self.episode)
            alpha.update_end_of_episode(self.episode)
            self.episode += 1

            # count consecutive episodes that reach roughly 200 steps;
            # the early-stopping block below is intentionally left disabled
            if steps >= 199.99:
                conseq_200 += 1
            else:
                conseq_200 = 0
            # if conseq_200 >= 4:
            #    rewards_history[e:] = rewards_history[e]
            #    steps_history[e:] = steps_history[e]
            #    episode_epsilon_history[e:] = episode_epsilon_history[e]
            #    break

        # return per-episode histories plus the concatenated epsilon values from every episode
        return steps_history, rewards_history, episode_epsilon_history, \
            np.concatenate(epsilon_history, axis=0)
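
Both examples summarize each episode by folding its reward sequence into a single discounted return, accumulating backwards from the last reward. A minimal standalone sketch of that backup is shown below; the reward list and the discount factor gamma are illustrative values, not taken from the code above.

import numpy as np

def discounted_return(rewards, gamma):
    # accumulate G = r_t + gamma * G from the final reward backwards
    G = 0.0
    for r in rewards[::-1]:
        G = r + gamma * G
    return G

# illustrative episode and discount factor (not from the examples above)
rewards = [1.0, 0.0, -1.0, 2.0]
gamma = 0.99
print(discounted_return(rewards, gamma))
# the same value via the closed form: sum of gamma**t * r_t over timesteps t
print(np.sum([gamma**t * r for t, r in enumerate(rewards)]))
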
Example #2
    def train(self, Q: Agent, task: Task, policy: Policy, episodes):
        """ Trains the specified agent on the specified task using the specified
        exploration policy using the current implementation. A specified number of episodes
        is generated for training.
        
        inputs:
            Q - an Agent object storing the Q-values
            task - a Task object representing the task the agent is learning
            policy - a Policy object representing the exploration policy used to 
            balance exploration and exploitation
            episodes - the number of episodes of training to perform
        outputs:
            - a one-dimensional numpy array containing the lengths of each episode - this
            can be used to check the learning progress of the agent
            - a one-dimensional numpy array containing the sum of the discounted 
            rewards from the environment obtained on each episode - this can be used to check
            the learning progress of the agent 
        """

        # initialization
        self.clear()
        Q.clear()
        policy.clear()

        # for storing history of trial
        rewards_history = np.zeros(episodes, dtype=float)
        steps_history = np.zeros(episodes, dtype=int)

        # run episodes
        for e in range(episodes):

            # run an episode of training
            steps, rewards = self.run_episode(Q, task, policy)

            # compute the value of the backup and update the history
            R = 0.0
            for reward in rewards[::-1]:
                R = reward + self.gamma * R
            rewards_history[e] = R
            steps_history[e] = steps

            # finish episode
            policy.finish_episode(e)
            Q.finish_episode(e)

        return steps_history, rewards_history
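
The docstring notes that the returned steps_history and rewards_history can be used to check the agent's learning progress. One possible way to inspect them is a simple moving average over episodes, sketched below; the window size and the placeholder histories are illustrative assumptions, not part of the original code.

import numpy as np

def moving_average(history, window=50):
    # smooth a per-episode history with a window-sized running mean
    kernel = np.ones(window) / window
    return np.convolve(history, kernel, mode='valid')

# placeholder data standing in for the arrays returned by train()
steps_history = np.random.randint(10, 200, size=500)
rewards_history = np.random.randn(500).cumsum()

print(moving_average(steps_history)[-5:])
print(moving_average(rewards_history)[-5:])
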