def train(self, Q: Agent, task: Task, epsilon: Epsilon, alpha: LearningRate,
          episodes, cache_train=True, test_times=1):
    # reset the agent and the exploration / learning-rate schedules
    Q.clear()
    epsilon.clear()
    alpha.clear()

    # per-episode history of the trial
    rewards_history = np.zeros(episodes, dtype=np.float32)
    steps_history = np.zeros(episodes, dtype=np.float32)
    episode_epsilon_history = np.zeros(episodes, dtype=np.float32)
    epsilon_history = []
    conseq_200 = 0
    self.episode = 0

    for e in range(episodes):

        # run one episode of training
        steps, rewards, epsilons = self.run_episode(Q, task, epsilon, alpha)

        if cache_train:
            # score the training episode itself: accumulate its discounted return
            returns = 0.0
            for r in rewards[::-1]:
                returns = r + self.gamma * returns
        else:
            # otherwise evaluate the learned policy, averaging over test_times runs
            returns, steps = 0.0, 0.0
            for _ in range(test_times):
                returns_, steps_ = self.evaluate(Q, task)
                returns += returns_ / test_times
                steps += steps_ / test_times

        # record progress for this episode
        rewards_history[e] = returns
        steps_history[e] = steps
        episode_epsilon_history[e] = np.mean(epsilons)
        epsilon_history.append(epsilons)
        if e % 10 == 0:
            print('{} {} {}'.format(episode_epsilon_history[e], returns, steps))

        # advance the epsilon and learning-rate schedules
        epsilon.update_end_of_episode(self.episode)
        alpha.update_end_of_episode(self.episode)
        self.episode += 1

        # track consecutive episodes that reach the 200-step cap
        if steps >= 199.99:
            conseq_200 += 1
        else:
            conseq_200 = 0
        # if conseq_200 >= 4:
        #     rewards_history[e:] = rewards_history[e]
        #     steps_history[e:] = steps_history[e]
        #     episode_epsilon_history[e:] = episode_epsilon_history[e]
        #     break

    return steps_history, rewards_history, episode_epsilon_history, \
        np.concatenate(epsilon_history, axis=0)
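# Illustrative sketch (not part of the trainer above): the reverse loop in the
# cache_train branch accumulates the Monte Carlo return
# G = r_0 + gamma * r_1 + gamma**2 * r_2 + ...  A standalone version:
def _discounted_return(rewards, gamma):
    """Return sum_t gamma**t * rewards[t] via reverse accumulation."""
    G = 0.0
    for r in reversed(rewards):
        G = r + gamma * G
    return G

# e.g. _discounted_return([1.0, 0.0, 2.0], 0.9) == 1.0 + 0.9**2 * 2.0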
def train(self, Q: Agent, task: Task, policy: Policy, episodes):
    """
    Trains the specified agent on the specified task, using the specified
    exploration policy, under the current learning implementation. A
    specified number of episodes is generated for training.

    inputs:
        Q - an Agent object storing the Q-values
        task - a Task object representing the task the agent is learning
        policy - a Policy object representing the exploration policy used
                 to balance exploration and exploitation
        episodes - the number of episodes of training to perform

    outputs:
        - a one-dimensional numpy array containing the length of each
          episode; this can be used to check the learning progress of
          the agent
        - a one-dimensional numpy array containing the sum of the
          discounted rewards from the environment obtained on each
          episode; this can be used to check the learning progress of
          the agent
    """
    # initialization
    self.clear()
    Q.clear()
    policy.clear()

    # for storing the history of the trial
    rewards_history = np.zeros(episodes, dtype=float)
    steps_history = np.zeros(episodes, dtype=int)

    # run episodes
    for e in range(episodes):

        # run an episode of training
        steps, rewards = self.run_episode(Q, task, policy)

        # compute the value of the backup and update the history
        R = 0.0
        for reward in rewards[::-1]:
            R = reward + self.gamma * R
        rewards_history[e] = R
        steps_history[e] = steps

        # finish the episode
        policy.finish_episode(e)
        Q.finish_episode(e)

    return steps_history, rewards_history
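# Hypothetical usage sketch: the concrete classes below (TabularAgent,
# CartPoleTask, EpsilonGreedyPolicy, Sarsa) are assumptions named only for
# illustration; substitute whatever Agent / Task / Policy implementations
# the project actually provides.
#
#     Q = TabularAgent(n_states, n_actions)
#     task = CartPoleTask()
#     policy = EpsilonGreedyPolicy(epsilon=0.1)
#     learner = Sarsa(gamma=0.99)
#     steps_hist, returns_hist = learner.train(Q, task, policy, episodes=500)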