    exploit_returns += [np.mean(stat.episode_rewards[20000:25000])]
print('Q-learning average exploitative return:', np.mean(exploit_returns))

exploit_returns = []
for stat in stats_hq_learning:
    exploit_returns += [np.mean(stat.episode_rewards[20000:25000])]
print('Hierarchical Q-learning average exploitative return:', np.mean(exploit_returns))

#########################################################

plt.figure()
plotting.plot_rewards(stats_q_learning, c='g')
plotting.plot_rewards(stats_hq_learning, c='b')
'''
plotting.plot_rewards(stats_q_learning, c='r')
plotting.plot_rewards(stats_hq_learning, c='c')
'''
plt.legend(["Q-learning", "Hierarchical Q-learning"])
plt.xlabel("Episode")
plt.ylabel("Extrinsic Reward")
plt.title("Discrete Stochastic Decision Process")

#########################################################

plt.figure()
                    s_next, f, done, _ = self.env.step(action)
                    r = self.intrinsic_reward(s, action, s_next, goal)  # intrinsic reward for the current goal
                    stats.episode_rewards[i] += f
                    stats.episode_lengths[i] = t
                    stats.visitation_count[s_next, i] += 1
                    # Controller transition: states are (state, goal) pairs, reward is intrinsic.
                    D1 = [((s, goal), action, r, (s_next, goal), done)]
                    Q1 = self.QValueUpdate(Q1, D1)
                    F = F + f  # accumulate extrinsic reward collected while pursuing this goal
                    s = s_next
                    t += 1
                # Meta-controller transition: the goal plays the role of the action,
                # F is the extrinsic reward accumulated while pursuing it.
                D2 = [(s0, goal, F, s, done)]
                Q2 = self.QValueUpdate(Q2, D2)
                if not done:
                    goal = self.epsGreedy(s, self.meta_goals, epsilon_meta, Q2)
                    stats.target_count[goal, i] += 1
                # Anneal exploration down to 0.1, then switch it off for the last 20% of episodes.
                epsilon[goal] = max(epsilon[goal] - self.epsilon_anneal, 0.1) if i < self.num_episodes*0.8 else 0
                epsilon_meta = max(epsilon_meta - self.meta_epsilon_anneal, 0.1) if i < self.num_episodes*0.8 else 0

        return stats
        #plotting.plot_episode_stats(stats, smoothing_window=1000)


if __name__ == "__main__":
    agent = hierarchicalQLearningAgent(env=hMDP())
    stats = agent.learn()
    plotting.plot_rewards([stats], smoothing_window=1000)
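

# ---------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the learning loop above calls
# self.QValueUpdate and self.epsGreedy, whose definitions are not shown in this excerpt.
# The standalone helpers below are a minimal tabular version that is merely consistent
# with how those methods are called: transitions are (state, action, reward, next_state,
# done) tuples and Q is indexed as Q[state][action] (e.g. a collections.defaultdict of
# defaultdict(float)). The alpha/gamma defaults and this Q layout are assumptions, not
# the repository's actual implementation.

import random


def q_value_update(Q, D, alpha=0.1, gamma=0.95):
    """One-step tabular Q-learning update over a list of transitions."""
    for s, a, r, s_next, done in D:
        target = r if done else r + gamma * max(Q[s_next].values(), default=0.0)
        Q[s][a] += alpha * (target - Q[s][a])
    return Q


def eps_greedy(s, actions, epsilon, Q):
    """Random action with probability epsilon, otherwise greedy w.r.t. Q[s]."""
    if random.random() < epsilon:
        return random.choice(list(actions))
    return max(actions, key=lambda a: Q[s][a])

# The same update rule serves both levels: the controller passes (state, goal)-pair keys
# and intrinsic rewards (D1 above), while the meta-controller passes plain states with
# goals as actions and the accumulated extrinsic reward F (D2 above).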