Example #1
def __init__(self, env=hMDP(), num_episodes=100000,
             gamma=0.9, alpha=0.6, batch_size=1, epsilon_anneal=1/50000):
    # Note: default arguments are evaluated once at definition time, so the
    # default hMDP() instance is shared by every agent built without an
    # explicit env.
    self.env = env
    self.num_episodes = num_episodes
    self.gamma = gamma
    self.alpha = alpha
    self.batch_size = batch_size
    self.epsilon_anneal = epsilon_anneal
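
Examples 4 and 5 call `self.QValueUpdate(Q, D)` without showing its body. A minimal sketch, assuming a tabular `Q` indexable as `Q[s][a]` (e.g., a dict of arrays, or a NumPy array) and the standard one-step Q-learning update driven by the `gamma` and `alpha` stored above:

def QValueUpdate(self, Q, D):
    # Apply a one-step tabular Q-learning update for each transition
    # (s, a, r, s_next, done) in the batch D.
    for s, a, r, s_next, done in D:
        target = r if done else r + self.gamma * max(Q[s_next])
        Q[s][a] += self.alpha * (target - Q[s][a])
    return Q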
Example #2
def __init__(self, env=hMDP(), meta_goals=[0, 1, 2, 3, 4, 5], num_episodes=20000,
             gamma=0.9, batch_size=32, epsilon_anneal=1/2000,
             meta_epsilon_anneal=1/12000):
    # Note: the mutable defaults (env, meta_goals) are created once at
    # definition time and shared by every instance that relies on them.
    self.env = env
    self.meta_goals = meta_goals
    self.num_episodes = num_episodes
    self.gamma = gamma
    self.batch_size = batch_size
    self.epsilon_anneal = epsilon_anneal
    self.meta_epsilon_anneal = meta_epsilon_anneal
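
Example 4 anneals a per-goal `epsilon` dict plus a separate `epsilon_meta`, so the hierarchical agent's learn() presumably starts from one exploration rate per meta-goal. A plausible (hypothetical) initialization, assuming exploration starts fully random:

# Hypothetical setup at the top of learn():
epsilon = {goal: 1.0 for goal in self.meta_goals}  # one rate per meta-goal
epsilon_meta = 1.0                                 # meta-controller rate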
Example #3
#from agents.hDQN import hDQNAgent
# Assumed module paths for the two agents used below (their imports are not
# shown in the original snippet), mirroring the hDQN import style above:
from agents.QLearning import QLearningAgent
from agents.hierarchicalQLearning import hierarchicalQLearningAgent

from envs.hmdp import StochastichMDPEnv as hMDP
from envs.mdp import StochasticMDPEnv as MDP

import utils.plotting as plotting
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

num_trials = 20

stats_q_learning = []
for i in range(num_trials):
    q_agent = QLearningAgent(env=hMDP(), num_episodes=25000)
    episode_stats = q_agent.learn()
    stats_q_learning.append(episode_stats)

stats_hq_learning = []
for i in range(num_trials):
    hq_agent = hierarchicalQLearningAgent(env=hMDP(), num_episodes=25000)
    episode_stats = hq_agent.learn()
    stats_hq_learning.append(episode_stats)
# The plain-DQN trials are disabled below, matching the commented-out
# hDQNAgent import above.
'''
stats_dqn = []
for i in range(num_trials):
    dqn_agent = DQNAgent(env=hMDP())
    episode_stats = dqn_agent.learn()
    stats_dqn.append(episode_stats)
'''
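
The snippet is cut off before the collected stats are used, but the pandas/NumPy/matplotlib imports above suggest an aggregation step follows. A minimal sketch, assuming each stats object exposes an `episode_rewards` array as in Examples 4 and 5:

# Hypothetical aggregation: average the reward curves over the trials,
# smooth them, and plot (pd, np, plt come from the imports above).
rewards = np.stack([s.episode_rewards for s in stats_q_learning])
smoothed = pd.Series(rewards.mean(axis=0)).rolling(1000, min_periods=1).mean()
plt.plot(smoothed, label='Q-learning, mean of {} trials'.format(num_trials))
plt.xlabel('Episode')
plt.ylabel('Smoothed reward')
plt.legend()
plt.show()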
Example #4
                    action = self.epsGreedy((s, goal), A, epsilon[goal], Q1)
                    s_next, f, done, _ = self.env.step(action)
                    r = self.intrinsic_reward(s, action, s_next, goal)
                    stats.episode_rewards[i] += f
                    stats.episode_lengths[i] = t
                    stats.visitation_count[s_next, i] += 1

                    D1 = [((s, goal), action, r, (s_next, goal), done)]
                    Q1 = self.QValueUpdate(Q1, D1)
                    F = F + f
                    s = s_next
                    t += 1
                D2 = [(s0, goal, F, s, done)]
                Q2 = self.QValueUpdate(Q2, D2)
                if not done:
                    goal = self.epsGreedy(s, self.meta_goals, epsilon_meta, Q2)
                    stats.target_count[goal, i] += 1
                    epsilon[goal] = max(epsilon[goal] - self.epsilon_anneal,
                                        0.1) if i < self.num_episodes * 0.8 else 0

            epsilon_meta = max(epsilon_meta - self.meta_epsilon_anneal,
                               0.1) if i < self.num_episodes * 0.8 else 0


        return stats
        #plotting.plot_episode_stats(stats, smoothing_window=1000)



if __name__ == "__main__":
    agent = hierarchicalQLearningAgent(env=hMDP())
    stats = agent.learn()
    plotting.plot_rewards([stats], smoothing_window=1000)
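
Example 4 also depends on `self.intrinsic_reward(...)`, which is not shown. In the h-DQN formulation this is conventionally a binary critic signal for the controller; a sketch under that assumption:

def intrinsic_reward(self, s, action, s_next, goal):
    # Assumed binary critic: reward the controller only when the
    # transition actually reaches the currently active goal state.
    return 1.0 if s_next == goal else 0.0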
Example #5
        for i in range(self.num_episodes):
            if i % 1000 == 0:
                print('Episode', i, 'epsilon:', epsilon)
            s = self.env.reset()
            done = False
            t = 0
            while not done:
                action = self.epsGreedy(s, A, epsilon, Q)
                s_next, f, done, _ = self.env.step(action)
                stats.episode_rewards[i] += f
                stats.episode_lengths[i] = t
                stats.visitation_count[s_next, i] += 1

                D = [(s, action, f, s_next, done)]
                Q = self.QValueUpdate(Q, D)
                s = s_next
                t += 1
            epsilon = max(epsilon - self.epsilon_anneal,
                          0.1) if i < self.num_episodes * 0.8 else 0

        return stats
        #plotting.plot_episode_stats(stats, smoothing_window=1000)


if __name__ == "__main__":
    agent = QLearningAgent(env=hMDP())
    stats = agent.learn()
    plotting.plot_rewards([stats], smoothing_window=1000)
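
Finally, both agents rely on `self.epsGreedy(s, A, epsilon, Q)`, whose body is not shown. A minimal sketch, assuming `A` is a list of discrete actions (or meta-goals) and `Q[s]` is indexable by action:

import numpy as np

def epsGreedy(self, s, A, epsilon, Q):
    # With probability epsilon take a uniformly random action,
    # otherwise act greedily with respect to Q.
    if np.random.random() < epsilon:
        return np.random.choice(A)
    return max(A, key=lambda a: Q[s][a])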