import numpy as np
import matplotlib
matplotlib.use('TkAgg')

from lib.envs.bandit import BanditEnv
from lib.simulation import Experiment
from shared.policy import UCB

evaluation_seed = 1239
num_actions = 10
trials = 100
distribution = "normal"

env = BanditEnv(num_actions, distribution, evaluation_seed)
agent = UCB(num_actions)
experiment = Experiment(env, agent)
experiment.run_bandit(trials)
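# `shared.policy.UCB` is imported above but its implementation is not shown in
# this section. The sketch below is a minimal UCB1 policy with the same
# constructor signature, included for reference only; the act/feedback method
# names, the round-robin warm-up, and the exploration constant c are
# assumptions, not necessarily what shared.policy.UCB actually does.
class UCB1Sketch(object):
    def __init__(self, num_actions, c=2.0):
        self.num_actions = num_actions
        self.c = c                              # exploration strength
        self.counts = np.zeros(num_actions)     # pulls per arm
        self.values = np.zeros(num_actions)     # running mean reward per arm
        self.t = 0                              # total pulls so far

    def act(self):
        self.t += 1
        # Play each arm once before applying the UCB formula.
        untried = np.flatnonzero(self.counts == 0)
        if len(untried) > 0:
            return int(untried[0])
        ucb = self.values + np.sqrt(self.c * np.log(self.t) / self.counts)
        return int(np.argmax(ucb))

    def feedback(self, action, reward):
        # Incremental update of the mean reward estimate for the chosen arm.
        self.counts[action] += 1
        self.values[action] += (reward - self.values[action]) / self.counts[action]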
"maze": SimpleRoomsEnv(), "grid": SimpleRoomsEnv() } return switcher.get(argument) env = get_env(env_string) if agent_string.startswith('q'): print("Running Q Learning on {} environment for {} epochs".format( env_string, num_iter)) agent = QLearningAgent(range(env.action_space.n), epsilon=epsilon, alpha=alpha, decay_every=decay) experiment = Experiment(env, agent) experiment.run_qlearning(num_iter, interactive) #print("Running Q Learning") elif agent_string.startswith('s'): print("Running SARSA on {} environment for {} epochs".format( env_string, num_iter)) agent = SarsaAgent(range(env.action_space.n), epsilon=epsilon, alpha=alpha, decay_every=decay) experiment = Experiment(env, agent) experiment.run_sarsa(num_iter, interactive) #print("Running SARSA") else: print("Invalid Agent argument")
# Tail of the SarsaAgent class: greedy tie-breaking in act(), then learn().
            max_value_indices.append(idx)
        # Break ties randomly among equally valued greedy actions.
        return np.random.choice(max_value_indices)

    def learn(self, state1, action1, reward, state2, action2):
        """
        SARSA Update:
        Q(s,a) <- Q(s,a) + alpha * (reward + gamma * Q(s',a') - Q(s,a))
        or
        Q(s,a) <- Q(s,a) + alpha * (td_target - Q(s,a))
        or
        Q(s,a) <- Q(s,a) + alpha * td_delta
        """
        # On-policy TD target: bootstrap from the action a' actually taken
        # in s', not from the greedy maximum over Q(s').
        self._Q_table[state1][action1] += self._alpha * (
            reward
            + self._gamma * self._Q_table[state2][action2]
            - self._Q_table[state1][action1]
        )


interactive = True
env = WindyGridworldEnv()
agent = SarsaAgent(range(env.action_space.n))
experiment = Experiment(env, agent)
experiment.run_sarsa(10, interactive)
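# The act() fragment above shows only the tail of its tie-breaking argmax. For
# reference, the full pattern as a self-contained helper (hypothetical, not
# part of the course code; assumes numpy imported as np, as at the top of this
# section):
def argmax_random_tiebreak(values):
    """Index of a maximal entry, chosen uniformly at random among ties."""
    values = np.asarray(values)
    max_value_indices = np.flatnonzero(values == values.max())
    return int(np.random.choice(max_value_indices))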
import numpy as np
import matplotlib
matplotlib.use('TkAgg')

from lib.envs.simple_rooms import SimpleRoomsEnv
from lib.simulation import Experiment
from shared.agent import RandomAgent

interactive = True
max_number_of_episodes = 5

env = SimpleRoomsEnv()
agent = RandomAgent(range(env.action_space.n))
experiment = Experiment(env, agent)
experiment.run_agent(max_number_of_episodes, interactive)
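# `shared.agent.RandomAgent` is not shown in this section. A minimal sketch of
# a uniform-random agent over a discrete action set; the act() method name
# mirrors the agents above, but the exact interface Experiment.run_agent
# expects is an assumption here.
class RandomAgentSketch(object):
    def __init__(self, actions):
        self.actions = list(actions)

    def act(self, state):
        # Ignore the state entirely and pick an action uniformly at random.
        return np.random.choice(self.actions)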
# Body of QLearningAgent.learn(...), completing TODO 3; the parameter and
# attribute names are assumed to match the SARSA agent above.
        ## TODO 3
        ## Implement the q-learning update here
        """
        Q-learning Update:
        Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max(Q(s',a')) - Q(s,a))
        or
        Q(s,a) <- Q(s,a) + alpha * (td_target - Q(s,a))
        or
        Q(s,a) <- Q(s,a) + alpha * td_delta
        """
        # Off-policy TD target: bootstrap from the best action available in s'.
        self._Q_table[state1][action1] += self._alpha * (
            reward
            + self._gamma * max(self._Q_table[state2])
            - self._Q_table[state1][action1]
        )


from lib.envs.cliff_walking import CliffWalkingEnv  # assumed path, mirroring lib.envs.simple_rooms

interactive = True
env = SimpleRoomsEnv()
agent = QLearningAgent(range(env.action_space.n))
experiment = Experiment(env, agent)
experiment.run_qlearning(10, interactive)

interactive = False
env = SimpleRoomsEnv()
agent = QLearningAgent(range(env.action_space.n))
experiment = Experiment(env, agent)
experiment.run_qlearning(50, interactive)

interactive = True
env = CliffWalkingEnv()
agent = QLearningAgent(range(env.action_space.n))
experiment = Experiment(env, agent)
experiment.run_qlearning(10, interactive)

interactive = False
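# The agents above select actions epsilon-greedily (epsilon is passed to their
# constructors in the dispatch script earlier). A minimal sketch of that rule,
# reusing the hypothetical argmax_random_tiebreak helper defined above; this is
# an assumption about the selection logic, not the course code verbatim.
def epsilon_greedy(q_row, epsilon):
    """With probability epsilon explore uniformly, otherwise act greedily."""
    if np.random.random() < epsilon:
        return int(np.random.randint(len(q_row)))
    return argmax_random_tiebreak(q_row)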