import unittest

# QLearningAgent is assumed importable from the package under test.


class TestQLearningAgent(unittest.TestCase):
    """Tests the Q-learning agent's action methods and backup function."""

    def setUp(self):
        # Actions 1 and 2 are the ones the fixture's Q-table and the greedy
        # and TD-error assertions below actually exercise (with action 0
        # legal, the max over next-state values would be 0.0, not -1.0).
        self.agent = QLearningAgent(legal_actions=(1, 2), gamma=0.9,
                                    alpha=0.25, epsilon=0.9)
        self.agent.Q[0, 1] = 1.0
        self.agent.Q[0, 2] = 0.5
        self.agent.Q[1, 1] = -2.0
        self.agent.Q[1, 2] = -1.0

    def test_get_greedy_action(self):
        self.assertEqual(self.agent._get_greedy_action(0), 1)
        self.assertEqual(self.agent._get_greedy_action(1), 2)

    def test_get_random_action(self):
        self.assertIn(self.agent._get_random_action(), self.agent.legal_actions)

    def test_td_error(self):
        # delta = r + gamma * max_a Q(s', a) - Q(s, a), with Q(0, 0) = 0:
        # 1.9 + 0.9 * (-1.0) = 1.0 and -0.5 + 0.9 * (-1.0) = -1.4
        self.assertAlmostEqual(self.agent._td_error(0, 0, 1, 1.9), 1.0)
        self.assertAlmostEqual(self.agent._td_error(0, 0, 1, -0.5), -1.4)
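# A minimal sketch of an agent satisfying the tests above, assuming a
# dict-backed Q-table keyed by (state, action). It illustrates the tested
# interface; it is not the implementation under test.
import random
from collections import defaultdict


class MinimalQLearningAgent:
    """Tabular Q-learning agent matching the tested interface."""

    def __init__(self, legal_actions, gamma, alpha, epsilon):
        self.legal_actions = legal_actions
        self.gamma = gamma            # discount factor
        self.alpha = alpha            # learning rate
        self.epsilon = epsilon        # exploration rate
        self.Q = defaultdict(float)   # (state, action) -> value, default 0.0

    def _get_greedy_action(self, state):
        # Ties break toward the first maximal action, as max() does.
        return max(self.legal_actions, key=lambda a: self.Q[state, a])

    def _get_random_action(self):
        return random.choice(self.legal_actions)

    def _td_error(self, state, action, next_state, reward):
        # delta = r + gamma * max_a' Q(s', a') - Q(s, a)
        best_next = max(self.Q[next_state, a] for a in self.legal_actions)
        return reward + self.gamma * best_next - self.Q[state, action]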
import logging


def run_sarsa_vs_qlearning():
    winners = []
    board_length = 8
    action_space = (board_length, board_length, board_length, board_length)
    agent_one = QLearningAgent((board_length, board_length), action_space,
                               "qlearning", "up", 0.0, 250000000, 10000000)
    agent_two = SARSAAgent((board_length, board_length), action_space,
                           "sarsa", "down", 0.0, 25000000, 10000000)
    iterations = 10000
    for i in range(iterations):
        board = Board(board_length=board_length)
        game = Game(agent_one=agent_one, agent_two=agent_two, board=board)
        game.play(verbose=False)
        winners.append(game.winner)
        # Decay exploration for both agents after every game.
        agent_one.epsilon *= 0.9999
        agent_two.epsilon *= 0.9999
        if (i % 5000 == 0 and i > 0) or iterations - 1 == i:
            victories_player_one = 0
            victories_player_two = 0
            for winner in winners:
                if winner == "qlearning":
                    victories_player_one += 1
                if winner == "sarsa":
                    victories_player_two += 1
            logging.info("Player One: {}".format(victories_player_one))
            logging.info("Player Two: {}".format(victories_player_two))
            logging.info("Mean Rewards Agent One: {}".format(
                agent_one.moving_average_rewards[-1]))
            logging.info("Mean Rewards Agent Two: {}".format(
                agent_two.moving_average_rewards[-1]))
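# The tally above rescans the winners list with an explicit loop; a
# collections.Counter does the same in one pass (a sketch of an alternative,
# not the repo's API):
from collections import Counter

tally = Counter(winners)  # e.g. Counter({'qlearning': ..., 'sarsa': ...})
logging.info("Player One: {}".format(tally["qlearning"]))
logging.info("Player Two: {}".format(tally["sarsa"]))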
def __init__(self, num_agents, **kwargs):
    super().__init__(**kwargs)
    self.agents = [QLearningAgent(**kwargs) for _ in range(num_agents)]
    self.name = 'MultiAgent'
    # Point every agent at the first agent's table so all of them read from
    # and write to one shared Q-table.
    qTable = self.agents[0].qValues
    for agent in self.agents[1:]:
        agent.qValues = qTable
def run(algo="sarsa", num_episodes=500, min_seed=0, seed_range=20,
        kings_move_allowed=False, stochastic_wind=False,
        num_steps_lim=100000, info=""):
    time_steps = []
    episode_lengths = []
    episode_rewards = []
    banner = "=" * 87
    print(banner)
    print("Running {algo} agent on windy gridworld {info} [seeds = {s}-{e}]"
          .format(algo=algo, info=info, s=min_seed, e=min_seed + seed_range - 1))
    print(banner)
    for seed in range(min_seed, min_seed + seed_range):
        env = WindyGridworldEnv(kings_move_allowed, seed=seed,
                                stochastic_wind=stochastic_wind)
        if algo == "sarsa":
            agent = SarsaAgent(num_states=env.nS, actions=range(env.nA),
                               seed=seed)
        elif algo == "q-learning":
            agent = QLearningAgent(num_states=env.nS, actions=range(env.nA),
                                   seed=seed)
        elif algo == 'expected-sarsa':
            agent = ExpectedSarsaAgent(num_states=env.nS,
                                       actions=range(env.nA), seed=seed)
        else:
            raise ValueError("Unknown algorithm: {}".format(algo))
        experiment = Experiment(env, agent)
        expt_time_steps, expt_episode_lengths, expt_episode_rewards = \
            experiment.run(num_episodes, num_steps_lim, algo=algo)
        time_steps.append(expt_time_steps)
        episode_lengths.append(expt_episode_lengths)
        episode_rewards.append(expt_episode_rewards)
    # Average each statistic across seeds.
    time_steps = np.mean(np.array(time_steps), axis=0)
    episode_lengths = np.mean(np.array(episode_lengths), axis=0)
    episode_rewards = np.mean(np.array(episode_rewards), axis=0)
    print("Average Last Episode Length (across seeds): {}".format(
        episode_lengths[-1]))
    print("Average Last Episode Reward (across seeds): {}".format(
        episode_rewards[-1]))
    # Note the [-20:] slices: the original indexed a single episode with
    # [-20], which made np.mean a no-op.
    print("Average Episode Length (over last 20 episodes, across seeds): {}"
          .format(np.mean(episode_lengths[-20:])))
    print("Average Episode Reward (over last 20 episodes, across seeds): {}"
          .format(np.mean(episode_rewards[-20:])))
    return time_steps, episode_lengths, episode_rewards
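# For reference, the three algorithms compared above differ only in the TD
# target; a textbook-style sketch (not this repo's Experiment/agent code),
# with Q a 2-D array indexed by state:
import numpy as np


def td_target(Q, next_state, next_action, reward, gamma, epsilon, n_actions,
              algo):
    if algo == "sarsa":
        # on-policy: bootstrap from the action actually taken next
        return reward + gamma * Q[next_state, next_action]
    if algo == "q-learning":
        # off-policy: bootstrap from the greedy action in the next state
        return reward + gamma * np.max(Q[next_state])
    if algo == "expected-sarsa":
        # bootstrap from the expectation under the epsilon-greedy policy
        probs = np.full(n_actions, epsilon / n_actions)
        probs[np.argmax(Q[next_state])] += 1.0 - epsilon
        return reward + gamma * np.dot(probs, Q[next_state])
    raise ValueError("Unknown algorithm: {}".format(algo))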
def run(self):
    print('\tValue Iteration')
    vi_agent = ValueIterationAgent(self.env, self.gamma)
    print('\t\tAverage reward: ' + str(np.mean(vi_agent.scores)))
    print('\t\tConvergence step: ' + str(vi_agent.convergence))
    print('\t\tPolicy: ' + str(vi_agent.policy))

    print('\tPolicy Iteration')
    self.env.reset()
    pi_agent = PolicyIterationAgent(self.env, self.gamma)
    print('\t\tAverage reward: ' + str(np.mean(pi_agent.scores)))
    print('\t\tConvergence step: ' + str(pi_agent.convergence))
    print('\t\tPolicy: ' + str(pi_agent.policy))

    print('\tQ Learning')
    self.env.reset()
    ql_agent = QLearningAgent(self.env)
    q, stats = ql_agent.q_learning(self.env, 500)
    plotting.plot_episode_stats(stats, experiment_name=self.name)
from collections import deque

import matplotlib.pyplot as plt
import numpy as np


def fig_6_4():
    mdp = CliffGridworld()
    sarsa_sum_rewards = []
    qlearning_sum_rewards = []
    rewards_history = deque(maxlen=10)  # 10-episode moving-average window
    n_episodes = 500

    # run sarsa agent
    agent = SarsaAgent(mdp=mdp)
    for i in range(n_episodes):
        states, actions, rewards = agent.run_episode()
        rewards_history.append(rewards)
        sarsa_sum_rewards.append(np.mean(rewards_history))
    print_optimal_policy(mdp, agent)
    print_values(mdp, agent)
    rewards_history.clear()

    # run q learning
    agent = QLearningAgent(mdp=mdp)
    for i in range(n_episodes):
        states, actions, rewards = agent.run_episode()
        rewards_history.append(rewards)
        qlearning_sum_rewards.append(np.mean(rewards_history))
    print_optimal_policy(mdp, agent)
    print_values(mdp, agent)

    # plot results
    plt.plot(np.arange(n_episodes), sarsa_sum_rewards, label='Sarsa')
    plt.plot(np.arange(n_episodes), qlearning_sum_rewards, label='Q-learning')
    plt.ylim(-100, 0)
    plt.xlim(0, 500)
    plt.xlabel('Episodes')
    plt.ylabel('Sum of rewards during episode')
    plt.legend()
    plt.savefig('figures/ch06_fig_6_4.png')
    plt.close()
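# The deque(maxlen=10) above turns the plotted curve into a 10-episode moving
# average; a tiny standalone illustration of that smoothing:
from collections import deque

window = deque(maxlen=3)
for r in [0, -10, -20, -30]:
    window.append(r)  # the oldest entry is dropped once the window is full
    print(sum(window) / len(window))  # -> 0.0, -5.0, -10.0, -20.0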
def fig_6_7():
    mdp = MDP()
    epsilon = 0.1
    alpha = 0.1
    agents = [QLearningAgent(mdp=mdp, epsilon=epsilon, alpha=alpha),
              QQAgent(mdp=mdp, epsilon=epsilon, alpha=alpha)]
    n_runs = 10000
    n_episodes = 300
    actions_taken = np.zeros((len(agents), n_runs, n_episodes))
    rewards_received = np.zeros_like(actions_taken)
    for r in tqdm(range(n_runs)):
        # reset the agents
        for a in agents:
            a.reset()
        for e in range(n_episodes):
            for i, a in enumerate(agents):
                _, actions, rewards = a.run_episode()
                # only plotting the % of left actions from A (the first
                # action in the sequence)
                actions_taken[i, r, e] = actions[0]
                rewards_received[i, r, e] = rewards
    # assuming actions are coded +/-1, this maps the mean onto the fraction
    # of left actions
    actions_fraction = np.mean(actions_taken - 1, axis=1) / -2
    plt.plot(np.arange(n_episodes), actions_fraction[0], label='Q-learning')
    plt.plot(np.arange(n_episodes), actions_fraction[1],
             label='Double Q-learning')
    optimal = epsilon / len(mdp.get_possible_actions(mdp.start_state))
    plt.gca().axhline(optimal, linestyle='dashed', lw=0.5,
                      label=r'{:.0%} optimal at $\epsilon$'.format(optimal),
                      c='black')
    plt.xlim(0, n_episodes)
    plt.ylim(0, 1)
    plt.xlabel('Episodes')
    plt.ylabel('% left actions from A')
    plt.legend()
    plt.savefig('figures/ch06_fig_6_7.png')
    plt.close()
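# QQAgent presumably implements Double Q-learning; the textbook backup it is
# compared against looks like this (a sketch with dict-keyed tables, not the
# repo's QQAgent internals):
import random


def double_q_update(Q1, Q2, s, a, r, s_next, alpha, gamma, actions):
    if random.random() < 0.5:
        # select with Q1, evaluate with Q2 -- decoupling selection from
        # evaluation removes the maximization bias that drives plain
        # Q-learning left from state A
        a_star = max(actions, key=lambda x: Q1[s_next, x])
        Q1[s, a] += alpha * (r + gamma * Q2[s_next, a_star] - Q1[s, a])
    else:
        a_star = max(actions, key=lambda x: Q2[s_next, x])
        Q2[s, a] += alpha * (r + gamma * Q1[s_next, a_star] - Q2[s, a])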
def fig_6_6():
    # initialize
    mdp = CliffGridworld()
    agents = [
        SarsaAgent(mdp=mdp),
        ExpectedSarsaAgent(mdp=mdp),
        QLearningAgent(mdp=mdp)
    ]
    alphas = np.linspace(0.1, 1, 10)
    plt.figure(figsize=(10, 8))

    def run_experiment(mdp, agents, n_runs, n_episodes, alphas=alphas):
        avg_sum_rewards = np.zeros((len(agents), len(alphas)))
        for i, alpha in enumerate(alphas):
            # set the new alpha on each agent
            for a in agents:
                a.alpha = alpha
            for r in tqdm(range(n_runs)):
                # reset q-values on each run
                for a in agents:
                    a.reset()
                # reset the per-episode averages on each run
                avg_over_episodes = np.zeros(len(agents))
                for e in range(n_episodes):
                    for j, a in enumerate(agents):
                        _, _, rewards = a.run_episode()
                        # update the average over episodes online
                        avg_over_episodes[j] += 1 / (e + 1) * (
                            rewards - avg_over_episodes[j])
                # update the average over runs online
                avg_sum_rewards[:, i] += 1 / (r + 1) * (
                    avg_over_episodes - avg_sum_rewards[:, i])
        return avg_sum_rewards

    # run interim
    print('Running interim')
    avg_sum_rewards = run_experiment(mdp, agents, n_runs=500, n_episodes=100)

    # plot results
    plt.plot(alphas, avg_sum_rewards[0], label='Sarsa (interim)',
             linestyle='dotted', c='blue', lw=0.5, marker='v',
             markerfacecolor='none')
    plt.plot(alphas, avg_sum_rewards[1], label='Expected Sarsa (interim)',
             linestyle='dotted', c='red', lw=0.5, marker='x')
    plt.plot(alphas, avg_sum_rewards[2], label='Q-learning (interim)',
             linestyle='dotted', c='black', lw=0.5, marker='s',
             markerfacecolor='none')

    # run asymptotic
    print('Running asymptotic')
    avg_sum_rewards = run_experiment(mdp, agents, n_runs=10, n_episodes=1000)

    # plot results
    plt.plot(alphas, avg_sum_rewards[0], label='Sarsa (asymptotic)',
             c='blue', lw=0.5, marker='v', markerfacecolor='none')
    plt.plot(alphas, avg_sum_rewards[1], label='Expected Sarsa (asymptotic)',
             c='red', lw=0.5, marker='x')
    plt.plot(alphas, avg_sum_rewards[2], label='Q-learning (asymptotic)',
             c='black', lw=0.5, marker='s', markerfacecolor='none')

    # format plot
    plt.xlabel(r'$\alpha$')
    plt.xlim(0.1, 1)
    plt.xticks(np.linspace(0.1, 1, 10))
    plt.ylim(-150, 0)
    plt.ylabel('Sum of rewards per episode')
    plt.legend()
    plt.savefig('figures/ch06_fig_6_6.png')
    plt.close()
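# The 1 / (e + 1) and 1 / (r + 1) factors above implement the incremental
# mean avg_n = avg_{n-1} + (x_n - avg_{n-1}) / n; a quick sanity check:
import numpy as np

xs = np.random.rand(100)
avg = 0.0
for n, x in enumerate(xs, start=1):
    avg += (x - avg) / n
assert abs(avg - xs.mean()) < 1e-10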
import pandas as pd

# (head of aggregated_stats_build_row truncated in the source)
        'explored_states_avg': float(explored_states_cum) / float(iterations_count),
        'reward_cum_avg': float(reward_count) / float(iterations_count),
        'actions_avg': float(actions_count) / float(iterations_count),
    }
    stats.append(row)


for q_init_value in [0.0]:
    for alpha_rate in [0.5]:
        for epsilon_rate in [0.0]:
            for gamma_rate in [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30,
                               0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65,
                               0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.00]:
                print("Simulating for q_init_value: {}, alpha_rate: {}, "
                      "epsilon_rate: {}, gamma_rate: {}".format(
                          q_init_value, alpha_rate, epsilon_rate, gamma_rate))
                e = Environment()
                a = QLearningAgent(
                    e,
                    alpha_rate=alpha_rate,
                    epsilon_rate=epsilon_rate,
                    gamma_rate=gamma_rate,
                    q_init_value=q_init_value
                )
                e.set_primary_agent(a, enforce_deadline=True)
                s = Simulator(e, update_delay=0.0000001, display=False)
                s.run(n_trials=n_trials)
                stats_by_iteration = a.stats_by_simulation_get()
                aggregated_stats_build_row(q_init_value, alpha_rate,
                                           epsilon_rate, gamma_rate,
                                           stats_by_iteration)

df = pd.DataFrame(
    data=stats,
    columns=['q_init_value', 'alpha_rate', 'epsilon_rate', 'gamma_rate',
             'success_perc', 'traffic_violations_avg', 'explored_states_avg',
             'reward_cum_avg', 'actions_avg']
)
df.to_csv('qlearn_agent_tuned_grid_for_gamma.csv')
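# The four nested loops above sweep only gamma; itertools.product expresses
# the same grid in a single loop. A sketch over the same parameter lists,
# where run_one_config is a hypothetical helper standing in for the
# simulation body above:
from itertools import product

gammas = [round(0.05 * k, 2) for k in range(21)]  # 0.00, 0.05, ..., 1.00
for q_init_value, alpha_rate, epsilon_rate, gamma_rate in product(
        [0.0], [0.5], [0.0], gammas):
    run_one_config(q_init_value, alpha_rate, epsilon_rate, gamma_rate)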
import argparse

from dungeon_game import Dungeon
from agents import RandomAgent, AccountantAgent, QLearningAgent, DeepQLearningAgent

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--agent', type=str, default='RANDOM')
    parser.add_argument('--learning_rate', type=float, default=0.1)
    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--iterations', type=int, default=5000)
    parser.add_argument('--plot', action='store_true')
    FLAGS, unparsed = parser.parse_known_args()

    randomAgent = RandomAgent()
    accountantAgent = AccountantAgent()
    qLearningAgent = QLearningAgent(iterations=FLAGS.iterations)
    deepQLearningAgent = DeepQLearningAgent(iterations=FLAGS.iterations)
    agent_list = [randomAgent, accountantAgent, qLearningAgent,
                  deepQLearningAgent]
    rewards = [list() for _ in range(len(agent_list))]
    dungeon_list = [Dungeon() for _ in range(len(agent_list))]

    # Run each agent in its own dungeon for the same number of steps.
    for agent_number, agent in enumerate(agent_list):
        for step in range(FLAGS.iterations):
            old_state = dungeon_list[agent_number].state
            action = agent.get_next_action(old_state)
            new_state, reward = dungeon_list[agent_number].perform_action(action)
            agent.update(old_state, new_state, action, reward)
            rewards[agent_number].append(reward)
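# A plausible tabular backup behind the agent.update(...) call above,
# assuming a q_table dict of dicts; the repo's actual QLearningAgent
# internals are not shown here:
def update(self, old_state, new_state, action, reward):
    best_next = max(self.q_table[new_state].values())
    td_target = reward + self.discount * best_next
    self.q_table[old_state][action] += self.learning_rate * (
        td_target - self.q_table[old_state][action])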
import datetime as dt
import os
import pickle
import sys

# (body of run_x_games_and_pickle above this point truncated in the source;
# the excerpt picks up inside its try block)
        pickle.dump(p, open(file_name, "wb"))
    except KeyboardInterrupt:
        print('Interrupted')
        # Save each tracked player's agent before exiting.
        for pick in pickle_index:
            player = players[pick]
            player.last_state = None
            file_name = os.path.join(
                directory,
                "QLAGENT_GAMES_" + str(num_games) + get_time_stamp() + ".p")
            pickle.dump(player, open(file_name, "wb"))
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)


def get_time_stamp():
    time_stamp = dt.datetime.now()
    day = str(time_stamp.day)
    hour = str(time_stamp.hour)
    minute = str(time_stamp.minute)
    second = str(time_stamp.second)
    full_ft = '-'.join([day, hour, minute, second])
    return full_ft


if __name__ == "__main__":
    ql_agent = QLearningAgent("Learning Agent Demo")
    random_agent = RandomAgent("Random Agent")
    players = [ql_agent, random_agent]
    run_x_games_and_pickle(players, 10000, pickle_index=[ql_agent.index])
# (head of render_world truncated in the source)
    world[round(WORLD_DIM / 2) - 1, WORLD_DIM - 1] = '♚'  # in case it disappeared
    for i in range(WORLD_DIM):
        for j in range(WORLD_DIM):
            if world[i, j] in ['☠', '█', '♚']:
                continue
            # Draw the greedy action for every free cell.
            action = np.argmax(qtable[i, j])
            if action == 0:
                world[i, j] = '⬆'
            elif action == 1:
                world[i, j] = '⬇'
            elif action == 2:
                world[i, j] = '⬅'
            elif action == 3:
                world[i, j] = '➡'
    renderer = Renderer(world, cell_size=100)
    return renderer.render(episode)


if __name__ == '__main__':
    from agents import QLearningAgent, WORLD_DIM
    from CreateBilboWorld import *
    import os
    import numpy as np

    bilbo = QLearningAgent(PLAYER_CHAR)
    mondo = World(WORLD_DIM, bilbo=bilbo, obstacle=True)
    qtable = np.load("./models/qtable_" + str(WORLD_DIM) + ".npy")
    img = render_world(mondo.world, WORLD_DIM, qtable)
    img.show()
from simulator import Simulator
from environment import Environment
from agents import QLearningAgent

environment = Environment(debug_traces=True)
qlearn_agent_tuned = QLearningAgent(environment,
                                    alpha_rate=0.5,
                                    epsilon_rate=0.0,
                                    gamma_rate=0.5,
                                    q_init_value=0.0,
                                    debug_traces=True)
environment.set_primary_agent(qlearn_agent_tuned, enforce_deadline=True)
simulator = Simulator(environment, update_delay=0.00001, display=False,
                      debug_traces=True)
simulator.run(n_trials=100)

df = qlearn_agent_tuned.stats_by_simulation_get_as_df()
df.to_csv('stats_tuned_qlearn_agent.csv')
import sys
if "../" not in sys.path:
    sys.path.append("../")

import numpy as np

from lib.envs.cliff_walking import CliffWalkingEnv
from lib import plotting
from agents import QLearningAgent

env_shape = (4, 12)
start_position = (3, 0)
end_positions = [(3, 11)]
cliff = tuple((3, i + 1) for i in range(10))  # cells (3, 1) .. (3, 10)
env = CliffWalkingEnv(env_shape, start_position, end_positions, cliff)
n_actions = env.action_space.n

agent = QLearningAgent(alpha=0.5, epsilon=0.1, discount=0.99,
                       n_actions=n_actions)
agent.train(env, n_episodes=1000, t_max=10**3, verbose=True,
            verbose_per_episode=500)

plotting.draw_policy(env, agent)
plotting.plot_episode_stats(agent)
from agents import RandomAgent, QLearningAgent
from spades import run_x_games_and_pickle

if __name__ == "__main__":
    players_4 = [
        QLearningAgent("Learning Agent"),
        RandomAgent("Random1"),
        RandomAgent("Random2"),
        RandomAgent("Random3")
    ]
    run_x_games_and_pickle(players_4, 30000, pickle_index=["Learning Agent"])
from simulator import Simulator
from environment import Environment
from agents import QLearningAgent

environment = Environment(debug_traces=True)
qlearn_agent_tuned = QLearningAgent(
    environment,
    alpha_rate=0.5,
    epsilon_rate=0.0,
    gamma_rate=0.5,
    q_init_value=0.0,
    debug_traces=True
)
environment.set_primary_agent(qlearn_agent_tuned, enforce_deadline=True)

# Watch the first few trials, run the bulk headless, then watch the last few.
simulator = Simulator(environment, update_delay=0.5, display=True,
                      debug_traces=True)
simulator.run(n_trials=5)
simulator = Simulator(environment, update_delay=0.0000001, display=False,
                      debug_traces=True)
simulator.run(n_trials=90)
simulator = Simulator(environment, update_delay=1.00, display=True,
                      debug_traces=True)
simulator.run(n_trials=5)

# df = qlearn_agent_tuned.stats_by_simulation_get_as_df()
# df.to_csv('stats_tuned_qlearn_agent.csv')
import numpy as np
import gym

import plotting
from agents import ValueIterationAgent, PolicyIterationAgent, QLearningAgent

gamma = 0.1

# Value Iteration #######################################################
env = gym.make('FrozenLake-v0')
agent = ValueIterationAgent(env, gamma)
print('Average reward: ' + str(np.mean(agent.scores)))

# Policy Iteration ######################################################
env = gym.make('FrozenLake-v0')
agent = PolicyIterationAgent(env, gamma)
print('Average reward: ' + str(np.mean(agent.scores)))

# Q Learning ############################################################
env = gym.make('FrozenLake-v0')
agent = QLearningAgent(env)
Q, stats = agent.q_learning(env, 500)
plotting.plot_episode_stats(stats)
possible_moves = {'up': 0, 'down': 1, 'left': 2, 'right': 3}
inverse_possible_moves = {0: 'up', 1: 'down', 2: 'left', 3: 'right'}

file_name = "./models/qtable_" + str(WORLD_DIM)
# Load the saved Q-table if one exists; otherwise start from zeros.
q_table = np.array([[[0.0 for moves in possible_moves]
                     for x in range(WORLD_DIM)]
                    for y in range(WORLD_DIM)]) \
    if not os.path.isfile(file_name + ".npy") \
    else np.load(file_name + ".npy")

fig = plt.figure(figsize=(20, 20))
bilbo = QLearningAgent(PLAYER_CHAR)
mondo = World(WORLD_DIM, bilbo=bilbo, obstacle=True)
game_ended = False
epoch = 0
anim = []
rewards = 0
env = mondo.create_env(d)  # `d` is defined outside this excerpt
anim.append((plt.pcolormesh(env, cmap='CMRmap'), ))
while not game_ended and epoch < MAX_EPOCH:
    epoch += 1
    # epsilon = 0: act greedily with respect to the learned Q-table
    action = bilbo.get_action(0, q_table, possible_moves)
    bilbo.move(inverse_possible_moves[action])()
    game_ended = bilbo.game_ended()
    reward = bilbo.reward()
def fullTest(self):
    self.game.addAgent(ExpectimaxAgent(BlackjackPlayer.id))
    self.result_table[1] = 0
    self.game.addAgent(QLearningAgent(BlackjackPlayer.id))
    self.result_table[2] = 0
    self.doTests()
from simulator import Simulator
from environment import Environment
from agents import QLearningAgent

environment = Environment(debug_traces=False)
qlearn_agent = QLearningAgent(environment)
environment.set_primary_agent(qlearn_agent, enforce_deadline=True)
simulator = Simulator(environment, update_delay=0.00001, display=False)
simulator.run(n_trials=100)

df = qlearn_agent.stats_by_simulation_get_as_df()
df.to_csv('main_q-agent_first_stats.csv')
def __init__(self, agent=None):
    super().__init__()
    # Avoid a QLearningAgent(...) default argument: Python evaluates defaults
    # once, at definition time, so every instance would share one agent.
    self.learning_agent = agent if agent is not None else QLearningAgent(id=1)
    self.addPlayers()
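# A demonstration of why the original `agent=QLearningAgent(id=1)` signature
# was risky (illustrative class name, not the repo's):
class SharedDefaultGame:
    def __init__(self, agent=QLearningAgent(id=1)):  # built once, at def time
        self.learning_agent = agent

a, b = SharedDefaultGame(), SharedDefaultGame()
print(a.learning_agent is b.learning_agent)  # True: both games share one agent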
def setUp(self) -> None:
    self.test_players = [QLearningAgent(1), RandomAgent(2)]
    self.game = spades.Spades(self.test_players)
# (the excerpt picks up mid-expression: the tail of the conditional q_table
# load, as in the rendering script above)
    else np.load(file_name + ".npy")

fig = plt.figure(figsize=(20, 20))
epochs = []
rewards = []
epsilons = []
tot_wins = []
tot_loss = []
policy = []
win = 0
loss = 0

for ep in range(TOT_EPISODES):
    # recreate the environment
    bilbo = QLearningAgent(PLAYER_CHAR)
    mondo = World(WORLD_DIM, bilbo=bilbo, obstacle=True)
    np.random.seed()
    game_ended = False
    epoch = 0
    tot_reward = 0
    #if ep % 10 == 0:
        #a = plt.imshow(render_world(mondo.world,WORLD_DIM,q_table,ep), animated=True)
        #policy.append((a,))
    while not game_ended and epoch < MAX_EPOCH:
        # the nearer it gets to the dragon, the more random the movement:
        # pass the fear-scaled epsilon to the action selection
        epoch += 1
        epsilon_fear = bilbo.fear(epsilon)
        action = bilbo.get_action(epsilon_fear, q_table, possible_moves)
        current_state = bilbo.get_current_state()
        #treasure_gone = bilbo.treasure_gone()
def testQLearn(self):
    self.game.addAgent(QLearningAgent(BlackjackPlayer.id))
    self.result_table[1] = 0
    self.doTests()