class TestSARSAAgent(unittest.TestCase):
    '''
    Tests the SARSA agent's action methods and backup function.
    '''

    def setUp(self):
        self.agent = SARSAAgent(legal_actions=(1, 2), gamma=0.9,
                                alpha=0.25, epsilon=0.9)
        self.agent.Q[0, 1] = 1.0
        self.agent.Q[0, 2] = 0.5
        self.agent.Q[1, 1] = -2.0
        self.agent.Q[1, 2] = -1.0

    def test_get_greedy_action(self):
        self.assertEqual(self.agent._get_greedy_action(0), 1)
        self.assertEqual(self.agent._get_greedy_action(1), 2)

    def test_get_random_action(self):
        self.assertIn(self.agent._get_random_action(), self.agent.legal_actions)

    def test_td_error(self):
        self.assertAlmostEqual(self.agent._td_error(0, 0, 1, 1.9, 2), 1.0)
        self.assertAlmostEqual(self.agent._td_error(0, 0, 1, 1.9, -2), 1.9)
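# NOTE: The expected values above assume the standard SARSA TD error,
#     delta = r + gamma * Q(s', a') - Q(s, a),
# with Q defaulting to 0.0 for unseen (state, action) pairs and _td_error taking
# arguments in the order (state, action, next_state, reward, next_action) -- an
# assumption inferred from the fixtures, not a confirmed signature. Under it,
# the first assertion works out to 1.9 + 0.9 * (-1.0) - 0.0 = 1.0, and the
# second to 1.9 + 0.9 * 0.0 - 0.0 = 1.9, since Q[1, -2] was never set.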
def run_sarsa_vs_qlearning():
    winners = []
    board_length = 8
    action_space = (board_length, board_length, board_length, board_length)
    agent_one = QLearningAgent((board_length, board_length), action_space,
                               "qlearning", "up", 0.0, 250000000, 10000000)
    agent_two = SARSAAgent((board_length, board_length), action_space,
                           "sarsa", "down", 0.0, 25000000, 10000000)
    iterations = 10000
    for i in range(iterations):
        board = Board(board_length=board_length)
        game = Game(agent_one=agent_one, agent_two=agent_two, board=board)
        game.play(verbose=False)
        winners += [game.winner]
        agent_one.epsilon *= 0.9999
        agent_two.epsilon *= 0.9999
        if (i % 5000 == 0 and i > 0) or iterations - 1 == i:
            victories_player_two = 0
            victories_player_one = 0
            for winner in winners:
                if winner == "qlearning":
                    victories_player_one += 1
                if winner == "sarsa":
                    victories_player_two += 1
            logging.info("Player One: {}".format(victories_player_one))
            logging.info("Player Two: {}".format(victories_player_two))
            logging.info("Mean Rewards Agent One: {}".format(
                agent_one.moving_average_rewards[-1]))
            logging.info("Mean Rewards Agent Two: {}".format(
                agent_two.moving_average_rewards[-1]))
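# The checkpoint block above re-scans the full winners list with explicit counters.
# A functionally equivalent, more compact tally could use collections.Counter --
# shown here only as a sketch, not as the implementation used elsewhere:
#
#     from collections import Counter
#     tally = Counter(winners)
#     logging.info("Player One: {}".format(tally["qlearning"]))
#     logging.info("Player Two: {}".format(tally["sarsa"]))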
def run_a2c_vs_sarsa():
    winners = []
    board_length = 8
    action_space = (board_length, board_length, board_length, board_length)
    agent_one = A2C((board_length, board_length), action_space,
                    "a3c", "up", 1.0, 2000, 100000)
    agent_two = SARSAAgent((board_length, board_length), action_space,
                           "sarsa_two", "down", 1.0, 2000, 100000,
                           save_path="../data/modeldata/sarsa_two/model.ckpt")
    iterations = 200000
    for i in range(iterations):
        board = Board(board_length=board_length)
        game = Game(agent_one=agent_one, agent_two=agent_two, board=board)
        game.play(verbose=False)
        winners += [game.winner]
        agent_one.epsilon *= 0.99999
        if (i % 5000 == 0 and i > 0) or (iterations - 1 == i):
            victories_player_two = 0
            victories_player_one = 0
            for winner in winners:
                if winner == "a3c":
                    victories_player_one += 1
                if winner == "sarsa_two":
                    victories_player_two += 1
            logging.info("Current epsilon: {}".format(agent_one.epsilon))
            logging.info("Player One: {}".format(victories_player_one))
            logging.info("Player Two: {}".format(victories_player_two))
            logging.info("Mean Rewards Agent One: {}".format(
                agent_one.moving_average_rewards[-1]))
            logging.info("Mean Rewards Agent Two: {}".format(
                agent_two.moving_average_rewards[-1]))
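# With the multiplicative decay above, agent_one's exploration rate after i games
# is epsilon_0 * 0.99999 ** i. Assuming the positional 1.0 passed to A2C is the
# initial epsilon (an assumption -- the constructor signature is not shown here),
# it drops to roughly 0.135 by the final iteration:
#
#     >>> 1.0 * 0.99999 ** 200000
#     0.1353...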
def compare_sarsa_qlearning(render):
    sarsa_rewards = run_experiment(SARSAAgent, render)
    qlearning_rewards = run_experiment(QLearningAgent, render)
    plt.interactive(False)
    plot_sarsa_vs_qlearning(sarsa_rewards, qlearning_rewards)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='''Executes RL experiments on the Cliff-World domain''')
    parser.add_argument('-r', '--render', action='store_true',
                        help="Toggle Tkinter rendering (off by default)",
                        default=False)
    parser.add_argument('-k', '--keyboard', action='store_true',
                        help="Toggle keyboard mode (runs interactive episodes)",
                        default=False)
    args = parser.parse_args()
    if args.keyboard:
        mdp = CliffMDP(12, 4, render=args.render)
        agent = SARSAAgent(legal_actions=mdp.actions, gamma=mdp.gamma)
        run_episode(mdp, agent, kbd_ctl=args.keyboard)
    else:
        compare_sarsa_qlearning(args.render)
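# Example invocations (the actual script filename is not shown here; "cliff.py"
# below is a placeholder):
#
#     python cliff.py              # compare SARSA vs. Q-learning, no rendering
#     python cliff.py --render     # same comparison with Tkinter rendering
#     python cliff.py --keyboard   # drive a single episode interactively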