Example #1
import unittest

# SARSAAgent is assumed importable from the module under test.

class TestSARSAAgent(unittest.TestCase):
    '''
    Tests the SARSA agent's action methods and backup function.
    '''

    def setUp(self):
        self.agent = SARSAAgent(legal_actions=(1, 2),
                                gamma=0.9, alpha=0.25, epsilon=0.9)

        self.agent.Q[0, 1] = 1.0
        self.agent.Q[0, 2] = 0.5
        self.agent.Q[1, 1] = -2.0
        self.agent.Q[1, 2] = -1.0

    def test_get_greedy_action(self):
        self.assertEqual(self.agent._get_greedy_action(0), 1)
        self.assertEqual(self.agent._get_greedy_action(1), 2)

    def test_get_random_action(self):
        self.assertIn(self.agent._get_random_action(),
                      self.agent.legal_actions)

    def test_td_error(self):
        self.assertTrue(abs(self.agent._td_error(0, 0, 1, 1.9, 2) - 1.0) < 1e-10)
        self.assertTrue(abs(self.agent._td_error(0, 0, 1, 1.9, -2) - 1.9) < 1e-10)
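
These tests pin down the interface they exercise: Q returns 0.0 for unseen state-action pairs, and _td_error(state, action, next_state, reward, next_action) computes the SARSA error r + gamma * Q(s', a') - Q(s, a). The following is a minimal sketch of an agent that would satisfy the tests above, not the project's actual implementation:

import random
from collections import defaultdict


class SARSAAgent:
    '''Minimal sketch of the interface the tests above assume.'''

    def __init__(self, legal_actions, gamma=0.9, alpha=0.25, epsilon=0.9):
        self.legal_actions = legal_actions
        self.gamma = gamma      # discount factor
        self.alpha = alpha      # learning rate
        self.epsilon = epsilon  # exploration rate
        self.Q = defaultdict(float)  # unseen (state, action) pairs default to 0.0

    def _get_greedy_action(self, state):
        # Pick the legal action with the highest Q-value for this state.
        return max(self.legal_actions, key=lambda a: self.Q[state, a])

    def _get_random_action(self):
        return random.choice(self.legal_actions)

    def _td_error(self, state, action, next_state, reward, next_action):
        # SARSA is on-policy: the target uses the action actually taken next.
        return (reward + self.gamma * self.Q[next_state, next_action]
                - self.Q[state, action])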
Example #2
def run_sarsa_vs_qlearning():
    winners = []
    board_length = 8
    action_space = (board_length, board_length, board_length, board_length)

    agent_one = QLearningAgent((board_length, board_length), action_space,
                               "qlearning", "up", 0.0, 250_000_000, 10_000_000)
    agent_two = SARSAAgent((board_length, board_length), action_space, "sarsa",
                           "down", 0.0, 25_000_000, 10_000_000)
    iterations = 10000
    for i in range(iterations):
        board = Board(board_length=board_length)
        game = Game(agent_one=agent_one, agent_two=agent_two, board=board)
        game.play(verbose=False)
        winners += [game.winner]
        agent_one.epsilon *= 0.9999
        agent_two.epsilon *= 0.9999
        if (i % 5000 == 0 and i > 0) or iterations - 1 == i:
            victories_player_two = 0
            victories_player_one = 0
            for winner in winners:
                if winner == "qlearning":
                    victories_player_one += 1
                if winner == "sarsa":
                    victories_player_two += 1

            logging.info("Player One: {}".format(str(victories_player_one)))
            logging.info("Player Two: {}".format(str(victories_player_two)))
            logging.info("Mean Rewards Agent One: {}".format(
                agent_one.moving_average_rewards[-1]))
            logging.info("Mean Rewards Agent Two: {}".format(
                agent_two.moving_average_rewards[-1]))
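
Both drivers anneal exploration with a multiplicative schedule, so epsilon after n games is epsilon_0 * decay**n. A small sketch (the helper name is mine, not the project's) shows what each schedule leaves at the end of its run:

def epsilon_after(epsilon_0, decay, n):
    # Multiplicative annealing: epsilon shrinks geometrically per game.
    return epsilon_0 * decay ** n

print(epsilon_after(1.0, 0.9999, 10_000))    # ~0.37 after the 10k games above
print(epsilon_after(1.0, 0.99999, 200_000))  # ~0.14 after the 200k games below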
Example #3
def run_a2c_vs_sarsa():
    winners = []
    board_length = 8
    action_space = (board_length, board_length, board_length, board_length)

    agent_one = A2C((board_length, board_length), action_space, "a3c", "up",
                    1.0, 2000, 100000)
    agent_two = SARSAAgent((board_length, board_length),
                           action_space,
                           "sarsa_two",
                           "down",
                           1.0,
                           2000,
                           100000,
                           save_path="../data/modeldata/sarsa_two/model.ckpt")
    iterations = 200000
    for i in range(iterations):
        board = Board(board_length=board_length)
        game = Game(agent_one=agent_one, agent_two=agent_two, board=board)
        game.play(verbose=False)
        winners += [game.winner]
        agent_one.epsilon *= 0.99999
        if (i % 5000 == 0 and i > 0) or (iterations - 1 == i):
            victories_player_two = 0
            victories_player_one = 0
            for winner in winners:
                if winner == "a3c":
                    victories_player_one += 1
                if winner == "Two":
                    victories_player_two += 1
            logging.info("Current epsilon: {}".format(agent_one.epsilon))
            logging.info("Player One: {}".format(str(victories_player_one)))
            logging.info("Player Two: {}".format(str(victories_player_two)))
            logging.info("Mean Rewards Agent One: {}".format(
                agent_one.moving_average_rewards[-1]))
            logging.info("Mean Rewards Agent Two: {}".format(
                agent_two.moving_average_rewards[-1]))
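
Both checkpoint blocks recount the entire winners list with an explicit loop; a collections.Counter over the same list produces identical tallies in one pass (a sketch, assuming the winner strings match the agent names used above):

from collections import Counter

tally = Counter(winners)
logging.info("Player One: {}".format(tally["a3c"]))
logging.info("Player Two: {}".format(tally["sarsa_two"]))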
Example #4
import argparse

import matplotlib.pyplot as plt

# CliffMDP, SARSAAgent, QLearningAgent, run_experiment, run_episode and
# plot_sarsa_vs_qlearning come from the project's own modules.


def compare_sarsa_qlearning(render):
    sarsa_rewards = run_experiment(SARSAAgent, render)
    qlearning_rewards = run_experiment(QLearningAgent, render)
    plt.interactive(False)
    plot_sarsa_vs_qlearning(sarsa_rewards, qlearning_rewards)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='''Executes RL experiments on the Cliff-World domain''')

    parser.add_argument('-r',
                        '--render',
                        action='store_true',
                        help="Toggle Tkinter rendering (off by default)",
                        default=False)
    parser.add_argument(
        '-k',
        '--keyboard',
        action='store_true',
        help="Toggle keyboard mode (runs interactive episodes)",
        default=False)

    args = parser.parse_args()

    if args.keyboard:
        mdp = CliffMDP(12, 4, render=args.render)
        agent = SARSAAgent(legal_actions=mdp.actions, gamma=mdp.gamma)
        run_episode(mdp, agent, kbd_ctl=args.keyboard)
    else:
        compare_sarsa_qlearning(args.render)
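
plot_sarsa_vs_qlearning is defined elsewhere in the project; a minimal sketch of what it plausibly does, assuming each argument is a per-episode reward sequence, could be:

import matplotlib.pyplot as plt


def plot_sarsa_vs_qlearning(sarsa_rewards, qlearning_rewards):
    # Hypothetical implementation: overlay per-episode rewards for both agents.
    plt.plot(sarsa_rewards, label="SARSA")
    plt.plot(qlearning_rewards, label="Q-learning")
    plt.xlabel("Episode")
    plt.ylabel("Total reward")
    plt.title("SARSA vs. Q-learning on Cliff-World")
    plt.legend()
    plt.show()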