def test_stick_20_policy_save(self):
    # Evaluate the stick-on-20 policy with Monte Carlo and persist the value estimates.
    game = Easy21()
    policy = Stick20ActionPolicy(game.action_space)
    for time_steps in [1_000]:
        mc = MonteCarloPolicyEvaluation(env=game, policy=policy)
        mc.learn(total_timesteps=time_steps)
        mc.save("stick20_%s" % time_steps)

def test_game_reset(self):
    game = Easy21()
    # Before reset both scores are zero.
    self.assertEqual(game.score_dealer, 0)
    self.assertEqual(game.score_player, 0)
    game.reset()
    # Reset deals an opening card to each side, so both scores become non-zero.
    self.assertNotEqual(game.score_dealer, 0)
    self.assertNotEqual(game.score_player, 0)

def test_game_step_stick(self):
    game = Easy21()
    game.reset()
    # Action 0 = stick: the episode terminates immediately.
    (dealer, player), reward, done, _ = game.step(action=0)
    self.assertEqual(done, True, msg="Done should be 'True'")
    self.assertIn(reward, [-1, 0, 1], msg="Rewards should be in [-1, 0, 1]")
    self.__assert_scores(dealer, player, reward)

def test_epsilon_greedy__save(self):
    # Train Sarsa(lambda) with an epsilon-greedy policy for each lambda in [0, 1]
    # and persist one model per setting.
    for _lambda in [.0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1]:
        print("Training with lambda: %s" % _lambda)
        game = Easy21()
        policy = EpsilonGreedyActionPolicy(game.action_space)
        model = Sarsa(env=game, policy=policy, _lambda=_lambda)
        model.learn(total_timesteps=1000)
        model.save("greedy_1000_%s" % _lambda)

def test_random_policy(self):
    game = Easy21()
    policy = RandomActionPolicy(game.action_space)
    for time_steps in [1_000, 10_000, 100_000]:
        mc = MonteCarloPolicyEvaluation(env=game, policy=policy)
        mc.learn(total_timesteps=time_steps)
        game_plots.plot(lambda d, p: mc.q_value(observation=(d, p), action=ACTION_STICK))
        game_plots.plot(lambda d, p: mc.q_value(observation=(d, p), action=ACTION_HIT))
        game_plots.plot(lambda d, p: mc.q_value_max(observation=(d, p)))
        game_plots.plot_image(lambda d, p: mc.q_max(observation=(d, p)))

def test_game_step_hit(self):
    game = Easy21()
    game.reset()
    dealer_before = game.score_dealer
    player_before = game.score_player
    # Action 1 = hit: the player draws a card while the dealer's score is unchanged.
    # Note: this assumes the first hit does not bust; under standard Easy21 rules a
    # red draw can push the player's sum below 1 and end the episode.
    (dealer, player), reward, done, _ = game.step(action=1)
    self.assertEqual(done, False)
    self.assertEqual(reward, 0)
    self.assertEqual(dealer_before, dealer)
    self.assertNotEqual(player_before, player)

def test_game(self):
    env = Easy21()
    env.reset()
    while True:
        env.render()
        (dealer, player), reward, done, _ = env.step(
            action=env.action_space.sample())
        if done:
            break
    env.render()
    self.assertIn(reward, [-1, 0, 1])
    self.__assert_scores(dealer, player, reward)

def test_epsilon_greedy_policy_1M_save(self):
    game = Easy21()
    policy = EpsilonGreedyActionPolicy(game.action_space)
    mc = MonteCarloPolicyEvaluation(env=game, policy=policy)
    mc.learn(total_timesteps=1_000_000)
    mc.save("greedy1M")
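
# The __assert_scores helper used above is referenced but not shown in this excerpt.
# The sketch below is only an assumption about what it could check, derived from the
# standard Easy21 rules (a score outside [1, 21] is a bust; a dealer bust wins for the
# player; otherwise the higher score wins and equal scores draw). It is not necessarily
# the original implementation.
def __assert_scores(self, dealer, player, reward):
    if player < 1 or player > 21:
        # Player busted: the episode must have ended with a loss.
        self.assertEqual(reward, -1)
    elif dealer < 1 or dealer > 21:
        # Dealer busted: the player wins.
        self.assertEqual(reward, 1)
    elif reward == 1:
        self.assertGreater(player, dealer)
    elif reward == -1:
        self.assertGreater(dealer, player)
    else:
        self.assertEqual(player, dealer)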