def test_policy_update(self):
    """Running episodes with AveragingMC should shift the greedy policy's preferred first action."""
    np.random.seed(643674)
    initial_state = np.array([[0, 0, 0], [1, 0, 0], [0, 0, 0]], dtype=int)
    initial_obs = np.array([[2, 0, 0], [1, 0, 0], [0, 0, 0]], dtype=int)
    env = MockEnv(3)
    model = TableModel(env)
    greedy_policy = GreedyPolicy(model)
    policy = MockPolicy([EAST, EAST, SOUTH, WEST, WEST, CLEAN])
    mc = AveragingMC(env, model, policy)

    # First scripted episode: the greedy policy should now prefer EAST from the start state.
    mc.run_episode()
    self.assertEqual(mc.metrics.first_time_visited, 6)
    self.assertEqual(mc.metrics.fifth_time_visited, 0)
    self.assertEqual(mc.metrics.episode_reward, 12)
    self.assertEqual(mc.metrics.max_action_value_delta, 0)
    self.assertEqual(greedy_policy.choose_action(initial_obs), EAST)

    # Second, higher-reward episode: the greedy policy should switch to SOUTH.
    mock_policy = MockPolicy([SOUTH, CLEAN])
    mc.policy = mock_policy
    mc.run_episode()
    self.assertEqual(mc.metrics.first_time_visited, 7)
    self.assertEqual(mc.metrics.fifth_time_visited, 0)
    self.assertEqual(mc.metrics.episode_reward, 16)
    self.assertEqual(mc.metrics.max_action_value_delta, 2.0)
    self.assertEqual(greedy_policy.choose_action(initial_obs), SOUTH)
def __init__(self):
    super().__init__()
    self.env = CleanBotEnv(4)
    self.model = TableModel(self.env)
    self.training_policy = EpsilonGreedyPolicy(self.model, 0.1)
    self.testing_policy = GreedyPolicy(self.model)
    self.method = Sarsa(self.env, self.model, self.training_policy)
    self.training_policy.exploration = 0.1
    self.env.max_steps = 32
    self.method.alpha = 0.01
def __init__(self):
    super().__init__()
    self.env = CleanBotEnv(4)
    self.model = KerasModel(self.env, model=conv1_model(self.env), batch_size=64)
    self.training_policy = EpsilonGreedyPolicy(self.model, 0.1)
    self.testing_policy = GreedyPolicy(self.model)
    self.method = Sarsa(self.env, self.model, self.training_policy)
    self.training_policy.exploration = 0.1
    self.env.max_steps = 32
    self.method.alpha = 0.01
    self.model.epochs = 60
def __init__(self, epochs, alpha, batch_size):
    super().__init__()
    self.env = CleanBotEnv(4)
    self.model = KerasModel(self.env, model=conv1_model(self.env), batch_size=batch_size)
    self.training_policy = EpsilonGreedyPolicy(self.model, 0.1)
    self.testing_policy = GreedyPolicy(self.model)
    self.method = AlphaMC(self.env, self.model, self.training_policy)
    self.training_policy.exploration = 0.1
    self.env.max_steps = 32
    self.method.alpha = alpha
    self.model.epochs = epochs
    self.batch_size = batch_size
    # Build the run name only after the hyperparameters above are assigned,
    # so the formatted fields reflect the constructor arguments.
    self.name = f"{type(self).__name__}-{self.batch_size:03}-{self.model.epochs:03}-{self.method.alpha:.2f}"
def test_policy_update(self):
    """AlphaMC updates action values by a constant step size, so the greedy choice changes only after repeated episodes."""
    np.random.seed(643674)
    initial_state = np.array([[0, 0, 0], [1, 0, 0], [0, 0, 0]], dtype=int)
    initial_obs = np.array([[2, 0, 0], [1, 0, 0], [0, 0, 0]], dtype=int)
    env = MockEnv(3)
    model = TableModel(env)
    greedy_policy = GreedyPolicy(model)
    policy = MockPolicy([EAST, EAST, SOUTH, WEST, WEST, CLEAN])
    mc = AlphaMC(env, model, policy)
    mc.alpha = 0.005

    # First scripted EAST-heavy episode.
    mc.run_episode()
    self.assertEqual(mc.metrics.episode_reward, 12)
    self.assertAlmostEqual(mc.metrics.max_action_value_delta, 0.06)
    self.assertEqual(greedy_policy.choose_action(initial_obs), EAST)

    # Repeat the same episode with a fresh AlphaMC; the delta shrinks as values converge.
    policy = MockPolicy([EAST, EAST, SOUTH, WEST, WEST, CLEAN])
    mc = AlphaMC(env, model, policy)
    mc.run_episode()
    self.assertEqual(mc.metrics.episode_reward, 12)
    self.assertAlmostEqual(mc.metrics.max_action_value_delta, 0.05970000000670552)
    self.assertEqual(greedy_policy.choose_action(initial_obs), EAST)

    # One higher-reward SOUTH episode is not enough to overtake EAST.
    mock_policy = MockPolicy([SOUTH, CLEAN])
    mc.policy = mock_policy
    mc.run_episode()
    self.assertEqual(mc.metrics.episode_reward, 16)
    self.assertEqual(mc.metrics.max_action_value_delta, 0.08)
    self.assertEqual(greedy_policy.choose_action(initial_obs), EAST)

    # A second SOUTH episode finally flips the greedy policy to SOUTH.
    mock_policy = MockPolicy([SOUTH, CLEAN])
    mc.policy = mock_policy
    mc.run_episode()
    self.assertEqual(mc.metrics.episode_reward, 16)
    self.assertEqual(mc.metrics.max_action_value_delta, 0.0796000000089407)
    self.assertEqual(greedy_policy.choose_action(initial_obs), SOUTH)