Example #1
0
    def test_policy_update(self):
        """AveragingMC end-to-end check: two episodes update the value table
        so the greedy policy's preferred first action flips EAST -> SOUTH.

        The seed pins any stochastic tie-breaking so the metric values are exact.
        """
        np.random.seed(643674)
        # Observation as the agent sees it; presumably 2 marks the bot and 1
        # marks dirt — confirm against MockEnv.  `np.int` was removed in
        # NumPy 1.24, so the builtin `int` is used for the dtype instead.
        initial_obs = np.array([[2, 0, 0], [1, 0, 0], [0, 0, 0]], dtype=int)
        env = MockEnv(3)
        model = TableModel(env)
        greedy_policy = GreedyPolicy(model)

        # First episode: a scripted 6-step round trip ending with CLEAN.
        policy = MockPolicy([EAST, EAST, SOUTH, WEST, WEST, CLEAN])
        mc = AveragingMC(env, model, policy)
        mc.run_episode()

        self.assertEqual(mc.metrics.first_time_visited, 6)
        self.assertEqual(mc.metrics.fifth_time_visited, 0)
        self.assertEqual(mc.metrics.episode_reward, 12)
        self.assertEqual(mc.metrics.max_action_value_delta, 0)
        self.assertEqual(greedy_policy.choose_action(initial_obs), EAST)

        # Second episode: a shorter, higher-reward path; the averaged values
        # shift enough that greedy action selection now prefers SOUTH.
        mock_policy = MockPolicy([SOUTH, CLEAN])
        mc.policy = mock_policy
        mc.run_episode()

        self.assertEqual(mc.metrics.first_time_visited, 7)
        self.assertEqual(mc.metrics.fifth_time_visited, 0)
        self.assertEqual(mc.metrics.episode_reward, 16)
        self.assertEqual(mc.metrics.max_action_value_delta, 2.0)
        self.assertEqual(greedy_policy.choose_action(initial_obs), SOUTH)
Example #2
0
    def __init__(self):
        """Experiment wiring: tabular Sarsa on a 4x4 CleanBotEnv.

        Builds the environment, a table-backed value model, an epsilon-greedy
        training policy and a greedy testing policy over that model, then
        applies the run's hyperparameters.
        """
        super().__init__()
        self.env = CleanBotEnv(4)
        self.model = TableModel(self.env)
        self.training_policy = EpsilonGreedyPolicy(self.model, 0.1)
        self.testing_policy = GreedyPolicy(self.model)
        self.method = Sarsa(self.env, self.model, self.training_policy)

        # NOTE(review): presumably redundant with the 0.1 already passed to
        # EpsilonGreedyPolicy above — confirm that constructor arg is the
        # exploration rate before removing either one.
        self.training_policy.exploration = 0.1
        self.env.max_steps = 32   # per-episode step cap (name suggests; verify in CleanBotEnv)
        self.method.alpha = 0.01  # Sarsa step size
Example #3
0
    def __init__(self):
        """Experiment wiring: Sarsa with a convolutional Keras model on a
        4x4 CleanBotEnv.

        Same layout as the table-model experiment, but the value function is
        a Keras network (conv1_model) trained in batches of 64.
        """
        super().__init__()
        self.env = CleanBotEnv(4)
        self.model = KerasModel(self.env, model=conv1_model(self.env), batch_size=64)
        self.training_policy = EpsilonGreedyPolicy(self.model, 0.1)
        self.testing_policy = GreedyPolicy(self.model)
        self.method = Sarsa(self.env, self.model, self.training_policy)

        # NOTE(review): presumably redundant with the 0.1 already passed to
        # EpsilonGreedyPolicy above — confirm that constructor arg is the
        # exploration rate before removing either one.
        self.training_policy.exploration = 0.1
        self.env.max_steps = 32   # per-episode step cap (name suggests; verify in CleanBotEnv)
        self.method.alpha = 0.01  # Sarsa step size
        self.model.epochs = 60    # epochs per Keras fit call
Example #4
0
    def __init__(self, epochs, alpha, batch_size):
        """Experiment wiring: AlphaMC with a convolutional Keras model on a
        4x4 CleanBotEnv.

        Args:
            epochs: Keras training epochs per fit call.
            alpha: AlphaMC step size.
            batch_size: Keras fit batch size (also recorded on the instance
                and embedded in the run name).
        """
        super().__init__()
        self.env = CleanBotEnv(4)
        self.model = KerasModel(self.env,
                                model=conv1_model(self.env),
                                batch_size=batch_size)
        self.training_policy = EpsilonGreedyPolicy(self.model, 0.1)
        self.testing_policy = GreedyPolicy(self.model)
        self.method = AlphaMC(self.env, self.model, self.training_policy)

        # NOTE(review): presumably redundant with the 0.1 already passed to
        # EpsilonGreedyPolicy above — confirm that constructor arg is the
        # exploration rate before removing either one.
        self.training_policy.exploration = 0.1
        self.env.max_steps = 32  # per-episode step cap (name suggests; verify in CleanBotEnv)
        self.method.alpha = alpha
        self.model.epochs = epochs
        self.batch_size = batch_size

        # BUGFIX: the name must be built only after the attributes it reads
        # are assigned.  Previously it ran first, raising AttributeError on
        # the not-yet-set self.batch_size and reading self.model.epochs /
        # self.method.alpha before they were configured.
        self.name = f"{type(self).__name__}-{self.batch_size:03}-{self.model.epochs:03}-{self.method.alpha:.2f}"
Example #5
0
    def test_policy_update(self):
        """AlphaMC end-to-end check: constant-alpha updates move the value
        table by alpha-scaled deltas, and only after enough reinforcement of
        the short path does the greedy first action flip EAST -> SOUTH.

        The seed pins any stochastic tie-breaking so the metric values are exact.
        """
        np.random.seed(643674)
        # Observation as the agent sees it; presumably 2 marks the bot and 1
        # marks dirt — confirm against MockEnv.  `np.int` was removed in
        # NumPy 1.24, so the builtin `int` is used for the dtype instead.
        initial_obs = np.array([[2, 0, 0], [1, 0, 0], [0, 0, 0]], dtype=int)
        env = MockEnv(3)
        model = TableModel(env)
        greedy_policy = GreedyPolicy(model)

        # Episode 1: scripted 6-step round trip, explicit small alpha.
        policy = MockPolicy([EAST, EAST, SOUTH, WEST, WEST, CLEAN])
        mc = AlphaMC(env, model, policy)
        mc.alpha = 0.005
        mc.run_episode()

        self.assertEqual(mc.metrics.episode_reward, 12)
        self.assertAlmostEqual(mc.metrics.max_action_value_delta, 0.06)
        self.assertEqual(greedy_policy.choose_action(initial_obs), EAST)

        # Episode 2: same trajectory through a fresh AlphaMC (default alpha);
        # the delta shrinks as values approach the episode return.
        policy = MockPolicy([EAST, EAST, SOUTH, WEST, WEST, CLEAN])
        mc = AlphaMC(env, model, policy)
        mc.run_episode()

        self.assertEqual(mc.metrics.episode_reward, 12)
        self.assertAlmostEqual(mc.metrics.max_action_value_delta,
                               0.05970000000670552)
        self.assertEqual(greedy_policy.choose_action(initial_obs), EAST)

        # Episode 3: higher-reward short path; one episode is not yet enough
        # to overtake EAST under constant-alpha averaging.
        mock_policy = MockPolicy([SOUTH, CLEAN])
        mc.policy = mock_policy
        mc.run_episode()

        self.assertEqual(mc.metrics.episode_reward, 16)
        self.assertEqual(mc.metrics.max_action_value_delta, 0.08)
        self.assertEqual(greedy_policy.choose_action(initial_obs), EAST)

        # Episode 4: repeating the short path finally flips greedy to SOUTH.
        mock_policy = MockPolicy([SOUTH, CLEAN])
        mc.policy = mock_policy
        mc.run_episode()

        self.assertEqual(mc.metrics.episode_reward, 16)
        self.assertEqual(mc.metrics.max_action_value_delta, 0.0796000000089407)
        self.assertEqual(greedy_policy.choose_action(initial_obs), SOUTH)