def setUp(self):
        self.data = [Sample(np.array([0]), 0, 1, np.array([0])),
                     Sample(np.array([1]), 0, -1, np.array([1]))]

        self.basis = ExactBasis([2], 1)
        self.policy = Policy(self.basis,
                             .9,
                             0,
                             np.zeros((2, )),
                             Policy.TieBreakingStrategy.FirstWins)
Example #2
    def test_solver_uses_policy_and_data(self):
        """Test that the solver is passed the data and policy."""

        data = [10]
        initial_policy = Policy(FakeBasis(1))

        solver_stub = SolverParamStub(data, initial_policy)

        lspi.learn(solver_stub.data,
                   solver_stub.policy,
                   solver_stub,
                   max_iterations=1)
Example #3
    def __init__(self, env, steps_per_episode):
        self.env = env
        self.obs_space = self.env.observation_space
        self.obs_space_size = self.env.observation_space.shape[0]
        self.loadNoSample = self.env.env.stabilization

        self.stab = True
        if self.stab:
            self.obs_space_size = self.env.observation_space.shape[0]-1

        # For memory initialisation
        self.starting_episodes = 10
        self.starting_steps = steps_per_episode
        self.max_size = 100000
        self.memory = LSPIMemory(self.max_size)

        # define action_space
        self.min_vol = -9.0  # Minimal voltage to apply
        self.max_vol = 9.0  # Maximal voltage to apply
        self.size_action_space = 3
        self.action_space = np.arange(
            self.min_vol, self.max_vol+1,
            (self.max_vol-self.min_vol)/(self.size_action_space-1), dtype=float)  # Discrete action space
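        # With the defaults above (min_vol=-9, max_vol=9, 3 actions) this
        # arange call yields [-9., 0., 9.].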
        self.action_space = torch.tensor(self.action_space)
        print(self.action_space)

        # Define Basis Function
        self.number_means = 3
        self.rbf_means = self.number_means * self.obs_space_size
        self.gamma = 0.8
        self.lstdq = LSTDQ(self.gamma, self.obs_space_size, self.stab)

        self.initial_policy = Policy(self.action_space, self.obs_space_size, self.number_means, self.obs_space,
                                     self.stab)
        self.changing_policy = copy(self.initial_policy)
        self.epsilon = 0.0001

        # Using the policy
        self.test_episodes = 200
        self.test_timesteps = steps_per_episode
        self.still_learning = False
        self.old_weights = torch.Tensor([])

        # For Plotting and evaluation
        self.rew_overall = 0
        self.rew_episodes_len = 10
        self.rew_episodes = np.zeros(self.rew_episodes_len)
        self.rwd_episodes_array = np.empty(0)
        self.rwd_overall_array = np.empty(0)
Example #4
    def test_epsilon_stopping_condition(self):
        """Test if learning stops when distance is less than epsilon."""

        with self.assertRaises(ValueError):
            lspi.learn(None, None, None, epsilon=0)

        epsilon_solver = EpsilonSolverStub(10**-21)

        lspi.learn(None,
                   Policy(FakeBasis(1)),
                   epsilon_solver,
                   epsilon=10**-20,
                   max_iterations=1000)

        self.assertEqual(epsilon_solver.num_calls, 1)
Example #5
    def test_max_iterations_stopping_condition(self):
        """Test if learning stops when max_iterations is reached."""

        with self.assertRaises(ValueError):
            lspi.learn(None, None, None, max_iterations=0)

        max_iterations_solver = MaxIterationsSolverStub()

        lspi.learn(None,
                   Policy(FakeBasis(1)),
                   max_iterations_solver,
                   epsilon=10**-200,
                   max_iterations=10)

        self.assertEqual(max_iterations_solver.num_calls, 10)
Example #6
    def test_returns_policy_with_new_weights(self):
        """Test if the weights in the new policy differ and are not the same underlying numpy vector."""

        initial_policy = Policy(FakeBasis(1))

        weight_solver = WeightSolverStub(initial_policy.weights)

        new_policy = lspi.learn(None,
                                initial_policy,
                                weight_solver,
                                max_iterations=1)

        self.assertEqual(weight_solver.num_calls, 1)
        self.assertFalse(
            np.may_share_memory(initial_policy.weights, new_policy.weights))
        self.assertNotEqual(id(initial_policy), id(new_policy))
        np.testing.assert_array_almost_equal(new_policy.weights,
                                             weight_solver.weights)
class TestPolicy(TestCase):

    def create_policy(self, *args, **kwargs):
        return Policy(FakeBasis(5), *args, **kwargs)

    @staticmethod
    def list_has_duplicates(list, num_places=4):
        # Return True if the list contains duplicate q values.
        # Round the q values first so that small floating point
        # inconsistencies do not prevent duplicates from being detected.
        # Then make a set of the list: if there are no duplicates, the
        # cardinality of the set will match the length of the list.
        rounded_list = map(lambda x: round(x, num_places), list)
        return len(set(rounded_list)) < len(list)
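        # For example, under this rounding [1.0, 1.00004, 2.0] counts as
        # containing duplicates, while [1.0, 1.1, 2.0] does not.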

    def setUp(self):
        self.poly_policy = Policy(OneDimensionalPolynomialBasis(1, 2),
                                  weights=np.array([1., 1, 2, 2]))
        self.state = np.array([-3.])
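        # With this degree-1 basis over 2 actions, state [-3.] produces a
        # [1, -3] feature block for the chosen action, so the weights above
        # give Q(s, 0) = 1 - 3 = -2 and Q(s, 1) = 2 - 6 = -4 (checked by the
        # calc_q_value tests below).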
        self.tie_weights = np.ones((4,))

    def test_default_constructor(self):
        policy = self.create_policy()

        self.assertTrue(isinstance(policy.basis, FakeBasis))
        self.assertAlmostEqual(policy.discount, 1.0)
        self.assertAlmostEqual(policy.explore, 0.0)
        self.assertEqual(policy.weights.shape, (1,))
        self.assertEqual(policy.tie_breaking_strategy,
                         Policy.TieBreakingStrategy.RandomWins)

    def test_full_constructor(self):
        policy = self.create_policy(.5, .1, np.array([1.]),
                                    Policy.TieBreakingStrategy.FirstWins)

        self.assertTrue(isinstance(policy.basis, FakeBasis))
        self.assertAlmostEqual(policy.discount, .5)
        self.assertAlmostEqual(policy.explore, 0.1)
        np.testing.assert_array_almost_equal(policy.weights, np.array([1.]))
        self.assertEqual(policy.tie_breaking_strategy,
                         Policy.TieBreakingStrategy.FirstWins)

    def test_discount_out_of_bounds(self):
        with self.assertRaises(ValueError):
            self.create_policy(discount=-1.0)

        with self.assertRaises(ValueError):
            self.create_policy(discount=1.1)

    def test_explore_out_of_bounds(self):
        with self.assertRaises(ValueError):
            self.create_policy(explore=-.01)

        with self.assertRaises(ValueError):
            self.create_policy(explore=1.1)

    def test_weight_basis_dimensions_mismatch(self):
        with self.assertRaises(ValueError):
            self.create_policy(weights=np.arange(2))

    def test_copy(self):
        orig_policy = self.create_policy()
        policy_copy = copy(orig_policy)

        self.assertNotEqual(id(orig_policy), id(policy_copy))
        self.assertEqual(orig_policy.num_actions,
                         policy_copy.num_actions)
        self.assertEqual(orig_policy.discount, policy_copy.discount)
        self.assertEqual(orig_policy.explore, policy_copy.explore)
        np.testing.assert_array_almost_equal(orig_policy.weights,
                                             policy_copy.weights)

        self.assertNotEqual(id(orig_policy.weights), id(policy_copy.weights))

        # verify that changing a weight in the original doesn't affect the copy
        orig_policy.weights[0] *= -1

        # numpy doesn't have an assert if not equal method
        # so to do the inverse I'm asserting the two arrays are equal
        # and expecting the assertion to fail
        with self.assertRaises(AssertionError):
            np.testing.assert_array_almost_equal(orig_policy.weights,
                                                 policy_copy.weights)

    def test_calc_q_value_unit_weights(self):
        q_value = self.poly_policy.calc_q_value(self.state, 0)
        self.assertAlmostEqual(q_value, -2.)

    def test_calc_q_value_non_unit_weights(self):
        q_value = self.poly_policy.calc_q_value(self.state, 1)
        self.assertAlmostEqual(q_value, -4.)

    def test_calc_q_value_negative_action(self):
        with self.assertRaises(IndexError):
            self.poly_policy.calc_q_value(self.state, -1)

    def test_calc_q_value_out_of_bounds_action(self):
        with self.assertRaises(IndexError):
            self.poly_policy.calc_q_value(self.state, 2)

    def test_calc_q_value_mismatched_state_dimensions(self):
        with self.assertRaises(ValueError):
            self.poly_policy.calc_q_value(np.ones((2,)), 0)

    def test_best_action_no_ties(self):

        q_values = [self.poly_policy.calc_q_value(self.state, action)
            for action in range(self.poly_policy.num_actions)]

        self.assertFalse(TestPolicy.list_has_duplicates(q_values))

        best_action = self.poly_policy.best_action(self.state)
        self.assertEqual(best_action, 0)

    def test_best_action_with_ties_first_wins(self):
        self.poly_policy.weights = self.tie_weights
        self.poly_policy.tie_breaking_strategy = \
            Policy.TieBreakingStrategy.FirstWins

        q_values = [self.poly_policy.calc_q_value(self.state, action)
            for action in range(self.poly_policy.num_actions)]

        self.assertTrue(TestPolicy.list_has_duplicates(q_values))

        best_action = self.poly_policy.best_action(self.state)
        self.assertEqual(best_action, 0)

    def test_best_action_with_ties_last_wins(self):
        self.poly_policy.weights = self.tie_weights
        self.poly_policy.tie_breaking_strategy = \
            Policy.TieBreakingStrategy.LastWins

        q_values = [self.poly_policy.calc_q_value(self.state, action)
            for action in range(self.poly_policy.num_actions)]

        self.assertTrue(TestPolicy.list_has_duplicates(q_values))

        best_action = self.poly_policy.best_action(self.state)
        self.assertEqual(best_action, 1)

    def test_best_action_with_ties_random_wins(self):
        self.poly_policy.weights = self.tie_weights
        self.poly_policy.tie_breaking_strategy = \
            Policy.TieBreakingStrategy.RandomWins

        q_values = [self.poly_policy.calc_q_value(self.state, action)
            for action in range(self.poly_policy.num_actions)]

        self.assertTrue(TestPolicy.list_has_duplicates(q_values))

        # select the best action num_times times
        num_times = 10
        best_actions = [self.poly_policy.best_action(self.state)
                        for i in range(num_times)]

        # This test will fail if every selection returned the same action.
        # If action 0 were always selected the sum would equal 0; if action 1
        # were always selected the sum would equal num_times.
        self.assertLess(int(sum(best_actions)), num_times)
        self.assertNotEqual(int(sum(best_actions)), 0)

    def test_best_action_mismatched_state_dimensions(self):
        with self.assertRaises(ValueError):
            self.poly_policy.best_action(np.ones((2,)))

    def test_select_action_random(self):
        # first verify there are no ties
        # this way we know the tie breaking strategy isn't introducing
        # the randomness
        q_values = [self.poly_policy.calc_q_value(self.state, action)
            for action in range(self.poly_policy.num_actions)]

        self.assertFalse(TestPolicy.list_has_duplicates(q_values))

        self.poly_policy.explore = 1.0
        self.poly_policy.tie_breaking_strategy = \
            Policy.TieBreakingStrategy.FirstWins

        # this is set up to evaluate to no tie
        num_times = 10
        best_actions = [self.poly_policy.select_action(self.state)
                        for i in range(num_times)]

        self.assertNotEqual(sum(best_actions), 0)
        self.assertNotEqual(sum(best_actions), num_times)

    def test_select_action_deterministic(self):
        # first verify there are no ties
        # this way we know the tie breaking strategy isn't introducing
        # the randomness
        q_values = [self.poly_policy.calc_q_value(self.state, action)
            for action in range(self.poly_policy.num_actions)]

        self.assertFalse(TestPolicy.list_has_duplicates(q_values))

        self.poly_policy.explore = 0.0
        self.poly_policy.tie_breaking_strategy = \
            Policy.TieBreakingStrategy.FirstWins

        # this is set up to evaluate to no tie
        num_times = 10
        best_actions = [self.poly_policy.select_action(self.state)
                        for i in range(num_times)]
        self.assertEqual(sum(best_actions), 0)

    def test_select_action_mismatched_state_dimensions(self):
        with self.assertRaises(ValueError):
            self.poly_policy.select_action(np.ones((2,)))

    def test_num_actions_getter(self):
        self.assertEqual(self.poly_policy.num_actions,
                         self.poly_policy.basis.num_actions)

        self.poly_policy.basis.num_actions = 10

        self.assertEqual(self.poly_policy.num_actions,
                         self.poly_policy.basis.num_actions)

    def test_num_actions_setter(self):
        self.assertEqual(self.poly_policy.num_actions,
                         self.poly_policy.basis.num_actions)

        self.poly_policy.num_actions = 10

        self.assertEqual(self.poly_policy.num_actions,
                         self.poly_policy.basis.num_actions)
Example #10
import lspi
from lspi.domains import Domain, ChainDomain
from lspi.solvers import LSTDQSolver
from lspi.policy import Policy
from lspi.sample import Sample
from lspi.basis_functions import FakeBasis, OneDimensionalPolynomialBasis

if __name__ == '__main__':
  # data = [
  #   Sample(np.array([0]), 0, 1, np.array([0])),
  #   Sample(np.array([1]), 0, -1, np.array([1]), True)
  # ]

  precondition_value = .3
  initial_policy = Policy(OneDimensionalPolynomialBasis(3, 2), .9, 0,
                          tie_breaking_strategy=Policy.TieBreakingStrategy.FirstWins)
  # initial_policy = Policy(lspi.basis_functions.RadialBasisFunction(np.array([[0], [2], [4], [6], [8]]), .5, 2), .9, 0)
  sampling_policy = Policy(FakeBasis(2), .9, 1)
  solver = LSTDQSolver(precondition_value)
  # weights = solver.solve(data[:-1], initial_policy)
  domain = ChainDomain()
  samples = []

  for i in range(1000):
    action = sampling_policy.select_action(domain.current_state())
    samples.append(domain.apply_action(action))

  learned_policy = lspi.learn(samples, initial_policy, solver)

  domain.reset()

  cumulative_reward = 0
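
  # A minimal sketch of how the evaluation might continue (assumed, not part
  # of the original snippet): roll out the learned policy greedily and sum
  # rewards, using the Sample fields in the constructor order shown above.
  for i in range(1000):
    action = learned_policy.best_action(domain.current_state())
    sample = domain.apply_action(action)
    cumulative_reward += sample.reward

  print(cumulative_reward)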