# Q(sigma) experiment with a neural-network function approximator.
# Assumes TensorFlow 1.x ("import tensorflow as tf") plus the project's MountainCar,
# EpsilonGreedyPolicy, Model_mFO, NeuralNetwork_wPrioritizedTraining, and QSigma classes.
def __init__(self, alpha, beta, epsilon_bpolicy, epsilon_tpolicy, gamma, n, sigma, dim_out,
             fully_connected_layers):
    self.env = MountainCar()
    self.tpolicy = EpsilonGreedyPolicy(initial_epsilon=epsilon_tpolicy,
                                       numActions=self.env.get_num_actions())
    self.bpolicy = EpsilonGreedyPolicy(initial_epsilon=epsilon_bpolicy,
                                       numActions=self.env.get_num_actions())

    # Model Parameters
    name = "experiment"
    observation_dimensions = self.env.get_observation_dimensions()
    num_actions = self.env.get_num_actions()
    gate_fun = tf.nn.relu
    self.model = Model_mFO(name=name, dim_out=dim_out,
                           observation_dimensions=observation_dimensions,
                           num_actions=num_actions, gate_fun=gate_fun,
                           fully_connected_layers=fully_connected_layers)

    # Neural Network Parameters
    batch_size = 1
    self.tf_session = tf.Session()
    number_of_percentiles = 0
    percentile_index = 0
    training_steps = 1
    self.fa = NeuralNetwork_wPrioritizedTraining(
        model=self.model, optimizer=tf.train.GradientDescentOptimizer,
        numActions=num_actions, batch_size=batch_size, alpha=alpha,
        tf_session=self.tf_session, observation_dimensions=observation_dimensions,
        layer_training_print_freq=5000000, number_of_percentiles=number_of_percentiles,
        training_steps=training_steps, percentile_to_train_index=percentile_index)

    # Agent Parameters
    self.agent = QSigma(n=n, gamma=gamma, beta=beta, sigma=sigma, environment=self.env,
                        function_approximator=self.fa, target_policy=self.tpolicy,
                        behaviour_policy=self.bpolicy)
    self.agent_parameters = {"beta": beta, "gamma": gamma, "n": n,
                             "bpolicy": self.bpolicy, "tpolicy": self.tpolicy}
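# Usage sketch (hypothetical: the __init__ above is assumed to belong to an experiment
# class, called NeuralNetworkExperiment here only for illustration; the hyperparameter
# values are likewise illustrative, not taken from the source):
#
#   experiment = NeuralNetworkExperiment(alpha=0.001, beta=1.0, epsilon_bpolicy=0.1,
#                                        epsilon_tpolicy=0.1, gamma=1.0, n=3, sigma=0.5,
#                                        dim_out=[1000], fully_connected_layers=1)
#   experiment.agent.train(10)   # train the QSigma agent for 10 episodes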
def __init__(self, alpha, numTilings, beta, epsilon_bpolicy, epsilon_tpolicy, gamma, n, sigma):
    self.env = MountainCar()
    self.fa = TileCoderFA(numTilings=numTilings, numActions=self.env.get_num_actions(),
                          alpha=alpha,
                          state_space_range=(self.env.get_high() - self.env.get_low()),
                          state_space_size=len(self.env.get_current_state()),
                          tile_side_length=10)
    self.tpolicy = EpsilonGreedyPolicy(initial_epsilon=epsilon_tpolicy,
                                       numActions=self.env.get_num_actions())
    self.bpolicy = EpsilonGreedyPolicy(initial_epsilon=epsilon_bpolicy,
                                       numActions=self.env.get_num_actions())
    self.agent = QSigma(n=n, gamma=gamma, beta=beta, sigma=sigma, environment=self.env,
                        function_approximator=self.fa, target_policy=self.tpolicy,
                        behaviour_policy=self.bpolicy)
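# Usage sketch (hypothetical: the __init__ above is assumed to belong to a tile-coding
# experiment class, called TileCoderExperiment here only for illustration). A common
# tile-coding convention, also used in the test config below, is to divide the base
# step size by the number of tilings:
#
#   experiment = TileCoderExperiment(alpha=0.25 / 8, numTilings=8, beta=1.0,
#                                    epsilon_bpolicy=0.1, epsilon_tpolicy=0.1,
#                                    gamma=1.0, n=3, sigma=0.5)
#   experiment.agent.train(50)   # train the QSigma agent for 50 episodes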
def reset_agent(self):
    self.env = MountainCar()
    # Re-initialize every TensorFlow variable so the network restarts from scratch.
    for var in tf.global_variables():
        self.tf_session.run(var.initializer)
    # Note: sigma is not stored in agent_parameters, so the rebuilt agent appears to
    # fall back to QSigma's default sigma rather than the value passed to __init__.
    self.agent = QSigma(beta=self.agent_parameters["beta"],
                        gamma=self.agent_parameters["gamma"],
                        n=self.agent_parameters["n"],
                        behaviour_policy=self.agent_parameters["bpolicy"],
                        target_policy=self.agent_parameters["tpolicy"],
                        environment=self.env, function_approximator=self.fa)
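# Sketch of how reset_agent supports averaging over independent runs (hypothetical
# driver code; the experiment object, run count, and episode count are illustrative):
#
#   for run in range(10):
#       experiment.reset_agent()    # fresh environment and re-initialized weights
#       experiment.agent.train(500)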
# Unit tests for the Q(sigma) agent on Mountain Car.
# Assumes the project's Config, MountainCar, EpsilonGreedyPolicy, TileCoderFA,
# and QSigma classes are importable.
import unittest

import numpy as np


class Test_MountainCar_Environment(unittest.TestCase):

    def setUp(self):
        config = Config()
        self.env = MountainCar(config)

        # Target Policy Parameters
        config.num_actions = self.env.get_num_actions()
        config.target_policy = Config()
        config.target_policy.num_actions = self.env.get_num_actions()
        config.target_policy.initial_epsilon = 0.1
        config.target_policy.anneal_epsilon = False

        # FA Parameters
        config.num_tilings = 32
        config.tiling_side_length = 8
        config.num_actions = 3
        config.num_dims = 2
        config.alpha = (1 / 4) / 32   # step size scaled down by the number of tilings

        self.tpolicy = EpsilonGreedyPolicy(config, behaviour_policy=False)

        ### Test 1 Setup ###
        self.fa1 = TileCoderFA(config)
        config.behaviour_policy = config.target_policy
        self.bpolicy = EpsilonGreedyPolicy(config, behaviour_policy=True)
        config1 = Config()
        config1.n = 4
        config1.gamma = 1
        config1.beta = 1
        config1.sigma = 0.5
        config1.save_summary = True
        self.summary = {}
        self.agent1 = QSigma(config=config1, environment=self.env,
                             function_approximator=self.fa1, target_policy=self.tpolicy,
                             behaviour_policy=self.bpolicy, summary=self.summary)

        ### Test 2 Setup ###
        config.behaviour_policy = Config()
        config.behaviour_policy.initial_epsilon = 0.5
        config.behaviour_policy.final_epsilon = 0.1
        config.behaviour_policy.annealing_period = 10000
        config.behaviour_policy.anneal_epsilon = True
        config2 = Config()
        config2.n = 3
        config2.gamma = 1
        config2.beta = 1
        config2.sigma = 0.5
        self.bpolicy2 = EpsilonGreedyPolicy(config, behaviour_policy=True)
        self.fa2 = TileCoderFA(config)
        self.agent2 = QSigma(config=config2, environment=self.env,
                             function_approximator=self.fa2, target_policy=self.tpolicy,
                             behaviour_policy=self.bpolicy2)

        ### Test 3 Setup ###
        config.behaviour_policy = Config()
        config.behaviour_policy.initial_epsilon = 1
        config.behaviour_policy.final_epsilon = 0.1
        config.behaviour_policy.anneal_epsilon = True
        config3 = Config()
        config3.n = 3
        config3.gamma = 1
        config3.beta = 1
        config3.sigma = 0.5
        config3.initial_rand_steps = 5000
        config3.rand_steps_count = 0
        self.bpolicy3 = EpsilonGreedyPolicy(config, behaviour_policy=True)
        self.fa3 = TileCoderFA(config)
        self.agent3 = QSigma(config=config3, environment=self.env,
                             function_approximator=self.fa3, target_policy=self.tpolicy,
                             behaviour_policy=self.bpolicy3)

    def test_train(self):
        print("\n############ Testing Training Function ##############")
        print("Training 50 episodes:")
        for i in range(50):
            # print("\tTraining episode:", i + 1)
            self.agent1.train(1)
        print("\tThe average return after 50 episodes is:",
              np.average(self.summary['return_per_episode']))
        print("Training 450 more episodes:")
        for i in range(9):
            print("\tTraining", 50, "more episodes...")
            self.agent1.train(50)
            print("\tThe average return after", (i + 1) * 50 + 50, "episodes is:",
                  np.average(self.summary['return_per_episode']))

    def test_annealing_epsilon(self):
        print("\n############ Testing Annealing Epsilon ###############")
        print("The initial epsilon is:", self.agent2.bpolicy.initial_epsilon)
        print("The final epsilon is:", self.agent2.bpolicy.final_epsilon)
        print("The annealing period is:", self.agent2.bpolicy.annealing_period)
        print("Training for 1 episode...")
        self.agent2.train(1)
        print("The current epsilon is:", self.bpolicy2.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("Training for 10 more episodes...")
        self.agent2.train(10)
        print("The current epsilon is:", self.bpolicy2.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("Training for 100 more episodes...")
        self.agent2.train(100)
        print("The current epsilon is:", self.bpolicy2.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)

    def test_steps_before_training(self):
        print("\n############ Testing Steps Before Training ###############")
        print("The initial epsilon is:", self.agent3.bpolicy.initial_epsilon)
        print("The final epsilon is:", self.agent3.bpolicy.final_epsilon)
        print("The annealing period is:", self.agent3.bpolicy.annealing_period)
        print("The number of steps before training is:",
              self.agent3.config.initial_rand_steps)
        print("The current number of steps before training is:",
              self.agent3.config.rand_steps_count)
        print("Training for 1 episode...")
        self.agent3.train(1)
        print("The current epsilon is:", self.bpolicy3.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("The current number of steps before training is:",
              self.agent3.config.rand_steps_count)
        print("Training for 10 more episodes...")
        self.agent3.train(10)
        print("The current epsilon is:", self.bpolicy3.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("The current number of steps before training is:",
              self.agent3.config.rand_steps_count)
        print("Training for 100 more episodes...")
        self.agent3.train(100)
        print("The current epsilon is:", self.bpolicy3.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("The current number of steps before training is:",
              self.agent3.config.rand_steps_count)
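# Standard unittest entry point so the suite can be run directly from the command
# line (e.g. "python test_mountain_car.py"; the file name here is illustrative):
if __name__ == "__main__":
    unittest.main()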