    def __init__(self, alpha, beta, epsilon_bpolicy, epsilon_tpolicy, gamma, n,
                 sigma, dim_out, fully_connected_layers):
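        """
        Builds a Q(sigma) agent on MountainCar with a fully-connected neural-network
        function approximator: epsilon-greedy target and behaviour policies, a
        Model_mFO network, and a NeuralNetwork_wPrioritizedTraining learner.
        Assumes `import tensorflow as tf` and the project's environment, policy,
        model, function-approximator, and agent classes are already imported.
        """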
        self.env = MountainCar()
        self.tpolicy = EpsilonGreedyPolicy(
            initial_epsilon=epsilon_tpolicy,
            numActions=self.env.get_num_actions())
        self.bpolicy = EpsilonGreedyPolicy(
            initial_epsilon=epsilon_bpolicy,
            numActions=self.env.get_num_actions())

        " Model Parameters "
        name = "experiment"
        observation_dimensions = self.env.get_observation_dimensions()
        num_actions = self.env.get_num_actions()
        gate_fun = tf.nn.relu
        self.model = Model_mFO(name=name,
                               dim_out=dim_out,
                               observation_dimensions=observation_dimensions,
                               num_actions=num_actions,
                               gate_fun=gate_fun,
                               fully_connected_layers=fully_connected_layers)

        " Neural Network Parameters "
        batch_size = 1
        self.tf_session = tf.Session()
        number_of_percentiles = 0
        percentile_index = 0
        training_steps = 1
        self.fa = NeuralNetwork_wPrioritizedTraining(
            model=self.model,
            optimizer=tf.train.GradientDescentOptimizer,
            numActions=num_actions,
            batch_size=batch_size,
            alpha=alpha,
            tf_session=self.tf_session,
            observation_dimensions=observation_dimensions,
            layer_training_print_freq=5000000,
            number_of_percentiles=number_of_percentiles,
            training_steps=training_steps,
            percentile_to_train_index=percentile_index)

        " Agent Parameters "
        self.agent = QSigma(n=n,
                            gamma=gamma,
                            beta=beta,
                            sigma=sigma,
                            environment=self.env,
                            function_approximator=self.fa,
                            target_policy=self.tpolicy,
                            behaviour_policy=self.bpolicy)
        self.agent_parameters = {
            "beta": beta,
            "gamma": gamma,
            "n": n,
            "bpolicy": self.bpolicy,
            "tpolicy": self.tpolicy
        }
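        # The constructor arguments are stored in agent_parameters so that reset_agent (below)
        # can rebuild an identical QSigma agent on a fresh MountainCar environment.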
# Example #2
    def __init__(self, alpha, numTilings, beta, epsilon_bpolicy, epsilon_tpolicy, gamma, n, sigma):
        self.env = MountainCar()
        self.fa = TileCoderFA(numTilings=numTilings, numActions=self.env.get_num_actions(), alpha=alpha,
                              state_space_range=(self.env.get_high() - self.env.get_low()),
                              state_space_size=len(self.env.get_current_state()), tile_side_length=10)
        self.tpolicy = EpsilonGreedyPolicy(initial_epsilon=epsilon_tpolicy, numActions=self.env.get_num_actions())
        self.bpolicy = EpsilonGreedyPolicy(initial_epsilon=epsilon_bpolicy, numActions=self.env.get_num_actions())

        self.agent = QSigma(n=n, gamma=gamma, beta=beta, sigma=sigma, environment=self.env,
                            function_approximator=self.fa, target_policy=self.tpolicy, behaviour_policy=self.bpolicy)
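
        # Hypothetical usage sketch (the wrapper class name is illustrative, not from the source):
        #     experiment = MountainCarTileCoderExperiment(alpha=0.1, numTilings=8, beta=1,
        #                                                 epsilon_bpolicy=0.1, epsilon_tpolicy=0.1,
        #                                                 gamma=1, n=4, sigma=0.5)
        #     experiment.agent.train(50)  # QSigma.train(num_episodes), as used in the tests below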
    def reset_agent(self):
        self.env = MountainCar()
        for var in tf.global_variables():
            self.tf_session.run(var.initializer)
        self.agent = QSigma(beta=self.agent_parameters["beta"],
                            gamma=self.agent_parameters["gamma"],
                            n=self.agent_parameters["n"],
                            behaviour_policy=self.agent_parameters["bpolicy"],
                            target_policy=self.agent_parameters["tpolicy"],
                            environment=self.env,
                            function_approximator=self.fa)
def main():
    """" Directories and Paths for Saving and Restoring """
    homepath = "/home/jfernando/"
    srcpath = homepath + "PycharmProjects/RL_Experiments/Demos/Seaquest_Test/"
    games_directory = homepath + "PycharmProjects/RL_Experiments/Experiments_Engine/Environments/Arcade_Learning_Environment/Supported_Roms/"
    rom_name = "seaquest.bin"
    experiment_name = "seaquest_test"
    experiment_path = srcpath + experiment_name
    restore = False
    agent_history = NN_Agent_History(experiment_path, restore)

    " Environment "
    config = Config()
    env = ALE_Environment(config,
                          rom_filename=rom_name,
                          games_directory=games_directory)
    observation_dimensions = env.get_observation_dimensions()
    num_actions = env.get_num_actions()

    " Optimizer and TF Session "
    sess = tf.Session()
    optimizer = tf.train.GradientDescentOptimizer

    if restore:
        " Dictionaries "
        agent_dictionary = agent_history.load_nn_agent_dictionary()
        env_dictionary = agent_history.load_nn_agent_environment_dictionary()
        model_dictionary = agent_history.load_nn_agent_model_dictionary()
        fa_dictionary = agent_history.load_nn_agent_fa_dictionary()

        env.set_environment_dictionary(env_dictionary)
        model = models.Model_nCPmFO(model_dictionary=model_dictionary)
        fa = NeuralNetwork_wPrioritizedTraining(neural_network=model,
                                                optimizer=optimizer,
                                                fa_dictionary=fa_dictionary,
                                                tf_session=sess)
        agent = QSigma(environment=env,
                       function_approximator=fa,
                       agent_dictionary=agent_dictionary)
        restore_graph(experiment_path, sess)
    else:
        " Agent variables "
        tpolicy = EpsilonGreedyPolicy(env.get_num_actions(),
                                      initial_epsilon=0.1)
        bpolicy = EpsilonGreedyPolicy(env.get_num_actions(),
                                      initial_epsilon=0.1)
        gamma = 0.99
        n = 5
        beta = 1
        sigma = 0.5
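        # In n-step Q(sigma), sigma interpolates between full-sampling Sarsa backups (sigma = 1)
        # and expected Tree-Backup backups (sigma = 0); n is the backup length and gamma the
        # discount factor.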

        " Model Variables "
        name = experiment_name
        dim_out = [32, 64, 64, 512]
        gate_fun = tf.nn.relu
        conv_layers = 3
        filter_dims = [8, 4, 3]
        fully_connected_layers = 1

        " FA variables "
        batch_size = 1
        alpha = 0.000001
        strides = [4, 2, 1]
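        # Presumably the standard DQN-style architecture: three convolutional layers
        # (8x8 stride 4, 4x4 stride 2, 3x3 stride 1 with 32/64/64 feature maps) followed by
        # one fully-connected layer of 512 units before the action-value outputs.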
        model = models.Model_nCPmFO(
            name=name,
            dim_out=dim_out,
            observation_dimensions=observation_dimensions,
            num_actions=num_actions,
            gate_fun=gate_fun,
            convolutional_layers=conv_layers,
            filter_dims=filter_dims,
            fully_connected_layers=fully_connected_layers,
            strides=strides)
        fa = NeuralNetwork_wPrioritizedTraining(
            neural_network=model,
            optimizer=optimizer,
            numActions=num_actions,
            batch_size=batch_size,
            alpha=alpha,
            tf_session=sess,
            observation_dimensions=observation_dimensions,
            number_of_percentiles=0)
        agent = QSigma(n=n,
                       gamma=gamma,
                       beta=beta,
                       sigma=sigma,
                       environment=env,
                       function_approximator=fa,
                       target_policy=tpolicy,
                       behaviour_policy=bpolicy)

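    # training_loop presumably runs one episode per call while annealing the behaviour policy's
    # epsilon towards final_epsilon; looping until env.frame_count reaches 50000 caps the total
    # number of emulator frames used by this demo.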
    while env.frame_count < 50000:
        training_loop(rl_agent=agent,
                      iterations=1,
                      episodes_per_iteration=1,
                      render=True,
                      agent_render=False,
                      final_epsilon=0.1,
                      bpolicy_frames_before_target=100,
                      decrease_epsilon=True)

    save_graph(experiment_path, sess)
    agent_history.save_training_history(experiment_path, agent)
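

# Assumed entry point; the original script presumably invokes main() like this.
if __name__ == "__main__":
    main()
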
class Test_MountainCar_Environment(unittest.TestCase):

    def setUp(self):
        config = Config()
        self.env = MountainCar(config)

        " Target Policy Parameters "
        config.num_actions = self.env.get_num_actions()
        config.target_policy = Config()
        config.target_policy.num_actions = self.env.get_num_actions()
        config.target_policy.initial_epsilon = 0.1
        config.target_policy.anneal_epsilon = False

        " FA Parameters "
        config.num_tilings = 32
        config.tiling_side_length = 8
        config.num_actions = 3
        config.num_dims = 2
        config.alpha = (1/4) / 32
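        # Step size of 1/4 divided by the 32 tilings, the usual way of scaling
        # the learning rate for tile coding.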

        self.tpolicy = EpsilonGreedyPolicy(config, behaviour_policy=False)

        ### Test 1 Setup ###
        self.fa1 = TileCoderFA(config)
        config.behaviour_policy = config.target_policy
        self.bpolicy = EpsilonGreedyPolicy(config, behaviour_policy=True)

        config1 = Config()
        config1.n = 4
        config1.gamma = 1
        config1.beta = 1
        config1.sigma = 0.5
        config1.save_summary = True

        self.summary = {}
        self.agent1 = QSigma(config=config1, environment=self.env, function_approximator=self.fa1,
                             target_policy=self.tpolicy, behaviour_policy=self.bpolicy, summary=self.summary)

        ### Test 2 Setup ###
        config.behaviour_policy = Config()
        config.behaviour_policy.initial_epsilon = 0.5
        config.behaviour_policy.final_epsilon = 0.1
        config.behaviour_policy.annealing_period = 10000
        config.behaviour_policy.anneal_epsilon = True

        config2 = Config()
        config2.n = 3
        config2.gamma = 1
        config2.beta = 1
        config2.sigma = 0.5
        self.bpolicy2 = EpsilonGreedyPolicy(config, behaviour_policy=True)
        self.fa2 = TileCoderFA(config)
        self.agent2 = QSigma(config=config2, environment=self.env, function_approximator=self.fa2,
                             target_policy=self.tpolicy, behaviour_policy=self.bpolicy2)

        ### Test 3 Setup ###
        config.behaviour_policy = Config()
        config.behaviour_policy.initial_epsilon = 1
        config.behaviour_policy.final_epsilon = 0.1
        config.behaviour_policy.anneal_epsilon = True

        config3 = Config()
        config3.n = 3
        config3.gamma = 1
        config3.beta = 1
        config3.sigma = 0.5
        config3.initial_rand_steps = 5000
        config3.rand_steps_count = 0
        self.bpolicy3 = EpsilonGreedyPolicy(config, behaviour_policy=True)
        self.fa3 = TileCoderFA(config)
        self.agent3 = QSigma(config=config3, environment=self.env, function_approximator=self.fa3,
                             target_policy=self.tpolicy, behaviour_policy=self.bpolicy3)

    def test_train(self):
        print("\n############ Testing Training Function ##############")
        print("Training 50 episodes:")
        for i in range(50):
            # print("\tTraining episode:", i+1)
            self.agent1.train(1)

        print("\tThe average return after 50 episodes is:", np.average(self.summary['return_per_episode']))

        print("Training 450 more episodes:")
        for i in range(9):
            print("\tTraining", 50, "more episodes...")
            self.agent1.train(50)
            print("\tThe average return after", (i+1) * 50 + 50, "episodes is:",
                  np.average(self.summary['return_per_episode']))

    def test_annealing_epsilon(self):
        print("\n############ Testing Annealing Epsilon ###############")
        print("The initial epsilon is:", self.agent2.bpolicy.initial_epsilon)
        print("The final epsilon is:", self.agent2.bpolicy.final_epsilon)
        print("The annealing period is:", self.agent2.bpolicy.annealing_period)
        print("Training for 1 episodes...")
        self.agent2.train(1)
        print("The current epsilon is:", self.bpolicy2.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("Training for 10 more episodes...")
        self.agent2.train(10)
        print("The current epsilon is:", self.bpolicy2.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("Training for 100 more episodes...")
        self.agent2.train(100)
        print("The current epsilon is:", self.bpolicy2.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)


    def test_steps_before_training(self):
        print("\n############ Testing Steps Before Training ###############")
        print("The initial epsilon is:", self.agent3.bpolicy.initial_epsilon)
        print("The final epsilon is:", self.agent3.bpolicy.final_epsilon)
        print("The annealing period is:", self.agent3.bpolicy.annealing_period)
        print("The number of steps before training is:", self.agent3.config.initial_rand_steps)
        print("The current number of steps before training is:",
              self.agent3.config.rand_steps_count)
        print("Training for 1 episodes...")
        self.agent3.train(1)
        print("The current epsilon is:", self.bpolicy3.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("The current number of steps before training is:", self.agent3.config.rand_steps_count)
        print("Training for 10 more episodes...")
        self.agent3.train(10)
        print("The current epsilon is:", self.bpolicy3.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("The current number of steps before training is:",
              self.agent3.config.rand_steps_count)
        print("Training for 100 more episodes...")
        self.agent3.train(100)
        print("The current epsilon is:", self.bpolicy3.epsilon)
        print("The epsilon of the target policy is:", self.tpolicy.epsilon)
        print("The current number of steps before training is:",
              self.agent3.config.rand_steps_count)