Example no. 1
    def __init__(self, config):
        super().__init__(config)
        self.training_mode = True
        self.num_skills = config.hyperparameters["num_skills"] # check
        self.unsupervised_episodes = config.hyperparameters["num_unsupservised_episodes"] # check
        self.supervised_episodes = config.num_episodes_to_run - self.unsupervised_episodes

        assert self.hyperparameters["DISCRIMINATOR"]["final_layer_activation"] is None, "Final layer activation for discriminator should be None" # check
        self.discriminator = self.create_NN(self.state_size, self.num_skills, key_to_use="DISCRIMINATOR")
        self.discriminator_optimizer = optim.Adam(self.discriminator.parameters(),
                                              lr=self.hyperparameters["DISCRIMINATOR"]["learning_rate"]) # check
        self.agent_config = copy.deepcopy(config)
        self.agent_config.environment = DIAYN_Skill_Wrapper(copy.deepcopy(self.environment), self.num_skills, self)
        self.agent_config.hyperparameters = self.agent_config.hyperparameters["AGENT"]
        self.agent_config.hyperparameters["do_evaluation_iterations"] = False # check

        if 'Discrete' in str(config.environment[0].action_space):
            self.agent = SAC_Discrete(self.agent_config)
        else:
            self.agent = SAC(self.agent_config)  #We have to use SAC because it involves maximising the policy's entropy over actions which is also a part of DIAYN

        self.timesteps_to_give_up_control_for = self.hyperparameters["MANAGER"]["timesteps_to_give_up_control_for"] # check
        self.manager_agent_config = copy.deepcopy(config)
        self.manager_agent_config.environment = DIAYN_Manager_Agent_Wrapper(copy.deepcopy(self.environment), self.agent,
                                                                            self.timesteps_to_give_up_control_for, self.num_skills)
        self.manager_agent_config.hyperparameters = self.manager_agent_config.hyperparameters["MANAGER"]
        self.manager_agent = DDQN(self.manager_agent_config)
    def __init__(self, config):
        DDQN.__init__(self, config)
        self.memory = Prioritised_Replay_Buffer(self.hyperparameters,
                                                config.seed)

        if config.resume:
            self.load_resume(config.resume_path)
    def __init__(self, config, agent_name_=agent_name):
        DDQN.__init__(self, config, agent_name_=agent_name_)
        self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
        self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
        Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)

        self.wandb_watch(self.q_network_local, log_freq=self.config.wandb_model_log_freq)
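The extra output unit (output_dim=self.action_size + 1) suggests a duelling-style head, with one column per action for the advantages plus one column for the state value. Below is a minimal sketch of how such an output is typically folded into Q-values; the column layout and the function name are assumptions, not taken from this snippet.

import torch

def duelling_q_values(network_output: torch.Tensor) -> torch.Tensor:
    # Assumes columns 0..action_size-1 hold the advantages A(s, a) and the final
    # column holds the state value V(s), matching output_dim = action_size + 1.
    advantages = network_output[:, :-1]
    value = network_output[:, -1].unsqueeze(1)
    # Standard duelling aggregation: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
    return value + advantages - advantages.mean(dim=1, keepdim=True)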
Example no. 4
    def __init__(self, config):
        DDQN.__init__(self, config)

        model_path = self.config.model_path if self.config.model_path else 'Models'
        self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
        self.q_network_local_path = os.path.join(model_path, "{}_q_network_local.pt".format(self.agent_name))

        if self.config.load_model: self.locally_load_policy()
        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
        self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
        Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
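locally_load_policy is called above but not shown. A minimal sketch of what it might do, assuming the weights are written to self.q_network_local_path with torch.save by a matching save method; both method bodies below are assumptions and would live on the same agent class.

import os
import torch

def locally_save_policy(self):
    # Hypothetical counterpart: persist the local Q-network weights to the path built in __init__
    torch.save(self.q_network_local.state_dict(), self.q_network_local_path)

def locally_load_policy(self):
    # Restore the local Q-network weights if a checkpoint exists at the expected path
    if os.path.isfile(self.q_network_local_path):
        self.q_network_local.load_state_dict(torch.load(self.q_network_local_path))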
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.controller_config = copy.deepcopy(config)
        self.controller_config.hyperparameters = self.controller_config.hyperparameters[
            "CONTROLLER"]
        self.controller = DDQN(self.controller_config)
        self.controller.q_network_local = self.create_NN(
            input_dim=self.state_size * 2,
            output_dim=self.action_size,
            key_to_use="CONTROLLER")
        self.controller.q_network_target = self.create_NN(
            input_dim=self.state_size * 2,
            output_dim=self.action_size,
            key_to_use="CONTROLLER")

        self.meta_controller_config = copy.deepcopy(config)
        self.meta_controller_config.hyperparameters = self.meta_controller_config.hyperparameters[
            "META_CONTROLLER"]

        # self.meta_controller = DDQN(self.meta_controller_config)
        # self.meta_controller.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=config.environment.observation_space.n,
        #                                                       key_to_use="META_CONTROLLER")
        # self.meta_controller.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=config.environment.observation_space.n,
        #                                                       key_to_use="META_CONTROLLER")

        self.list_meta_controller = [
            DDQN(self.meta_controller_config) for _ in range(5)
        ]
        self.lq_network_local = []
        self.lq_network_target = []
        for m in self.list_meta_controller:
            m.q_network_local = self.create_NN(
                input_dim=self.state_size,
                output_dim=config.environment.observation_space.n,
                key_to_use="META_CONTROLLER")
            self.lq_network_local.append(m.q_network_local)
            m.q_network_target = self.create_NN(
                input_dim=self.state_size,
                output_dim=config.environment.observation_space.n,
                key_to_use="META_CONTROLLER")
            self.lq_network_target.append(m.q_network_target)

        self.rolling_intrinsic_rewards = []
        self.goals_seen = []
        self.controller_learnt_enough = False
        self.controller_actions = []
Example no. 6
 def create_skill_training_agent(self):
     """Creates and instantiates a pre-training environment for the agent to learn skills in and then instantiates
     and agent to learn in this environment"""
     self.skill_agent_config.environment = Skill_Wrapper(copy.deepcopy(self.environment), self.environment.observation_space.n,
                                                         self.num_skills,
                                                         self.skill_agent_config.hyperparameters[
                                                             "regularisation_weight"], self.skill_agent_config.hyperparameters["visitations_decay"])
     return DDQN(self.skill_agent_config)
Example no. 7
class DIAYN(Base_Agent):
    """Hierarchical RL agent based on the paper Diversity is all you need (2018) - https://arxiv.org/pdf/1802.06070.pdf.
    Works in two stages:
        1) First it trains an agent that tries to reach different states depending on which skill number it is
           given as input
        2) Then it trains an agent to maximise reward using its choice of skill for the lower level agent"""
    agent_name = "DIAYN"
    def __init__(self, config):
        super().__init__(config)
        self.training_mode = True
        self.num_skills = config.hyperparameters["num_skills"] # check
        self.unsupervised_episodes = config.hyperparameters["num_unsupservised_episodes"] # check
        self.supervised_episodes = config.num_episodes_to_run - self.unsupervised_episodes

        assert self.hyperparameters["DISCRIMINATOR"]["final_layer_activation"] is None, "Final layer activation for discriminator should be None" # check
        self.discriminator = self.create_NN(self.state_size, self.num_skills, key_to_use="DISCRIMINATOR")
        self.discriminator_optimizer = optim.Adam(self.discriminator.parameters(),
                                              lr=self.hyperparameters["DISCRIMINATOR"]["learning_rate"]) # check
        self.agent_config = copy.deepcopy(config)
        self.agent_config.environment = DIAYN_Skill_Wrapper(copy.deepcopy(self.environment), self.num_skills, self)
        self.agent_config.hyperparameters = self.agent_config.hyperparameters["AGENT"]
        self.agent_config.hyperparameters["do_evaluation_iterations"] = False # check

        if 'Discrete' in str(config.environment[0].action_space):
            self.agent = SAC_Discrete(self.agent_config)
        else:
            self.agent = SAC(self.agent_config)  #We have to use SAC because it involves maximising the policy's entropy over actions which is also a part of DIAYN

        self.timesteps_to_give_up_control_for = self.hyperparameters["MANAGER"]["timesteps_to_give_up_control_for"] # check
        self.manager_agent_config = copy.deepcopy(config)
        self.manager_agent_config.environment = DIAYN_Manager_Agent_Wrapper(copy.deepcopy(self.environment), self.agent,
                                                                            self.timesteps_to_give_up_control_for, self.num_skills)
        self.manager_agent_config.hyperparameters = self.manager_agent_config.hyperparameters["MANAGER"]
        self.manager_agent = DDQN(self.manager_agent_config)

    def run_n_episodes(self, num_episodes=None, show_whether_achieved_goal=True, save_and_print_results=True):
        start = time.time()
        self.agent.run_n_episodes(num_episodes=self.unsupervised_episodes, show_whether_achieved_goal=False)
        game_full_episode_scores, rolling_results, _ = self.manager_agent.run_n_episodes(num_episodes=self.supervised_episodes)
        time_taken = time.time() - start
        pretraining_results = [np.min(self.agent.game_full_episode_scores)]*self.unsupervised_episodes
        return pretraining_results + game_full_episode_scores, pretraining_results + rolling_results, time_taken

    def disciminator_learn(self, skill, discriminator_outputs):
        if not self.training_mode: return
        assert isinstance(skill, int)
        assert discriminator_outputs.shape[0] == 1
        assert discriminator_outputs.shape[1] == self.num_skills
        loss = nn.CrossEntropyLoss()(discriminator_outputs, torch.Tensor([skill]).to(self.device).long())
        self.take_optimisation_step(self.discriminator_optimizer, self.discriminator, loss,
                                    self.hyperparameters["DISCRIMINATOR"]["gradient_clipping_norm"])

    def get_predicted_probability_of_skill(self, skill, next_state):
        """Gets the probability that the disciminator gives to the correct skill"""
        predicted_probabilities_unnormalised = self.discriminator(torch.Tensor(next_state).to("cuda:0").unsqueeze(0))
        probability_of_correct_skill = F.softmax(predicted_probabilities_unnormalised)[:, skill]
        return  probability_of_correct_skill.item(), predicted_probabilities_unnormalised
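DIAYN_Skill_Wrapper is constructed in __init__ but not shown here. A minimal sketch of what such a wrapper could look like, assuming a gym.Wrapper that samples a skill per episode, appends it to the observation, trains the discriminator via disciminator_learn, and replaces the environment reward with the DIAYN pseudo-reward log q(z|s') - log p(z); apart from the two methods defined above, every name and detail below is an assumption.

import random
import numpy as np
import gym

class DIAYN_Skill_Wrapper(gym.Wrapper):
    """Sketch: samples a skill each episode, appends it to the observation and
    rewards the agent with log q(skill | next_state) - log p(skill)"""
    def __init__(self, env, num_skills, meta_agent):
        super().__init__(env)
        self.num_skills = num_skills
        self.meta_agent = meta_agent  # the DIAYN agent that owns the discriminator
        self.prior_probability_of_skill = 1.0 / num_skills  # p(z) is uniform
        self.skill = 0

    def reset(self, **kwargs):
        observation = self.env.reset(**kwargs)
        self.skill = random.randint(0, self.num_skills - 1)
        return self.observation(observation)

    def observation(self, observation):
        return np.concatenate((np.array(observation).flatten(), np.array([self.skill])))

    def step(self, action):
        next_state, _, done, info = self.env.step(action)  # environment reward is discarded during pre-training
        new_reward, discriminator_outputs = self.calculate_new_reward(next_state)
        self.meta_agent.disciminator_learn(self.skill, discriminator_outputs)
        return self.observation(next_state), new_reward, done, info

    def calculate_new_reward(self, next_state):
        probability_correct_skill, discriminator_outputs = \
            self.meta_agent.get_predicted_probability_of_skill(self.skill, next_state)
        # DIAYN pseudo-reward: log q(z | s') - log p(z)
        new_reward = np.log(probability_correct_skill + 1e-8) - np.log(self.prior_probability_of_skill)
        return new_reward, discriminator_outputs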
Example no. 8
 def __init__(self, config):
     Base_Agent.__init__(self, config)
     self.controller_config = copy.deepcopy(config)
     self.controller_config.hyperparameters = self.controller_config.hyperparameters[
         "CONTROLLER"]
     self.controller = DDQN(self.controller_config)
     self.controller.q_network_local = self.create_NN(
         input_dim=self.state_size * 2,
         output_dim=self.action_size,
         key_to_use="CONTROLLER")
     self.meta_controller_config = copy.deepcopy(config)
     self.meta_controller_config.hyperparameters = self.meta_controller_config.hyperparameters[
         "META_CONTROLLER"]
     self.meta_controller = DDQN(self.meta_controller_config)
     self.meta_controller.q_network_local = self.create_NN(
         input_dim=self.state_size,
         output_dim=config.environment.observation_space.n,
         key_to_use="META_CONTROLLER")
     self.rolling_intrinsic_rewards = []
     self.goals_seen = []
     self.controller_learnt_enough = False
     self.controller_actions = []
Example no. 9
class h_DQN(Base_Agent):
    """Implements hierarchical RL agent h-DQN from paper Kulkarni et al. (2016) https://arxiv.org/abs/1604.06057?context=stat
    Note also that this algorithm only works when we have discrete states and discrete actions currently because otherwise
    it is not clear what it means to achieve a subgoal state designated by the meta-controller"""
    agent_name = "h-DQN"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.controller_config = copy.deepcopy(config)
        self.controller_config.hyperparameters = self.controller_config.hyperparameters[
            "CONTROLLER"]
        self.controller = DDQN(self.controller_config)
        self.controller.q_network_local = self.create_NN(
            input_dim=self.state_size * 2,
            output_dim=self.action_size,
            key_to_use="CONTROLLER")
        self.meta_controller_config = copy.deepcopy(config)
        self.meta_controller_config.hyperparameters = self.meta_controller_config.hyperparameters[
            "META_CONTROLLER"]
        self.meta_controller = DDQN(self.meta_controller_config)
        self.meta_controller.q_network_local = self.create_NN(
            input_dim=self.state_size,
            output_dim=config.environment.observation_space.n,
            key_to_use="META_CONTROLLER")
        self.rolling_intrinsic_rewards = []
        self.goals_seen = []
        self.controller_learnt_enough = False
        self.controller_actions = []

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        self.state = self.environment.reset()
        self.next_state = None
        self.action = None
        self.reward = None
        self.done = False
        self.cumulative_meta_controller_reward = 0
        self.episode_over = False
        self.subgoal_achieved = False
        self.total_episode_score_so_far = 0
        self.meta_controller_steps = 0
        self.update_learning_rate(
            self.controller_config.hyperparameters["learning_rate"],
            self.controller.q_network_optimizer)
        self.update_learning_rate(
            self.meta_controller_config.hyperparameters["learning_rate"],
            self.meta_controller.q_network_optimizer)

    def step(self):

        self.episode_steps = 0
        episode_intrinsic_rewards = []  # accumulated over the whole episode so the rolling sum below covers every subgoal

        while not self.episode_over:
            self.meta_controller_state = self.environment.state
            self.subgoal = self.meta_controller.pick_action(
                state=self.meta_controller_state)
            self.goals_seen.append(self.subgoal)
            self.subgoal_achieved = False
            self.state = np.concatenate(
                (self.environment.state, np.array([self.subgoal])))
            self.cumulative_meta_controller_reward = 0

            while not (self.episode_over or self.subgoal_achieved):
                self.pick_and_conduct_controller_action()
                self.update_data()
                if self.time_to_learn(
                        self.controller.memory, self.global_step_number,
                        "CONTROLLER"):  #means it is time to train controller
                    for _ in range(self.hyperparameters["CONTROLLER"]
                                   ["learning_iterations"]):
                        self.controller.learn()
                self.save_experience(memory=self.controller.memory,
                                     experience=(self.state, self.action,
                                                 self.reward, self.next_state,
                                                 self.done))
                self.state = self.next_state  #this is to set the state for the next iteration
                self.global_step_number += 1
                episode_intrinsic_rewards.append(self.reward)

            if self.time_to_learn(self.meta_controller.memory,
                                  self.meta_controller_steps,
                                  "META_CONTROLLER"):
                for _ in range(self.hyperparameters["META_CONTROLLER"]
                               ["learning_iterations"]):
                    self.meta_controller.learn()

            self.save_experience(
                memory=self.meta_controller.memory,
                experience=(self.meta_controller_state, self.subgoal,
                            self.cumulative_meta_controller_reward,
                            self.meta_controller_next_state,
                            self.episode_over))
            self.meta_controller_steps += 1
            self.episode_steps += 1

        self.rolling_intrinsic_rewards.append(
            np.sum(episode_intrinsic_rewards))
        if self.episode_number % 100 == 0:
            print(" ")
            print("Most common goal -- {} -- ".format(
                max(set(self.goals_seen[-100:]),
                    key=self.goals_seen[-100:].count)))
            print("Intrinsic Rewards -- {} -- ".format(
                np.mean(self.rolling_intrinsic_rewards[-100:])))
            print("Average controller action -- {} ".format(
                np.mean(self.controller_actions[-100:])))
            print("Latest subgoal -- {}".format(self.goals_seen[-1]))
        self.episode_number += 1
        self.controller.episode_number += 1
        self.meta_controller.episode_number += 1

    def pick_and_conduct_controller_action(self):
        """Picks and conducts an action for controller"""
        self.action = self.controller.pick_action(state=self.state)
        self.controller_actions.append(self.action)
        self.conduct_action()

    def update_data(self):
        """Updates stored data for controller and meta-controller. It must occur in the order shown"""
        self.episode_over = self.environment.get_done()
        self.update_controller_data()
        self.update_meta_controller_data()

    def update_controller_data(self):
        """Gets the next state, reward and done information from the environment"""
        environment_next_state = self.environment.get_next_state()
        assert environment_next_state.shape[0] == 1
        self.next_state = np.concatenate(
            (environment_next_state, np.array([self.subgoal])))
        self.subgoal_achieved = environment_next_state[0] == self.subgoal
        self.reward = 1.0 * self.subgoal_achieved
        self.done = self.subgoal_achieved or self.episode_over

    def update_meta_controller_data(self):
        """Updates data relating to meta controller"""
        self.cumulative_meta_controller_reward += self.environment.get_reward()
        self.total_episode_score_so_far += self.environment.get_reward()
        if self.done:
            self.meta_controller_next_state = self.environment.get_next_state()

    def time_to_learn(self, memory, steps_taken, controller_name):
        """Boolean indicating whether it is time for meta-controller or controller to learn"""
        enough_experiences = len(
            memory) > self.hyperparameters[controller_name]["batch_size"]
        enough_steps_taken = steps_taken % self.hyperparameters[
            controller_name]["update_every_n_steps"] == 0
        return enough_experiences and enough_steps_taken
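The h-DQN code above reads a nested hyperparameters dictionary with "CONTROLLER" and "META_CONTROLLER" sub-dictionaries. A minimal sketch of the keys it actually uses; the concrete values are placeholders, not taken from the source, and each sub-dictionary would additionally need whatever DDQN itself expects (buffer size, epsilon schedule, network shape, ...).

hyperparameters = {
    "CONTROLLER": {
        "batch_size": 256,          # time_to_learn: enough experiences gathered?
        "update_every_n_steps": 1,  # time_to_learn: learn on every n-th step
        "learning_iterations": 1,   # gradient steps per learning call in step()
        "learning_rate": 0.01,      # reset on the optimizer in reset_game()
    },
    "META_CONTROLLER": {
        "batch_size": 256,
        "update_every_n_steps": 1,
        "learning_iterations": 1,
        "learning_rate": 0.001,
    },
}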
Example no. 10
 def __init__(self, config, agent_name_=agent_name):
     DDQN.__init__(self, config, agent_name_=agent_name_)
     self.memory = Prioritised_Replay_Buffer(self.hyperparameters,
                                             config.seed)
Example no. 11
 def __init__(self, config):
     DDQN.__init__(self, config)
     self.memory = Prioritised_Replay_Buffer(self.hyperparameters,
                                             config.seed, config.use_GPU)
Example no. 12
 def create_manager_agent(self, skill_agent):
     """Instantiates a manager agent"""
     self.manager_config.environment = Manager_Frozen_Worker_Wrapper(
         copy.deepcopy(self.environment), self.num_skills,
         self.timesteps_before_changing_skill, skill_agent)
     return DDQN(self.manager_config)
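Manager_Frozen_Worker_Wrapper is only constructed here. A minimal sketch of how such a wrapper could behave, assuming each manager action selects a skill and a frozen (non-learning) skill agent then acts in the underlying environment for timesteps_before_changing_skill steps; the pick_action call, the state layout and all other details are assumptions.

import numpy as np
import gym
import torch

class Manager_Frozen_Worker_Wrapper(gym.Wrapper):
    """Sketch: the manager picks a skill, the frozen worker executes it for a fixed
    number of timesteps, and the summed environment reward goes back to the manager"""
    def __init__(self, env, num_skills, timesteps_before_changing_skill, skill_agent):
        super().__init__(env)
        self.num_skills = num_skills
        self.timesteps_before_changing_skill = timesteps_before_changing_skill
        self.skill_agent = skill_agent
        self.action_space = gym.spaces.Discrete(num_skills)  # the manager chooses among skills
        self.current_state = None

    def reset(self, **kwargs):
        self.current_state = self.env.reset(**kwargs)
        return self.current_state

    def step(self, skill):
        cumulative_reward, done, info = 0.0, False, {}
        for _ in range(self.timesteps_before_changing_skill):
            # The worker sees the environment state with the chosen skill appended
            worker_state = np.concatenate((np.array(self.current_state).flatten(), np.array([skill])))
            with torch.no_grad():  # worker is frozen, no learning or gradients here
                worker_action = self.skill_agent.pick_action(state=worker_state)
            self.current_state, reward, done, info = self.env.step(worker_action)
            cumulative_reward += reward
            if done: break
        return self.current_state, cumulative_reward, done, info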