def __init__(self, config):
    DDQN.__init__(self, config)
    self.memory = Prioritised_Replay_Buffer(self.hyperparameters, config.seed)
    if config.resume:
        self.load_resume(config.resume_path)

def __init__(self, config, agent_name_=agent_name):
    DDQN.__init__(self, config, agent_name_=agent_name_)
    self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"], eps=1e-4)
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    self.wandb_watch(self.q_network_local, log_freq=self.config.wandb_model_log_freq)

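# The networks above use output_dim = self.action_size + 1, which is the dueling-architecture
# convention: one extra unit for the state value V(s) alongside the per-action advantages A(s, a).
# A minimal sketch of the standard aggregation Q(s, a) = V(s) + A(s, a) - mean_a A(s, a) follows;
# the method name and the assumption that the last column holds V(s) are illustrative only,
# not taken from this snippet (torch is assumed to be imported, as in the surrounding code).
def calculate_duelling_q_values(self, duelling_q_network_output):
    """Combines the value and advantage heads of the dueling network output into Q-values."""
    state_value = duelling_q_network_output[:, -1]                       # assumed: last column is V(s)
    avg_advantage = torch.mean(duelling_q_network_output[:, :-1], dim=1)  # mean advantage across actions
    return duelling_q_network_output[:, :-1] + (state_value - avg_advantage).unsqueeze(1)
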
def __init__(self, config):
    DDQN.__init__(self, config)
    model_path = self.config.model_path if self.config.model_path else 'Models'
    self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
    self.q_network_local_path = os.path.join(model_path, "{}_q_network_local.pt".format(self.agent_name))
    if self.config.load_model:
        self.locally_load_policy()
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"], eps=1e-4)
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)

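# locally_load_policy() is referenced above but not shown. A minimal sketch, assuming the
# checkpoint at self.q_network_local_path stores a plain state_dict saved with torch.save():
def locally_load_policy(self):
    """Loads a previously saved local Q-network from disk, if the checkpoint file exists."""
    if os.path.isfile(self.q_network_local_path):
        self.q_network_local.load_state_dict(torch.load(self.q_network_local_path))
        print("Loaded {} from {}".format(self.agent_name, self.q_network_local_path))
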
def __init__(self, config):
    Base_Agent.__init__(self, config)
    self.controller_config = copy.deepcopy(config)
    self.controller_config.hyperparameters = self.controller_config.hyperparameters["CONTROLLER"]
    self.controller = DDQN(self.controller_config)
    self.controller.q_network_local = self.create_NN(input_dim=self.state_size * 2, output_dim=self.action_size,
                                                     key_to_use="CONTROLLER")
    self.controller.q_network_target = self.create_NN(input_dim=self.state_size * 2, output_dim=self.action_size,
                                                      key_to_use="CONTROLLER")
    self.meta_controller_config = copy.deepcopy(config)
    self.meta_controller_config.hyperparameters = self.meta_controller_config.hyperparameters["META_CONTROLLER"]
    # self.meta_controller = DDQN(self.meta_controller_config)
    # self.meta_controller.q_network_local = self.create_NN(input_dim=self.state_size,
    #                                                       output_dim=config.environment.observation_space.n,
    #                                                       key_to_use="META_CONTROLLER")
    # self.meta_controller.q_network_target = self.create_NN(input_dim=self.state_size,
    #                                                        output_dim=config.environment.observation_space.n,
    #                                                        key_to_use="META_CONTROLLER")
    self.list_meta_controller = [DDQN(self.meta_controller_config) for _ in range(5)]
    self.lq_network_local = []
    self.lq_network_target = []
    for m in self.list_meta_controller:
        m.q_network_local = self.create_NN(input_dim=self.state_size,
                                           output_dim=config.environment.observation_space.n,
                                           key_to_use="META_CONTROLLER")
        self.lq_network_local.append(m.q_network_local)
        m.q_network_target = self.create_NN(input_dim=self.state_size,
                                            output_dim=config.environment.observation_space.n,
                                            key_to_use="META_CONTROLLER")
        self.lq_network_target.append(m.q_network_target)
    self.rolling_intrinsic_rewards = []
    self.goals_seen = []
    self.controller_learnt_enough = False
    self.controller_actions = []

def create_skill_training_agent(self):
    """Creates a pre-training environment for the agent to learn skills in, then instantiates an agent
    to learn in this environment"""
    self.skill_agent_config.environment = Skill_Wrapper(copy.deepcopy(self.environment),
                                                        self.environment.observation_space.n,
                                                        self.num_skills,
                                                        self.skill_agent_config.hyperparameters["regularisation_weight"],
                                                        self.skill_agent_config.hyperparameters["visitations_decay"])
    return DDQN(self.skill_agent_config)

class DIAYN(Base_Agent):
    """Hierarchical RL agent based on the paper Diversity is All You Need (2018) - https://arxiv.org/pdf/1802.06070.pdf.
    Works in two stages:
    1) First it trains an agent that tries to reach different states depending on which skill number is inputted
    2) Then it trains an agent to maximise reward using its choice of skill for the lower level agent"""
    agent_name = "DIAYN"

    def __init__(self, config):
        super().__init__(config)
        self.training_mode = True
        self.num_skills = config.hyperparameters["num_skills"]  # check
        self.unsupervised_episodes = config.hyperparameters["num_unsupservised_episodes"]  # check
        self.supervised_episodes = config.num_episodes_to_run - self.unsupervised_episodes
        assert self.hyperparameters["DISCRIMINATOR"]["final_layer_activation"] is None, \
            "Final layer activation for discriminator should be None"  # check
        self.discriminator = self.create_NN(self.state_size, self.num_skills, key_to_use="DISCRIMINATOR")
        self.discriminator_optimizer = optim.Adam(self.discriminator.parameters(),
                                                  lr=self.hyperparameters["DISCRIMINATOR"]["learning_rate"])  # check
        self.agent_config = copy.deepcopy(config)
        self.agent_config.environment = DIAYN_Skill_Wrapper(copy.deepcopy(self.environment), self.num_skills, self)
        self.agent_config.hyperparameters = self.agent_config.hyperparameters["AGENT"]
        self.agent_config.hyperparameters["do_evaluation_iterations"] = False  # check
        # We have to use SAC because it involves maximising the policy's entropy over actions,
        # which is also a part of DIAYN
        if 'Discrete' in str(config.environment[0].action_space):
            self.agent = SAC_Discrete(self.agent_config)
        else:
            self.agent = SAC(self.agent_config)
        self.timesteps_to_give_up_control_for = self.hyperparameters["MANAGER"]["timesteps_to_give_up_control_for"]  # check
        self.manager_agent_config = copy.deepcopy(config)
        self.manager_agent_config.environment = DIAYN_Manager_Agent_Wrapper(copy.deepcopy(self.environment), self.agent,
                                                                            self.timesteps_to_give_up_control_for,
                                                                            self.num_skills)
        self.manager_agent_config.hyperparameters = self.manager_agent_config.hyperparameters["MANAGER"]
        self.manager_agent = DDQN(self.manager_agent_config)

    def run_n_episodes(self, num_episodes=None, show_whether_achieved_goal=True, save_and_print_results=True):
        start = time.time()
        self.agent.run_n_episodes(num_episodes=self.unsupervised_episodes, show_whether_achieved_goal=False)
        game_full_episode_scores, rolling_results, _ = self.manager_agent.run_n_episodes(
            num_episodes=self.supervised_episodes)
        time_taken = time.time() - start
        pretraining_results = [np.min(self.agent.game_full_episode_scores)] * self.unsupervised_episodes
        return pretraining_results + game_full_episode_scores, pretraining_results + rolling_results, time_taken

    def disciminator_learn(self, skill, discriminator_outputs):
        if not self.training_mode:
            return
        assert isinstance(skill, int)
        assert discriminator_outputs.shape[0] == 1
        assert discriminator_outputs.shape[1] == self.num_skills
        loss = nn.CrossEntropyLoss()(discriminator_outputs, torch.Tensor([skill]).to("cuda:0").long())
        self.take_optimisation_step(self.discriminator_optimizer, self.discriminator, loss,
                                    self.hyperparameters["DISCRIMINATOR"]["gradient_clipping_norm"])

    def get_predicted_probability_of_skill(self, skill, next_state):
        """Gets the probability that the discriminator gives to the correct skill"""
        predicted_probabilities_unnormalised = self.discriminator(torch.Tensor(next_state).to("cuda:0").unsqueeze(0))
        probability_of_correct_skill = F.softmax(predicted_probabilities_unnormalised, dim=-1)[:, skill]
        return probability_of_correct_skill.item(), predicted_probabilities_unnormalised

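# DIAYN.__init__ above expects a nested hyperparameter dictionary with one block per component.
# A minimal sketch of that shape follows; every concrete value, and any key not read in the code
# above (e.g. the SAC/DDQN-specific settings inside "AGENT" and "MANAGER"), is an assumption for
# illustration only, not taken from this code.
diayn_hyperparameters = {
    "num_skills": 10,                        # read directly in DIAYN.__init__
    "num_unsupservised_episodes": 300,       # key spelled exactly as the code reads it
    "DISCRIMINATOR": {
        "learning_rate": 0.001,
        "gradient_clipping_norm": 5,
        "final_layer_activation": None,      # asserted to be None in __init__
        # plus whatever create_NN(key_to_use="DISCRIMINATOR") needs, e.g. hidden layer sizes (assumed)
    },
    "AGENT": {
        # hyperparameters handed to the SAC / SAC_Discrete skill-learning agent (assumed)
    },
    "MANAGER": {
        "timesteps_to_give_up_control_for": 5,
        # plus the DDQN hyperparameters for the manager agent (assumed)
    },
}
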
class h_DQN(Base_Agent):
    """Implements hierarchical RL agent h-DQN from the paper Kulkarni et al. (2016) https://arxiv.org/abs/1604.06057?context=stat
    Note also that this algorithm currently only works with discrete states and discrete actions, because otherwise
    it is not clear what it means to achieve a subgoal state designated by the meta-controller"""
    agent_name = "h-DQN"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.controller_config = copy.deepcopy(config)
        self.controller_config.hyperparameters = self.controller_config.hyperparameters["CONTROLLER"]
        self.controller = DDQN(self.controller_config)
        self.controller.q_network_local = self.create_NN(input_dim=self.state_size * 2, output_dim=self.action_size,
                                                         key_to_use="CONTROLLER")
        self.meta_controller_config = copy.deepcopy(config)
        self.meta_controller_config.hyperparameters = self.meta_controller_config.hyperparameters["META_CONTROLLER"]
        self.meta_controller = DDQN(self.meta_controller_config)
        self.meta_controller.q_network_local = self.create_NN(input_dim=self.state_size,
                                                              output_dim=config.environment.observation_space.n,
                                                              key_to_use="META_CONTROLLER")
        self.rolling_intrinsic_rewards = []
        self.goals_seen = []
        self.controller_learnt_enough = False
        self.controller_actions = []

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        self.state = self.environment.reset()
        self.next_state = None
        self.action = None
        self.reward = None
        self.done = False
        self.cumulative_meta_controller_reward = 0
        self.episode_over = False
        self.subgoal_achieved = False
        self.total_episode_score_so_far = 0
        self.meta_controller_steps = 0
        self.update_learning_rate(self.controller_config.hyperparameters["learning_rate"],
                                  self.controller.q_network_optimizer)
        self.update_learning_rate(self.meta_controller_config.hyperparameters["learning_rate"],
                                  self.meta_controller.q_network_optimizer)

    def step(self):
        self.episode_steps = 0
        while not self.episode_over:
            episode_intrinsic_rewards = []
            self.meta_controller_state = self.environment.state
            self.subgoal = self.meta_controller.pick_action(state=self.meta_controller_state)
            self.goals_seen.append(self.subgoal)
            self.subgoal_achieved = False
            self.state = np.concatenate((self.environment.state, np.array([self.subgoal])))
            self.cumulative_meta_controller_reward = 0
            while not (self.episode_over or self.subgoal_achieved):
                self.pick_and_conduct_controller_action()
                self.update_data()
                if self.time_to_learn(self.controller.memory, self.global_step_number, "CONTROLLER"):  # means it is time to train the controller
                    for _ in range(self.hyperparameters["CONTROLLER"]["learning_iterations"]):
                        self.controller.learn()
                self.save_experience(memory=self.controller.memory,
                                     experience=(self.state, self.action, self.reward, self.next_state, self.done))
                self.state = self.next_state  # this is to set the state for the next iteration
                self.global_step_number += 1
                episode_intrinsic_rewards.append(self.reward)
            if self.time_to_learn(self.meta_controller.memory, self.meta_controller_steps, "META_CONTROLLER"):
                for _ in range(self.hyperparameters["META_CONTROLLER"]["learning_iterations"]):
                    self.meta_controller.learn()
            self.save_experience(memory=self.meta_controller.memory,
                                 experience=(self.meta_controller_state, self.subgoal,
                                             self.cumulative_meta_controller_reward,
                                             self.meta_controller_next_state, self.episode_over))
            self.meta_controller_steps += 1
            self.episode_steps += 1
        self.rolling_intrinsic_rewards.append(np.sum(episode_intrinsic_rewards))
        if self.episode_number % 100 == 0:
            print(" ")
            print("Most common goal -- {} -- ".format(max(set(self.goals_seen[-100:]), key=self.goals_seen[-100:].count)))
            print("Intrinsic Rewards -- {} -- ".format(np.mean(self.rolling_intrinsic_rewards[-100:])))
            print("Average controller action -- {} ".format(np.mean(self.controller_actions[-100:])))
            print("Latest subgoal -- {}".format(self.goals_seen[-1]))
        self.episode_number += 1
        self.controller.episode_number += 1
        self.meta_controller.episode_number += 1

    def pick_and_conduct_controller_action(self):
        """Picks and conducts an action for the controller"""
        self.action = self.controller.pick_action(state=self.state)
        self.controller_actions.append(self.action)
        self.conduct_action()

    def update_data(self):
        """Updates stored data for the controller and meta-controller. It must occur in the order shown"""
        self.episode_over = self.environment.get_done()
        self.update_controller_data()
        self.update_meta_controller_data()

    def update_controller_data(self):
        """Gets the next state, reward and done information from the environment"""
        environment_next_state = self.environment.get_next_state()
        assert environment_next_state.shape[0] == 1
        self.next_state = np.concatenate((environment_next_state, np.array([self.subgoal])))
        self.subgoal_achieved = environment_next_state[0] == self.subgoal
        self.reward = 1.0 * self.subgoal_achieved
        self.done = self.subgoal_achieved or self.episode_over

    def update_meta_controller_data(self):
        """Updates data relating to the meta-controller"""
        self.cumulative_meta_controller_reward += self.environment.get_reward()
        self.total_episode_score_so_far += self.environment.get_reward()
        if self.done:
            self.meta_controller_next_state = self.environment.get_next_state()

    def time_to_learn(self, memory, steps_taken, controller_name):
        """Boolean indicating whether it is time for the meta-controller or controller to learn"""
        enough_experiences = len(memory) > self.hyperparameters[controller_name]["batch_size"]
        enough_steps_taken = steps_taken % self.hyperparameters[controller_name]["update_every_n_steps"] == 0
        return enough_experiences and enough_steps_taken

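# h_DQN.__init__ and step() read a hyperparameter dictionary with a "CONTROLLER" and a
# "META_CONTROLLER" block. A minimal sketch of the keys actually referenced in the code above
# follows; the values, and any extra keys the underlying DDQN agents need, are assumptions for
# illustration only.
h_dqn_hyperparameters = {
    "CONTROLLER": {
        "learning_rate": 0.01,          # used by reset_game via update_learning_rate
        "batch_size": 256,              # used by time_to_learn
        "update_every_n_steps": 1,      # used by time_to_learn
        "learning_iterations": 1,       # gradient steps per learning call in step()
        # plus whatever create_NN(key_to_use="CONTROLLER") and DDQN need (assumed)
    },
    "META_CONTROLLER": {
        "learning_rate": 0.001,
        "batch_size": 256,
        "update_every_n_steps": 1,
        "learning_iterations": 1,
        # plus whatever create_NN(key_to_use="META_CONTROLLER") and DDQN need (assumed)
    },
}
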
def __init__(self, config, agent_name_=agent_name):
    DDQN.__init__(self, config, agent_name_=agent_name_)
    self.memory = Prioritised_Replay_Buffer(self.hyperparameters, config.seed)

def __init__(self, config):
    DDQN.__init__(self, config)
    self.memory = Prioritised_Replay_Buffer(self.hyperparameters, config.seed, config.use_GPU)

def create_manager_agent(self, skill_agent):
    """Instantiates a manager agent whose environment freezes the pre-trained worker (skill agent)"""
    self.manager_config.environment = Manager_Frozen_Worker_Wrapper(copy.deepcopy(self.environment), self.num_skills,
                                                                    self.timesteps_before_changing_skill, skill_agent)
    return DDQN(self.manager_config)
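
# A minimal sketch of how create_skill_training_agent and create_manager_agent might be wired
# together: pre-train the skill agent, freeze its exploration, then let the manager choose between
# its skills. The turn_off_any_epsilon_greedy_exploration() helper and the use of
# num_episodes_to_run on each config are assumptions, not shown in the snippets above.
def run_n_episodes(self):
    skill_agent = self.create_skill_training_agent()
    skill_agent.run_n_episodes(self.skill_agent_config.num_episodes_to_run)       # phase 1: learn skills
    skill_agent.turn_off_any_epsilon_greedy_exploration()                         # assumed helper on the agent
    manager_agent = self.create_manager_agent(skill_agent)
    return manager_agent.run_n_episodes(self.manager_config.num_episodes_to_run)  # phase 2: manager picks skills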