def __init__(self, config, global_action_id_to_primitive_actions, action_length_reward_bonus,
             end_of_episode_symbol="/"):
    super().__init__(config)
    self.end_of_episode_symbol = end_of_episode_symbol
    self.global_action_id_to_primitive_actions = global_action_id_to_primitive_actions
    self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                self.hyperparameters["batch_size"], config.seed)
    self.exploration_strategy = Epsilon_Greedy_Exploration(config)
    self.oracle = self.create_oracle()
    self.oracle_optimizer = optim.Adam(self.oracle.parameters(),
                                       lr=self.hyperparameters["learning_rate"])
    self.q_network_local = self.create_NN(input_dim=self.state_size + 1, output_dim=self.action_size)
    self.q_network_local.print_model_summary()
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"])
    self.q_network_target = self.create_NN(input_dim=self.state_size + 1, output_dim=self.action_size)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    self.action_length_reward_bonus = action_length_reward_bonus
    self.abandon_ship = config.hyperparameters["abandon_ship"]
def __init__(self, config):
    Base_Agent.__init__(self, config)
    self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                self.hyperparameters["batch_size"], config.seed)
    self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"])
    self.exploration_strategy = Epsilon_Greedy_Exploration(config)
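# A minimal sketch (not the repo's own compute_loss) of the one-step TD target that a DQN agent
# like the one above typically regresses q_network_local towards. The sampled batch tensors and
# the discount_rate hyperparameter are assumptions here; this agent has no target network, so the
# bootstrap uses the local network itself.
import torch
import torch.nn.functional as F

def dqn_loss_sketch(q_network_local, states, actions, rewards, next_states, dones, discount_rate):
    """Standard Q-learning loss: target = r + gamma * max_a' Q(s', a') on non-terminal steps."""
    with torch.no_grad():
        max_next_q = q_network_local(next_states).max(dim=1, keepdim=True)[0]
        q_targets = rewards + discount_rate * max_next_q * (1 - dones)
    q_expected = q_network_local(states).gather(1, actions.long())
    return F.mse_loss(q_expected, q_targets)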
def __init__(self, config):
    Base_Agent.__init__(self, config)
    config.no_render_mode = False  # must be render mode
    self.q_network_local = q_network_2_EYE(n_action=self.get_action_size())
    self.q_network_target = q_network_2_EYE(n_action=self.get_action_size())
    self.q_network_optimizer = optim.SGD(self.q_network_local.parameters(),
                                         lr=self.hyperparameters["learning_rate"],
                                         weight_decay=5e-4)
    self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                self.hyperparameters["batch_size"], config.seed)
    self.exploration_strategy = Epsilon_Greedy_Exploration(config)
    if config.backbone_pretrain:
        self.load_pretrain()
    self.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    self.q_network_local.to(self.q_network_local.device)
    self.q_network_target.to(self.q_network_target.device)
def run(self):
    """Starts the worker"""
    for ep_ix in range(self.episodes_to_run):
        with self.optimizer_lock:
            Base_Agent.copy_model_over(self.shared_model, self.local_model)
        epsilon_exploration = self.calculate_new_exploration()
        state = self.reset_game_for_worker()
        done = False
        self.episode_states = []
        self.episode_actions = []
        self.episode_rewards = []
        self.episode_log_action_probabilities = []
        self.critic_outputs = []
        while not done:
            action, action_log_prob, critic_outputs = self.pick_action_and_get_critic_values(
                self.local_model, state, epsilon_exploration)
            next_state, reward, done, _ = self.environment.step(action)
            self.episode_states.append(state)
            self.episode_actions.append(action)
            self.episode_rewards.append(reward)
            self.episode_log_action_probabilities.append(action_log_prob)
            self.critic_outputs.append(critic_outputs)
            state = next_state
        total_loss = self.calculate_total_loss()
        self.put_gradients_in_queue(total_loss)
        self.episode_number += 1
        with self.counter.get_lock():
            self.counter.value += 1
        self.results_queue.put(np.sum(self.episode_rewards))
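# The worker above stores the per-step rewards and hands them to calculate_total_loss(); a minimal
# sketch, assuming a discount_rate hyperparameter, of the discounted returns that such a loss
# usually bootstraps from. The repo's actual loss also uses the stored critic outputs and
# log-probabilities, so this is only the return calculation.
def discounted_returns_sketch(episode_rewards, discount_rate):
    """Computes G_t = r_t + gamma * G_{t+1} for every step of the episode, latest step last."""
    returns = []
    running_return = 0.0
    for reward in reversed(episode_rewards):
        running_return = reward + discount_rate * running_return
        returns.insert(0, running_return)
    return returns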
def __init__(self, config):
    Base_Agent.__init__(self, config)
    self.policy_output_size = self.calculate_policy_output_size()
    self.policy_new = self.create_NN(input_dim=self.state_size, output_dim=self.policy_output_size)
    model_path = self.config.model_path if self.config.model_path else 'Models'
    self.policy_new_path = os.path.join(model_path, "{}_policy_new.pt".format(self.agent_name))
    if self.config.load_model:
        self.locally_load_policy()
    self.policy_old = self.create_NN(input_dim=self.state_size, output_dim=self.policy_output_size)
    self.policy_old.load_state_dict(copy.deepcopy(self.policy_new.state_dict()))
    self.policy_new_optimizer = optim.Adam(self.policy_new.parameters(),
                                           lr=self.hyperparameters["learning_rate"], eps=1e-4)
    self.episode_number = 0
    self.many_episode_states = []
    self.many_episode_actions = []
    self.many_episode_rewards = []
    self.experience_generator = Parallel_Experience_Generator(self.environment, self.policy_new,
                                                              self.config.seed, self.hyperparameters,
                                                              self.action_size)
    self.exploration_strategy = Epsilon_Greedy_Exploration(self.config)
def __init__(self, config, agent_name_=agent_name):
    DDQN.__init__(self, config, agent_name_=agent_name_)
    self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"], eps=1e-4)
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    self.wandb_watch(self.q_network_local, log_freq=self.config.wandb_model_log_freq)
def __init__(self, config):
    Base_Agent.__init__(self, config)
    self.policy = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    self.optimizer = optim.Adam(self.policy.parameters(), lr=self.hyperparameters["learning_rate"])
    self.episode_rewards = []
    self.episode_log_probabilities = []
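# This agent accumulates episode_rewards and episode_log_probabilities; a minimal sketch of the
# REINFORCE objective those two lists feed, assuming discounted returns computed as in the sketch
# above and log-probabilities stored as torch scalars. The repo's own loss may differ in detail
# (e.g. normalisation or a baseline).
import torch

def reinforce_loss_sketch(episode_log_probabilities, episode_returns):
    """Policy-gradient loss: minimise -sum_t log pi(a_t | s_t) * G_t."""
    returns = torch.tensor(episode_returns, dtype=torch.float32)
    log_probs = torch.stack(episode_log_probabilities)
    return -(log_probs * returns).sum()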
def __init__(self, config):
    DQN.__init__(self, config)
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    if config.resume:
        self.load_resume(config.resume_path)
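# DDQN keeps q_network_target in sync with q_network_local during learning; a minimal sketch of
# the usual Polyak (soft) update, assuming a tau hyperparameter. The repo's Base_Agent provides
# its own update helper, so this function is only illustrative.
def soft_update_sketch(local_model, target_model, tau):
    """target <- tau * local + (1 - tau) * target, parameter by parameter."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)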
def __init__(self, config):
    DDQN.__init__(self, config)
    model_path = self.config.model_path if self.config.model_path else 'Models'
    self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
    self.q_network_local_path = os.path.join(model_path, "{}_q_network_local.pt".format(self.agent_name))
    if self.config.load_model:
        self.locally_load_policy()
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"], eps=1e-4)
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
def __init__(self, config):
    DDQN.__init__(self, config)
    self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"])
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
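# The q_network_local above has action_size + 1 outputs; in a dueling head the extra unit is read
# as the state value V(s) and the remaining units as advantages A(s, a). A minimal sketch of the
# usual aggregation, assuming the last output is V(s); the repo's own forward pass may arrange
# this differently.
import torch

def dueling_q_values_sketch(network_output, action_size):
    """Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)."""
    advantages = network_output[:, :action_size]
    state_value = network_output[:, action_size].unsqueeze(1)
    return state_value + advantages - advantages.mean(dim=1, keepdim=True)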
def __init__(self, config): Base_Agent.__init__(self, config) self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed) self.q_network_local = Policy(self.state_size, self.action_size).to("cuda") self.q_network_optimizer = optim.Adam( self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4) self.exploration_strategy = Epsilon_Greedy_Exploration(config)
def __init__(self, config):
    Base_Agent.__init__(self, config)
    self.policy_output_size = self.calculate_policy_output_size()
    self.policy_new = self.create_NN(input_dim=self.state_size, output_dim=self.policy_output_size)
    self.policy_old = self.create_NN(input_dim=self.state_size, output_dim=self.policy_output_size)
    self.policy_old.load_state_dict(copy.deepcopy(self.policy_new.state_dict()))
    self.policy_new_optimizer = optim.Adam(self.policy_new.parameters(),
                                           lr=self.hyperparameters["learning_rate"])
    self.episode_number = 0
    self.many_episode_states = []
    self.many_episode_actions = []
    self.many_episode_rewards = []
    self.experience_generator = Parallel_Experience_Generator(self.environment, self.policy_new,
                                                              self.config.seed, self.hyperparameters,
                                                              self.action_size)
    self.exploration_strategy = Epsilon_Greedy_Exploration(self.config)
def __init__(self, config):
    DDPG.__init__(self, config)
    self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                         key_to_use="Critic", override_seed=self.config.seed + 1)
    self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                          key_to_use="Critic")
    Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
    self.critic_optimizer_2 = optim.Adam(self.critic_local_2.parameters(),
                                         lr=self.hyperparameters["Critic"]["learning_rate"])
    self.exploration_strategy_critic = Gaussian_Exploration(self.config)
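# The second critic added above is what enables the clipped double-Q target used by TD3; a minimal
# sketch of that target, assuming the target critics, a smoothed target action, and a
# discount_rate are already available. Names here are illustrative rather than taken from the repo.
import torch

def td3_critic_target_sketch(critic_target_1, critic_target_2, next_states,
                             smoothed_next_actions, rewards, dones, discount_rate):
    """Target: r + gamma * min(Q1'(s', a'), Q2'(s', a')) on non-terminal steps."""
    with torch.no_grad():
        critic_input = torch.cat((next_states, smoothed_next_actions), dim=1)
        min_next_q = torch.min(critic_target_1(critic_input), critic_target_2(critic_input))
        return rewards + discount_rate * min_next_q * (1 - dones)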
def __init__(self, config, agent_name_=agent_name): Base_Agent.__init__(self, config, agent_name=agent_name_) self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed, self.device) self.q_network_local = self.create_NN( input_dim=self.state_size, output_dim=self.action_size) # TODO: Change NN self.q_network_optimizer = optim.Adam( self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4) self.exploration_strategy = Epsilon_Greedy_Exploration(config) self.wandb_watch(self.q_network_local, log_freq=self.config.wandb_model_log_freq)
def __init__(self, config): Base_Agent.__init__(self, config) self.controller_config = copy.deepcopy(config) self.controller_config.hyperparameters = self.controller_config.hyperparameters[ "CONTROLLER"] self.controller = DDQN(self.controller_config) self.controller.q_network_local = self.create_NN( input_dim=self.state_size * 2, output_dim=self.action_size, key_to_use="CONTROLLER") self.controller.q_network_target = self.create_NN( input_dim=self.state_size * 2, output_dim=self.action_size, key_to_use="CONTROLLER") self.meta_controller_config = copy.deepcopy(config) self.meta_controller_config.hyperparameters = self.meta_controller_config.hyperparameters[ "META_CONTROLLER"] # self.meta_controller = DDQN(self.meta_controller_config) # self.meta_controller.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=config.environment.observation_space.n, # key_to_use="META_CONTROLLER") # self.meta_controller.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=config.environment.observation_space.n, # key_to_use="META_CONTROLLER") self.list_meta_controller = [ DDQN(self.meta_controller_config) for _ in range(5) ] self.lq_network_local = [] self.lq_network_target = [] for m in self.list_meta_controller: m.q_network_local = self.create_NN( input_dim=self.state_size, output_dim=config.environment.observation_space.n, key_to_use="META_CONTROLLER") self.lq_network_local.append(m.q_network_local) m.q_network_target = self.create_NN( input_dim=self.state_size, output_dim=config.environment.observation_space.n, key_to_use="META_CONTROLLER") self.lq_network_target.append(m.q_network_target) self.rolling_intrinsic_rewards = [] self.goals_seen = [] self.controller_learnt_enough = False self.controller_actions = []
def __init__(self, config):
    Base_Agent.__init__(self, config)
    model_path = self.config.model_path if self.config.model_path else 'Models'
    self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                self.hyperparameters["batch_size"], config.seed)
    self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    self.q_network_local_path = os.path.join(model_path, "{}_q_network_local.pt".format(self.agent_name))
    if self.config.load_model:
        self.locally_load_policy()
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"], eps=1e-4)
    self.exploration_strategy = Epsilon_Greedy_Exploration(config)
def append_to_final_layers(self, num_new_actions):
    """Appends new output heads to the network so it can choose from the new actions, without
    changing the weights for the existing actions"""
    print("Appending options to final layer")
    assert num_new_actions > 0
    self.q_network_local.output_layers.append(
        nn.Linear(in_features=self.q_network_local.output_layers[0].in_features,
                  out_features=num_new_actions))
    self.q_network_target.output_layers.append(
        nn.Linear(in_features=self.q_network_local.output_layers[0].in_features,
                  out_features=num_new_actions))
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"])
def __init__(self, config):
    Base_Agent.__init__(self, config)
    assert isinstance(self.environment.reset(), int) or isinstance(self.environment.reset(), np.int64) \
        or self.environment.reset().dtype == np.int64, "only works for discrete states currently"
    self.num_skills = self.hyperparameters["SKILL_AGENT"]["num_skills"]
    self.episodes_for_pretraining = self.hyperparameters["SKILL_AGENT"]["episodes_for_pretraining"]
    self.timesteps_before_changing_skill = self.hyperparameters["MANAGER"]["timesteps_before_changing_skill"]
    self.skill_agent_config = copy.deepcopy(config)
    self.skill_agent_config.hyperparameters = self.skill_agent_config.hyperparameters["SKILL_AGENT"]
    self.skill_agent_config.num_episodes_to_run = self.episodes_for_pretraining
    self.manager_config = copy.deepcopy(config)
    self.manager_config.hyperparameters = self.manager_config.hyperparameters["MANAGER"]
    self.manager_config.num_episodes_to_run = self.config.num_episodes_to_run - self.skill_agent_config.num_episodes_to_run
def __init__(self, config, global_action_id_to_primitive_action, end_of_episode_symbol="/"):
    super().__init__(config)
    self.state_size += 1
    self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"])
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    self.min_episode_score_seen = float("inf")
    self.end_of_episode_symbol = end_of_episode_symbol
    self.global_action_id_to_primitive_action = global_action_id_to_primitive_action
    self.action_id_to_stepping_stone_action_id = {}
    self.calculate_q_values_as_increments = self.config.hyperparameters["calculate_q_values_as_increments"]
    self.abandon_ship = self.config.hyperparameters["abandon_ship"]
    self.pre_training_learning_iterations_multiplier = self.hyperparameters["pre_training_learning_iterations_multiplier"]
    self.copy_over_hidden_layers = self.hyperparameters["copy_over_hidden_layers"]
    self.action_balanced_replay_buffer = self.hyperparameters["action_balanced_replay_buffer"]
    self.original_primitive_actions = list(range(self.action_size))
    self.memory_shaper = Memory_Shaper(self.hyperparameters["buffer_size"],
                                       self.hyperparameters["batch_size"], config.seed,
                                       self.update_reward_to_encourage_longer_macro_actions,
                                       self.action_balanced_replay_buffer)
    self.action_length_reward_bonus = self.hyperparameters["action_length_reward_bonus"]
    self.only_train_new_actions = self.hyperparameters["only_train_new_actions"]
    self.only_train_final_layer = self.hyperparameters["only_train_final_layer"]
def change_final_layer_q_network(self, copy_over_hidden_layers):
    """Completely replaces the final layer of the Q-network to accommodate the new action space"""
    print("Completely changing final layer")
    assert len(self.q_network_local.output_layers) == 1
    if copy_over_hidden_layers:
        self.q_network_local.output_layers[0] = nn.Linear(
            in_features=self.q_network_local.output_layers[0].in_features,
            out_features=self.action_size)
        self.q_network_target.output_layers[0] = nn.Linear(
            in_features=self.q_network_target.output_layers[0].in_features,
            out_features=self.action_size)
    else:
        self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
        self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"])
def __init__(self, config): Base_Agent.__init__(self, config) assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions" assert self.config.hyperparameters["Actor"]["final_layer_activation"] == "Softmax", "Final actor layer must be softmax" self.hyperparameters = config.hyperparameters self.critic_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic") self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic", override_seed=self.config.seed + 1) self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(), lr=self.hyperparameters["Critic"]["learning_rate"]) self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(), lr=self.hyperparameters["Critic"]["learning_rate"]) self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic") self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic") Base_Agent.copy_model_over(self.critic_local, self.critic_target) Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2) self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], self.config.seed) self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), lr=self.hyperparameters["Actor"]["learning_rate"]) self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"] if self.automatic_entropy_tuning: self.target_entropy = -torch.prod(torch.Tensor(self.environment.action_space.shape).to(self.device)).item() # heuristic value from the paper self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha = self.log_alpha.exp() self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"]) else: self.alpha = self.hyperparameters["entropy_term_weight"] assert not self.hyperparameters["add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment" self.add_extra_noise = False self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]
def __init__(self, config): Base_Agent.__init__(self, config) self.hyperparameters = config.hyperparameters self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") Base_Agent.copy_model_over(self.critic_local, self.critic_target) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) self.memory = Replay_Buffer( self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], self.config.seed) self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") Base_Agent.copy_model_over(self.actor_local, self.actor_target) self.actor_optimizer = optim.Adam( self.actor_local.parameters(), lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) self.exploration_strategy = OU_Noise_Exploration(self.config)
def __init__(self, config): Base_Agent.__init__(self, config) self.controller_config = copy.deepcopy(config) self.controller_config.hyperparameters = self.controller_config.hyperparameters[ "CONTROLLER"] self.controller = DDQN(self.controller_config) self.controller.q_network_local = self.create_NN( input_dim=self.state_size * 2, output_dim=self.action_size, key_to_use="CONTROLLER") self.meta_controller_config = copy.deepcopy(config) self.meta_controller_config.hyperparameters = self.meta_controller_config.hyperparameters[ "META_CONTROLLER"] self.meta_controller = DDQN(self.meta_controller_config) self.meta_controller.q_network_local = self.create_NN( input_dim=self.state_size, output_dim=config.environment.observation_space.n, key_to_use="META_CONTROLLER") self.rolling_intrinsic_rewards = [] self.goals_seen = [] self.controller_learnt_enough = False self.controller_actions = []
def __init__(self, config, agent_name_=agent_name): Base_Agent.__init__(self, config, agent_name=agent_name_) self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed, self.device) # If model is not provided, create one. TODO Add this mechanism to all agents. if not "model" in self.hyperparameters or self.hyperparameters[ "model"] is None: self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size) else: self.q_network_local = self.hyperparameters["model"] self.wandb_watch(self.q_network_local, log_freq=self.config.wandb_model_log_freq) self.q_network_optimizer = optim.Adam( self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4) self.exploration_strategy = Epsilon_Greedy_Exploration(config)
def __init__(self, config): Base_Agent.__init__(self, config) self.hyperparameters = config.hyperparameters self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") Base_Agent.copy_model_over(self.critic_local, self.critic_target) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) self.memory = Replay_Buffer( self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], self.config.seed) self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") Base_Agent.copy_model_over(self.actor_local, self.actor_target) self.actor_optimizer = optim.Adam( self.actor_local.parameters(), lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) self.exploration_strategy = OU_Noise_Exploration(self.config) if self.video_mode: self.file_name = self.environment_title + "_" + self.agent_name + "_videos" for i in range(config.num_episodes_to_run): pathset = os.path.join(self.file_name) if not (os.path.exists(pathset)): os.mkdir(pathset) # f = tables.open_file(self.file_name, mode = 'w') # f.close() # datainfo = "DDPG_"+ self.environment_title + "_info.txt" # f = open(self.file_name, 'w') # f.close() # f = open(datainfo, 'w') # f.write(str(self.height)) # f.write(str(self.width)) # f.write(str(self.channel)) # f.write(str(config.max_step)) # f.write(str(config.num_episodes_to_run)) # f.close() self.save_max_result_list_list = []
def __init__(self, config): Base_Agent.__init__(self, config) self.hyperparameters = config.hyperparameters self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") model_path = self.config.model_path if self.config.model_path else 'Models' self.critic_local_path = os.path.join( model_path, "{}_critic_local.pt".format(self.agent_name)) self.critic_local_2_path = os.path.join( model_path, "{}_critic_local_2.pt".format(self.agent_name)) self.actor_local_path = os.path.join( model_path, "{}_actor_local.pt".format(self.agent_name)) if self.config.load_model: self.locally_load_policy() Base_Agent.copy_model_over(self.critic_local, self.critic_target) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) self.memory = Replay_Buffer( self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], self.config.seed) self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") if self.config.load_model: self.locally_load_policy() Base_Agent.copy_model_over(self.actor_local, self.actor_target) self.actor_optimizer = optim.Adam( self.actor_local.parameters(), lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) self.exploration_strategy = OU_Noise_Exploration(self.config)
def __init__(self, config, agent_name_=agent_name): Base_Agent.__init__(self, config, agent_name_=agent_name_) assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions" assert self.config.hyperparameters["Actor"]["final_layer_activation"] != "Softmax", "Final actor layer must not be softmax" self.hyperparameters = config.hyperparameters self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic", override_seed=self.config.seed + 1) self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(), lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(), lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") Base_Agent.copy_model_over(self.critic_local, self.critic_target) Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2) self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], self.config.seed) self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size * 2, key_to_use="Actor") self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"] if self.automatic_entropy_tuning: self.target_entropy = -torch.prod(torch.Tensor(self.environment.action_space.shape).to(self.device)).item() # heuristic value from the paper self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha = self.log_alpha.exp() self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) else: self.alpha = self.hyperparameters["entropy_term_weight"] self.add_extra_noise = self.hyperparameters["add_extra_noise"] if self.add_extra_noise: self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"], self.hyperparameters["theta"], self.hyperparameters["sigma"]) self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"] self.wandb_watch(self.actor_local, log_freq=self.config.wandb_model_log_freq)
def reset_game(self):
    """Resets the game information so we are ready to play a new episode"""
    Base_Agent.reset_game(self)
    if self.add_extra_noise:
        self.noise.reset()
def __init__(self, config, agent_name_=agent_name): Base_Agent.__init__(self, config, agent_name_=agent_name_) assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions" assert self.config.hyperparameters["Actor"][ "final_layer_activation"] == "Softmax", "Final actor layer must be softmax" self.hyperparameters = config.hyperparameters self.critic_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic") self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic", override_seed=self.config.seed + 1) self.critic_optimizer = torch.optim.Adam( self.critic_local.parameters(), lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) self.critic_optimizer_2 = torch.optim.Adam( self.critic_local_2.parameters(), lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic") self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic") Base_Agent.copy_model_over(self.critic_local, self.critic_target) Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2) self.memory = Replay_Buffer( self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], self.config.seed, device=self.device) self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") self.actor_optimizer = torch.optim.Adam( self.actor_local.parameters(), lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) self.automatic_entropy_tuning = self.hyperparameters[ "automatically_tune_entropy_hyperparameter"] if self.automatic_entropy_tuning: # we set the max possible entropy as the target entropy self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98 self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha = self.log_alpha.exp() self.alpha_optim = Adam( [self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) else: self.alpha = self.hyperparameters["entropy_term_weight"] assert not self.hyperparameters[ "add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment" self.add_extra_noise = False self.do_evaluation_iterations = self.hyperparameters[ "do_evaluation_iterations"] self.wandb_watch(self.actor_local, log_freq=self.config.wandb_model_log_freq)
def __init__(self, config, agent_name_=agent_name):
    DQN.__init__(self, config, agent_name_=agent_name_)
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)