def __init__(self, config):
    """Discrete-action SAC agent.

    Builds twin critics (with target copies), a softmax actor, a replay
    buffer, and — when enabled — the automatic entropy-coefficient tuning
    machinery. Requires a discrete action space and a softmax final actor
    activation.
    """
    Base_Agent.__init__(self, config)
    assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions"
    assert self.config.hyperparameters["Actor"]["final_layer_activation"] == "Softmax", "Final actor layer must be softmax"
    self.hyperparameters = config.hyperparameters
    # Twin critics (clipped double-Q); the second gets a shifted seed so its
    # initial weights differ from the first.
    self.critic_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                       key_to_use="Critic")
    self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                         key_to_use="Critic", override_seed=self.config.seed + 1)
    self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
                                             lr=self.hyperparameters["Critic"]["learning_rate"])
    self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
                                               lr=self.hyperparameters["Critic"]["learning_rate"])
    self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                        key_to_use="Critic")
    self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                          key_to_use="Critic")
    Base_Agent.copy_model_over(self.critic_local, self.critic_target)
    Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
    self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                self.hyperparameters["batch_size"], self.config.seed)
    self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                      key_to_use="Actor")
    self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                            lr=self.hyperparameters["Actor"]["learning_rate"])
    self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
    if self.automatic_entropy_tuning:
        # FIX: the original used the continuous-action heuristic
        # -prod(action_space.shape), which is meaningless for a discrete
        # action space. Use 98% of the maximum possible policy entropy
        # instead, matching the other discrete-SAC constructor in this file.
        self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = torch.optim.Adam([self.log_alpha],
                                            lr=self.hyperparameters["Actor"]["learning_rate"])
    else:
        self.alpha = self.hyperparameters["entropy_term_weight"]
    assert not self.hyperparameters["add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment"
    self.add_extra_noise = False
    self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]
def __init__(self, config):
    """DDPG-style agent setup.

    Creates a critic (plus target) over the concatenated state+action,
    an actor (plus target) over the state, a replay buffer, and an
    Ornstein-Uhlenbeck exploration strategy.
    """
    Base_Agent.__init__(self, config)
    self.hyperparameters = config.hyperparameters
    critic_lr = self.hyperparameters["Critic"]["learning_rate"]
    actor_lr = self.hyperparameters["Actor"]["learning_rate"]
    critic_input = self.state_size + self.action_size
    # The critic scores (state, action) pairs, hence the concatenated input.
    self.critic_local = self.create_NN(input_dim=critic_input, output_dim=1, key_to_use="Critic")
    self.critic_target = self.create_NN(input_dim=critic_input, output_dim=1, key_to_use="Critic")
    Base_Agent.copy_model_over(self.critic_local, self.critic_target)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=critic_lr, eps=1e-4)
    self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                self.hyperparameters["batch_size"],
                                self.config.seed)
    self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                      key_to_use="Actor")
    self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                       key_to_use="Actor")
    Base_Agent.copy_model_over(self.actor_local, self.actor_target)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=actor_lr, eps=1e-4)
    self.exploration_strategy = OU_Noise_Exploration(self.config)
def run(self):
    """Starts the worker"""
    for _ in range(self.episodes_to_run):
        # Sync the local model from the shared one before each episode.
        with self.optimizer_lock:
            Base_Agent.copy_model_over(self.shared_model, self.local_model)
        epsilon_exploration = self.calculate_new_exploration()
        state = self.reset_game_for_worker()
        done = False
        # Fresh trajectory buffers for this episode.
        self.episode_states = []
        self.episode_actions = []
        self.episode_rewards = []
        self.episode_log_action_probabilities = []
        self.critic_outputs = []
        while not done:
            action, action_log_prob, critic_outputs = self.pick_action_and_get_critic_values(
                self.local_model, state, epsilon_exploration)
            next_state, reward, done, _ = self.environment.step(action)
            self.episode_states.append(state)
            self.episode_actions.append(action)
            self.episode_rewards.append(reward)
            self.episode_log_action_probabilities.append(action_log_prob)
            self.critic_outputs.append(critic_outputs)
            state = next_state
        total_loss = self.calculate_total_loss()
        self.put_gradients_in_queue(total_loss)
        self.episode_number += 1
        # Publish progress under the shared counter's lock.
        with self.counter.get_lock():
            self.counter.value += 1
            self.results_queue.put(np.sum(self.episode_rewards))
def __init__(self, config, global_action_id_to_primitive_actions, action_length_reward_bonus,
             end_of_episode_symbol="/"):
    """Macro-action agent setup.

    Stores the mapping from global action ids to primitive-action sequences,
    builds a replay buffer, an epsilon-greedy exploration strategy, an oracle
    network, and a pair of Q-networks whose input is the state plus one extra
    feature.
    """
    super().__init__(config)
    self.end_of_episode_symbol = end_of_episode_symbol
    self.global_action_id_to_primitive_actions = global_action_id_to_primitive_actions
    self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                self.hyperparameters["batch_size"], config.seed)
    self.exploration_strategy = Epsilon_Greedy_Exploration(config)
    self.oracle = self.create_oracle()
    self.oracle_optimizer = optim.Adam(self.oracle.parameters(),
                                       lr=self.hyperparameters["learning_rate"])
    # Q-networks take the state plus one extra input feature (hence the +1).
    self.q_network_local = self.create_NN(input_dim=self.state_size + 1, output_dim=self.action_size)
    self.q_network_local.print_model_summary()
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"])
    self.q_network_target = self.create_NN(input_dim=self.state_size + 1, output_dim=self.action_size)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    self.action_length_reward_bonus = action_length_reward_bonus
    self.abandon_ship = config.hyperparameters["abandon_ship"]
def __init__(self, config, agent_name_=agent_name):
    """DDQN variant whose Q-networks emit action_size + 1 outputs
    (presumably per-action values plus a state-value head — confirm against
    the network's forward pass). Also registers the local network with wandb.
    """
    DDQN.__init__(self, config, agent_name_=agent_name_)
    num_outputs = self.action_size + 1
    self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=num_outputs)
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"], eps=1e-4)
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=num_outputs)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    self.wandb_watch(self.q_network_local, log_freq=self.config.wandb_model_log_freq)
def __init__(self, config):
    """DDQN setup: builds on DQN (whose __init__ is expected to create
    q_network_local) by adding a target Q-network initialised as an exact
    copy of the local one, and optionally resuming from a checkpoint."""
    DQN.__init__(self, config)
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    # NOTE(review): reads `config.resume` directly rather than self.config —
    # presumably equivalent after Base_Agent stores the config; confirm.
    if config.resume:
        self.load_resume(config.resume_path)
def __init__(self, config):
    """DDQN subclass that rebuilds both Q-networks with one extra output
    (action_size + 1) and re-creates the optimiser for the new local network."""
    DDQN.__init__(self, config)
    num_outputs = self.action_size + 1
    self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=num_outputs)
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"])
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=num_outputs)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
def __init__(self, config):
    """Checkpoint-aware DDQN variant: rebuilds the Q-networks with one extra
    output, records where the local network is saved, and optionally loads
    previously saved weights before creating the optimiser and target copy."""
    DDQN.__init__(self, config)
    model_path = self.config.model_path if self.config.model_path else 'Models'
    num_outputs = self.action_size + 1
    self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=num_outputs)
    self.q_network_local_path = os.path.join(
        model_path, "{}_q_network_local.pt".format(self.agent_name))
    if self.config.load_model:
        self.locally_load_policy()
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"], eps=1e-4)
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=num_outputs)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
def __init__(self, config):
    """DDPG agent setup with optional video-recording directory creation.

    Creates a critic (plus target) over the concatenated state+action, an
    actor (plus target), a replay buffer and OU-noise exploration. When
    video_mode is on, ensures the video output directory exists.
    """
    Base_Agent.__init__(self, config)
    self.hyperparameters = config.hyperparameters
    self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size,
                                       output_dim=1, key_to_use="Critic")
    self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size,
                                        output_dim=1, key_to_use="Critic")
    Base_Agent.copy_model_over(self.critic_local, self.critic_target)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
    self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                self.hyperparameters["batch_size"], self.config.seed)
    self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                      key_to_use="Actor")
    self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                       key_to_use="Actor")
    Base_Agent.copy_model_over(self.actor_local, self.actor_target)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
    self.exploration_strategy = OU_Noise_Exploration(self.config)
    if self.video_mode:
        self.file_name = self.environment_title + "_" + self.agent_name + "_videos"
        # FIX: the original looped num_episodes_to_run times re-checking and
        # creating the same directory; a single idempotent makedirs has the
        # identical end state. (Dead commented-out file-writing code removed.)
        os.makedirs(self.file_name, exist_ok=True)
    self.save_max_result_list_list = []
def __init__(self, config):
    """Twin-critic extension of DDPG: adds a second critic (seeded
    differently) with its own target network and optimiser, plus a Gaussian
    exploration strategy applied on the critic side."""
    DDPG.__init__(self, config)
    critic_input = self.state_size + self.action_size
    # Second critic gets a shifted seed so it initialises differently.
    self.critic_local_2 = self.create_NN(input_dim=critic_input, output_dim=1,
                                         key_to_use="Critic", override_seed=self.config.seed + 1)
    self.critic_target_2 = self.create_NN(input_dim=critic_input, output_dim=1, key_to_use="Critic")
    Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
    self.critic_optimizer_2 = optim.Adam(self.critic_local_2.parameters(),
                                         lr=self.hyperparameters["Critic"]["learning_rate"])
    self.exploration_strategy_critic = Gaussian_Exploration(self.config)
def append_to_final_layers(self, num_new_actions):
    """Appends to the end of a network to allow it to choose from the new actions.
    It does not change the weights for the other actions.

    Adds a fresh linear head of num_new_actions outputs to both the local and
    target Q-networks, syncs the target from the local network, and recreates
    the optimiser so the new parameters are trained.
    """
    print("Appending options to final layer")
    assert num_new_actions > 0
    self.q_network_local.output_layers.append(
        nn.Linear(in_features=self.q_network_local.output_layers[0].in_features,
                  out_features=num_new_actions))
    # FIX: size the target network's new head from the *target* network's own
    # in_features (the original read them from the local network, which is
    # only correct while the two architectures happen to be identical).
    self.q_network_target.output_layers.append(
        nn.Linear(in_features=self.q_network_target.output_layers[0].in_features,
                  out_features=num_new_actions))
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    # Recreate the optimiser so the freshly appended parameters are included.
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"])
def __init__(self, config):
    """DDPG agent setup with checkpointing support.

    Builds actor/critic networks with target copies, records the file paths
    used to save/load the local networks, optionally restores saved weights,
    and creates the replay buffer, optimisers and OU-noise exploration.
    """
    Base_Agent.__init__(self, config)
    self.hyperparameters = config.hyperparameters
    self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size,
                                       output_dim=1, key_to_use="Critic")
    self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                      key_to_use="Actor")
    self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size,
                                        output_dim=1, key_to_use="Critic")
    model_path = self.config.model_path if self.config.model_path else 'Models'
    self.critic_local_path = os.path.join(model_path, "{}_critic_local.pt".format(self.agent_name))
    self.critic_local_2_path = os.path.join(model_path, "{}_critic_local_2.pt".format(self.agent_name))
    self.actor_local_path = os.path.join(model_path, "{}_actor_local.pt".format(self.agent_name))
    # FIX: the original called locally_load_policy() twice under the same
    # condition; a single call before the target copies are made is enough —
    # both copy_model_over calls below then see the restored weights.
    if self.config.load_model:
        self.locally_load_policy()
    Base_Agent.copy_model_over(self.critic_local, self.critic_target)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
    self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                self.hyperparameters["batch_size"], self.config.seed)
    self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                       key_to_use="Actor")
    Base_Agent.copy_model_over(self.actor_local, self.actor_target)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
    self.exploration_strategy = OU_Noise_Exploration(self.config)
def __init__(self, config, global_action_id_to_primitive_action, end_of_episode_symbol="/"):
    """Macro-action (stepping-stone) agent setup.

    Widens the state by one feature, builds the Q-networks and their
    optimiser, then reads the hyperparameters that control how macro actions
    are created, rewarded, replayed and trained.
    """
    super().__init__(config)
    self.state_size += 1  # one extra input feature is appended to the state
    self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"])
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    self.min_episode_score_seen = float("inf")
    self.end_of_episode_symbol = end_of_episode_symbol
    self.global_action_id_to_primitive_action = global_action_id_to_primitive_action
    self.action_id_to_stepping_stone_action_id = {}
    self.calculate_q_values_as_increments = self.config.hyperparameters["calculate_q_values_as_increments"]
    self.abandon_ship = self.config.hyperparameters["abandon_ship"]
    hp = self.hyperparameters
    self.pre_training_learning_iterations_multiplier = hp["pre_training_learning_iterations_multiplier"]
    self.copy_over_hidden_layers = hp["copy_over_hidden_layers"]
    self.action_balanced_replay_buffer = hp["action_balanced_replay_buffer"]
    self.original_primitive_actions = list(range(self.action_size))
    self.memory_shaper = Memory_Shaper(hp["buffer_size"], hp["batch_size"], config.seed,
                                       self.update_reward_to_encourage_longer_macro_actions,
                                       self.action_balanced_replay_buffer)
    self.action_length_reward_bonus = hp["action_length_reward_bonus"]
    self.only_train_new_actions = hp["only_train_new_actions"]
    self.only_train_final_layer = hp["only_train_final_layer"]
def change_final_layer_q_network(self, copy_over_hidden_layers):
    """Replaces the final layer of the Q-networks to accommodate the new
    action space: either keeps the hidden layers and swaps only the output
    heads, or rebuilds both networks from scratch. Either way the target is
    then re-synced from the local network and the optimiser recreated."""
    print("Completely changing final layer")
    assert len(self.q_network_local.output_layers) == 1
    if not copy_over_hidden_layers:
        # Rebuild both networks entirely.
        self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
        self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    else:
        # Keep the hidden layers; replace only the output heads, each sized
        # from its own network's in_features.
        local_in = self.q_network_local.output_layers[0].in_features
        target_in = self.q_network_target.output_layers[0].in_features
        self.q_network_local.output_layers[0] = nn.Linear(in_features=local_in,
                                                          out_features=self.action_size)
        self.q_network_target.output_layers[0] = nn.Linear(in_features=target_in,
                                                           out_features=self.action_size)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                          lr=self.hyperparameters["learning_rate"])
def __init__(self, config, agent_name_=agent_name):
    """Continuous-action SAC agent setup.

    Builds twin critics with target copies, an actor with action_size * 2
    outputs (presumably mean and log-std of a Gaussian policy — confirm
    against the actor's forward pass), a replay buffer, optional automatic
    entropy tuning, and optional extra OU noise. Requires a continuous
    action space.
    """
    Base_Agent.__init__(self, config, agent_name_=agent_name_)
    assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions"
    assert self.config.hyperparameters["Actor"]["final_layer_activation"] != "Softmax", "Final actor layer must not be softmax"
    self.hyperparameters = config.hyperparameters
    critic_lr = self.hyperparameters["Critic"]["learning_rate"]
    actor_lr = self.hyperparameters["Actor"]["learning_rate"]
    critic_input = self.state_size + self.action_size
    self.critic_local = self.create_NN(input_dim=critic_input, output_dim=1, key_to_use="Critic")
    self.critic_local_2 = self.create_NN(input_dim=critic_input, output_dim=1, key_to_use="Critic",
                                         override_seed=self.config.seed + 1)
    self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(), lr=critic_lr, eps=1e-4)
    self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(), lr=critic_lr, eps=1e-4)
    self.critic_target = self.create_NN(input_dim=critic_input, output_dim=1, key_to_use="Critic")
    self.critic_target_2 = self.create_NN(input_dim=critic_input, output_dim=1, key_to_use="Critic")
    Base_Agent.copy_model_over(self.critic_local, self.critic_target)
    Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
    self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                self.hyperparameters["batch_size"], self.config.seed)
    self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size * 2,
                                      key_to_use="Actor")
    self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), lr=actor_lr, eps=1e-4)
    self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
    if self.automatic_entropy_tuning:
        # Heuristic target entropy from the SAC paper: -dim(action space).
        self.target_entropy = -torch.prod(
            torch.Tensor(self.environment.action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = Adam([self.log_alpha], lr=actor_lr, eps=1e-4)
    else:
        self.alpha = self.hyperparameters["entropy_term_weight"]
    self.add_extra_noise = self.hyperparameters["add_extra_noise"]
    if self.add_extra_noise:
        self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"],
                              self.hyperparameters["theta"], self.hyperparameters["sigma"])
    self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]
    self.wandb_watch(self.actor_local, log_freq=self.config.wandb_model_log_freq)
def __init__(self, config, agent_name_=agent_name):
    """DDQN setup: builds on DQN (whose __init__ is expected to create
    q_network_local) by adding a target Q-network initialised as an exact
    copy of the local network."""
    DQN.__init__(self, config, agent_name_=agent_name_)
    self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
def __init__(self, config, agent_name_=agent_name):
    """Discrete-action SAC agent setup (with wandb model watching).

    Builds twin critics with target copies, a softmax actor, a replay buffer
    placed on the agent's device, and optional automatic entropy tuning.
    Requires a discrete action space and a softmax final actor activation.
    """
    Base_Agent.__init__(self, config, agent_name_=agent_name_)
    assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions"
    assert self.config.hyperparameters["Actor"][
        "final_layer_activation"] == "Softmax", "Final actor layer must be softmax"
    self.hyperparameters = config.hyperparameters
    critic_lr = self.hyperparameters["Critic"]["learning_rate"]
    actor_lr = self.hyperparameters["Actor"]["learning_rate"]
    self.critic_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                       key_to_use="Critic")
    self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                         key_to_use="Critic", override_seed=self.config.seed + 1)
    self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(), lr=critic_lr, eps=1e-4)
    self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(), lr=critic_lr, eps=1e-4)
    self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                        key_to_use="Critic")
    self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                          key_to_use="Critic")
    Base_Agent.copy_model_over(self.critic_local, self.critic_target)
    Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
    self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"],
                                self.hyperparameters["batch_size"], self.config.seed,
                                device=self.device)
    self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                      key_to_use="Actor")
    self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), lr=actor_lr, eps=1e-4)
    self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
    if self.automatic_entropy_tuning:
        # Target entropy: 98% of the maximum possible entropy of a uniform
        # policy over action_size discrete actions.
        self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = Adam([self.log_alpha], lr=actor_lr, eps=1e-4)
    else:
        self.alpha = self.hyperparameters["entropy_term_weight"]
    assert not self.hyperparameters[
        "add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment"
    self.add_extra_noise = False
    self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]
    self.wandb_watch(self.actor_local, log_freq=self.config.wandb_model_log_freq)
def __init__(self, config):
    """DDQN variant whose target network is a `Policy` model rather than one
    built via create_NN; it is initialised as a copy of q_network_local
    (expected to have been created by DQN.__init__)."""
    DQN.__init__(self, config)
    # FIX: move the target network to the agent's configured device instead
    # of hard-coding "cuda", which crashes on CPU-only machines and is
    # inconsistent with self.device used elsewhere in this file.
    self.q_network_target = Policy(self.state_size, self.action_size).to(self.device)
    Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)