def __init__(self, config):
        Base_Agent.__init__(self, config)
        assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions"
        assert self.config.hyperparameters["Actor"]["final_layer_activation"] == "Softmax", "Final actor layer must be softmax"
        self.hyperparameters = config.hyperparameters
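        # Two independent critics (SAC's twin-Q setup): each maps a state to one Q-value per discrete action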
        self.critic_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic")
        self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                           key_to_use="Critic", override_seed=self.config.seed + 1)
        self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
                                                 lr=self.hyperparameters["Critic"]["learning_rate"])
        self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
                                                   lr=self.hyperparameters["Critic"]["learning_rate"])
        self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                           key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                            key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                    self.config.seed)

        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                          lr=self.hyperparameters["Actor"]["learning_rate"])
        self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(torch.Tensor(self.environment.action_space.shape).to(self.device)).item() # heuristic value from the paper
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"])
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]
        assert not self.hyperparameters["add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment"
        self.add_extra_noise = False
        self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]
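
# A minimal sketch (not from the source) of the hyperparameter keys this SAC-Discrete
# constructor reads directly; create_NN and the training loop read further keys that are
# not listed here, and all values below are illustrative placeholders only.
example_sac_discrete_hyperparameters = {
    "Actor": {"learning_rate": 3e-4, "final_layer_activation": "Softmax"},
    "Critic": {"learning_rate": 3e-4, "buffer_size": int(1e6)},
    "batch_size": 256,
    "automatically_tune_entropy_hyperparameter": True,
    "entropy_term_weight": 0.2,   # only read when automatic tuning is disabled
    "add_extra_noise": False,     # must be False for the discrete variant
    "do_evaluation_iterations": True,
}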
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
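        # The critic takes the concatenated (state, action) vector and outputs a single Q-value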
        self.critic_local = self.create_NN(input_dim=self.state_size +
                                           self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size +
                                            self.action_size,
                                            output_dim=1,
                                            key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Actor")
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
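        # Ornstein-Uhlenbeck process noise gives temporally correlated exploration for the deterministic policy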
        self.exploration_strategy = OU_Noise_Exploration(self.config)
    def run(self):
        """Starts the worker"""
        for ep_ix in range(self.episodes_to_run):
            with self.optimizer_lock:
                Base_Agent.copy_model_over(self.shared_model, self.local_model)
            epsilon_exploration = self.calculate_new_exploration()
            state = self.reset_game_for_worker()
            done = False
            self.episode_states = []
            self.episode_actions = []
            self.episode_rewards = []
            self.episode_log_action_probabilities = []
            self.critic_outputs = []
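            # Roll out one full episode with the local model, storing everything needed to compute the actor-critic loss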

            while not done:
                action, action_log_prob, critic_outputs = self.pick_action_and_get_critic_values(
                    self.local_model, state, epsilon_exploration)
                next_state, reward, done, _ = self.environment.step(action)
                self.episode_states.append(state)
                self.episode_actions.append(action)
                self.episode_rewards.append(reward)
                self.episode_log_action_probabilities.append(action_log_prob)
                self.critic_outputs.append(critic_outputs)
                state = next_state

            total_loss = self.calculate_total_loss()
            self.put_gradients_in_queue(total_loss)
            self.episode_number += 1
            with self.counter.get_lock():
                self.counter.value += 1
                self.results_queue.put(np.sum(self.episode_rewards))
    def __init__(self,
                 config,
                 global_action_id_to_primitive_actions,
                 action_length_reward_bonus,
                 end_of_episode_symbol="/"):
        super().__init__(config)
        self.end_of_episode_symbol = end_of_episode_symbol
        self.global_action_id_to_primitive_actions = global_action_id_to_primitive_actions
        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"],
                                    config.seed)
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)

        self.oracle = self.create_oracle()
        self.oracle_optimizer = optim.Adam(
            self.oracle.parameters(), lr=self.hyperparameters["learning_rate"])

        self.q_network_local = self.create_NN(input_dim=self.state_size + 1,
                                              output_dim=self.action_size)
        self.q_network_local.print_model_summary()
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])
        self.q_network_target = self.create_NN(input_dim=self.state_size + 1,
                                               output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)

        self.action_length_reward_bonus = action_length_reward_bonus
        self.abandon_ship = config.hyperparameters["abandon_ship"]
    def __init__(self, config, agent_name_=agent_name):
        DDQN.__init__(self, config, agent_name_=agent_name_)
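        # output_dim is action_size + 1: the extra head is typically used as the state-value stream in a
        # dueling-style architecture (the advantage/value combination happens outside this constructor)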
        self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
        self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
        Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)

        self.wandb_watch(self.q_network_local, log_freq=self.config.wandb_model_log_freq)
    def __init__(self, config):
        DQN.__init__(self, config)
        self.q_network_target = self.create_NN(input_dim=self.state_size,
                                               output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)

        if config.resume:
            self.load_resume(config.resume_path)
    def __init__(self, config):
        DDQN.__init__(self, config)
        self.q_network_local = self.create_NN(input_dim=self.state_size,
                                              output_dim=self.action_size + 1)
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])
        self.q_network_target = self.create_NN(input_dim=self.state_size,
                                               output_dim=self.action_size + 1)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)
    def __init__(self, config):
        DDQN.__init__(self, config)

        model_path = self.config.model_path if self.config.model_path else 'Models'
        self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
        self.q_network_local_path = os.path.join(model_path, "{}_q_network_local.pt".format(self.agent_name))

        if self.config.load_model: self.locally_load_policy()
        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
        self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
        Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size +
                                           self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size +
                                            self.action_size,
                                            output_dim=1,
                                            key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Actor")
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
        self.exploration_strategy = OU_Noise_Exploration(self.config)

        if self.video_mode:
            self.file_name = self.environment_title + "_" + self.agent_name + "_videos"
            if not os.path.exists(self.file_name):
                os.mkdir(self.file_name)
        self.save_max_result_list_list = []
    def __init__(self, config):
        DDPG.__init__(self, config)
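        # TD3-style twin critic: a second Q-network seeded differently from the first,
        # so the two Q-estimates are decorrelated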
        self.critic_local_2 = self.create_NN(
            input_dim=self.state_size + self.action_size,
            output_dim=1,
            key_to_use="Critic",
            override_seed=self.config.seed + 1)
        self.critic_target_2 = self.create_NN(input_dim=self.state_size +
                                              self.action_size,
                                              output_dim=1,
                                              key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.critic_optimizer_2 = optim.Adam(
            self.critic_local_2.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"])
        self.exploration_strategy_critic = Gaussian_Exploration(self.config)
    def append_to_final_layers(self, num_new_actions):
        """Appends to the end of a network to allow it to choose from the new actions. It does not change the
        weights for the other actions"""
        print("Appending options to final layer")
        assert num_new_actions > 0
        self.q_network_local.output_layers.append(
            nn.Linear(
                in_features=self.q_network_local.output_layers[0].in_features,
                out_features=num_new_actions))
        self.q_network_target.output_layers.append(
            nn.Linear(
                in_features=self.q_network_local.output_layers[0].in_features,
                out_features=num_new_actions))
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])
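
# A minimal, self-contained sketch (names assumed, not from the source) of the same idea:
# growing an nn.ModuleList action head by appending a fresh nn.Linear, so the existing
# output layer's weights are left untouched while new actions get their own outputs.
import torch
import torch.nn as nn

output_layers = nn.ModuleList([nn.Linear(64, 4)])  # existing head covering 4 actions
num_new_actions = 2
output_layers.append(nn.Linear(output_layers[0].in_features, num_new_actions))
features = torch.randn(1, 64)                      # e.g. activations from the hidden layers
q_values = torch.cat([layer(features) for layer in output_layers], dim=1)  # shape: (1, 6)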
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size +
                                           self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.critic_target = self.create_NN(input_dim=self.state_size +
                                            self.action_size,
                                            output_dim=1,
                                            key_to_use="Critic")
        model_path = self.config.model_path if self.config.model_path else 'Models'
        self.critic_local_path = os.path.join(
            model_path, "{}_critic_local.pt".format(self.agent_name))
        self.critic_local_2_path = os.path.join(
            model_path, "{}_critic_local_2.pt".format(self.agent_name))
        self.actor_local_path = os.path.join(
            model_path, "{}_actor_local.pt".format(self.agent_name))
        if self.config.load_model: self.locally_load_policy()
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        self.actor_target = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Actor")

        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
        self.exploration_strategy = OU_Noise_Exploration(self.config)
    def __init__(self,
                 config,
                 global_action_id_to_primitive_action,
                 end_of_episode_symbol="/"):
        super().__init__(config)
        self.state_size += 1

        self.q_network_local = self.create_NN(input_dim=self.state_size,
                                              output_dim=self.action_size)
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])
        self.q_network_target = self.create_NN(input_dim=self.state_size,
                                               output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)

        self.min_episode_score_seen = float("inf")
        self.end_of_episode_symbol = end_of_episode_symbol
        self.global_action_id_to_primitive_action = global_action_id_to_primitive_action
        self.action_id_to_stepping_stone_action_id = {}
        self.calculate_q_values_as_increments = self.config.hyperparameters[
            "calculate_q_values_as_increments"]
        self.abandon_ship = self.config.hyperparameters["abandon_ship"]
        self.pre_training_learning_iterations_multiplier = self.hyperparameters[
            "pre_training_learning_iterations_multiplier"]
        self.copy_over_hidden_layers = self.hyperparameters[
            "copy_over_hidden_layers"]
        self.action_balanced_replay_buffer = self.hyperparameters[
            "action_balanced_replay_buffer"]
        self.original_primitive_actions = list(range(self.action_size))
        self.memory_shaper = Memory_Shaper(
            self.hyperparameters["buffer_size"],
            self.hyperparameters["batch_size"], config.seed,
            self.update_reward_to_encourage_longer_macro_actions,
            self.action_balanced_replay_buffer)
        self.action_length_reward_bonus = self.hyperparameters[
            "action_length_reward_bonus"]
        self.only_train_new_actions = self.hyperparameters[
            "only_train_new_actions"]
        self.only_train_final_layer = self.hyperparameters[
            "only_train_final_layer"]
    def change_final_layer_q_network(self, copy_over_hidden_layers):
        """Completely changes the final layer of the q network to accommodate the new action space"""
        print("Completely changing final layer")
        assert len(self.q_network_local.output_layers) == 1
        if copy_over_hidden_layers:
            self.q_network_local.output_layers[0] = nn.Linear(
                in_features=self.q_network_local.output_layers[0].in_features,
                out_features=self.action_size)
            self.q_network_target.output_layers[0] = nn.Linear(
                in_features=self.q_network_target.output_layers[0].in_features,
                out_features=self.action_size)
        else:
            self.q_network_local = self.create_NN(input_dim=self.state_size,
                                                  output_dim=self.action_size)
            self.q_network_target = self.create_NN(input_dim=self.state_size,
                                                   output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])
    def __init__(self, config, agent_name_=agent_name):
        Base_Agent.__init__(self, config, agent_name_=agent_name_)
        assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions"
        assert self.config.hyperparameters["Actor"]["final_layer_activation"] != "Softmax", "Final actor layer must not be softmax"
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                           key_to_use="Critic", override_seed=self.config.seed + 1)
        self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
                                                 lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
        self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
                                                   lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
        self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                           key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                            key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                    self.config.seed)
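        # The actor outputs 2 * action_size values: a mean and a log standard deviation per action
        # dimension, parameterising the squashed Gaussian policy used by SAC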
        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size * 2, key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                          lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
        self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(torch.Tensor(self.environment.action_space.shape).to(self.device)).item() # heuristic value from the paper
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]

        self.add_extra_noise = self.hyperparameters["add_extra_noise"]
        if self.add_extra_noise:
            self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"],
                                  self.hyperparameters["theta"], self.hyperparameters["sigma"])

        self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]

        self.wandb_watch(self.actor_local, log_freq=self.config.wandb_model_log_freq)
    def __init__(self, config, agent_name_=agent_name):
        DQN.__init__(self, config, agent_name_=agent_name_)
        self.q_network_target = self.create_NN(input_dim=self.state_size,
                                               output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)
    def __init__(self, config, agent_name_=agent_name):
        Base_Agent.__init__(self, config, agent_name_=agent_name_)
        assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions"
        assert self.config.hyperparameters["Actor"][
            "final_layer_activation"] == "Softmax", "Final actor layer must be softmax"
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Critic")
        self.critic_local_2 = self.create_NN(input_dim=self.state_size,
                                             output_dim=self.action_size,
                                             key_to_use="Critic",
                                             override_seed=self.config.seed +
                                             1)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.critic_optimizer_2 = torch.optim.Adam(
            self.critic_local_2.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.critic_target = self.create_NN(input_dim=self.state_size,
                                            output_dim=self.action_size,
                                            key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size,
                                              output_dim=self.action_size,
                                              key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"],
            self.config.seed,
            device=self.device)

        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
        self.automatic_entropy_tuning = self.hyperparameters[
            "automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            # we set the max possible entropy as the target entropy
            self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam(
                [self.log_alpha],
                lr=self.hyperparameters["Actor"]["learning_rate"],
                eps=1e-4)
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]
        assert not self.hyperparameters[
            "add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at the moment"
        self.add_extra_noise = False
        self.do_evaluation_iterations = self.hyperparameters[
            "do_evaluation_iterations"]

        self.wandb_watch(self.actor_local,
                         log_freq=self.config.wandb_model_log_freq)
    def __init__(self, config):
        DQN.__init__(self, config)
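        # NOTE: the target network is hard-coded to "cuda", which assumes q_network_local
        # (created in DQN.__init__) also lives on the GPU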
        self.q_network_target = Policy(self.state_size,
                                       self.action_size).to("cuda")
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)