Example #1
    def __init__(self,
                 config,
                 global_action_id_to_primitive_actions,
                 action_length_reward_bonus,
                 end_of_episode_symbol="/"):
        super().__init__(config)
        self.end_of_episode_symbol = end_of_episode_symbol
        self.global_action_id_to_primitive_actions = global_action_id_to_primitive_actions
        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"],
                                    config.seed)
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)

        self.oracle = self.create_oracle()
        self.oracle_optimizer = optim.Adam(
            self.oracle.parameters(), lr=self.hyperparameters["learning_rate"])

        self.q_network_local = self.create_NN(input_dim=self.state_size + 1,
                                              output_dim=self.action_size)
        self.q_network_local.print_model_summary()
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])
        self.q_network_target = self.create_NN(input_dim=self.state_size + 1,
                                               output_dim=self.action_size)
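        # initialise the target network as an exact copy of the local network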
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)

        self.action_length_reward_bonus = action_length_reward_bonus
        self.abandon_ship = config.hyperparameters["abandon_ship"]
Example #2
 def __init__(self, config):
     Base_Agent.__init__(self, config)
     self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed)
     self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
     self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                           lr=self.hyperparameters["learning_rate"])
     self.exploration_strategy = Epsilon_Greedy_Exploration(config)
Example #3
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        config.no_render_mode = False  # must be in render mode

        self.q_network_local = q_network_2_EYE(n_action=self.get_action_size())
        self.q_network_target = q_network_2_EYE(
            n_action=self.get_action_size())
        self.q_network_optimizer = optim.SGD(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"],
            weight_decay=5e-4)

        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"],
                                    config.seed)
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)

        if config.backbone_pretrain:
            self.load_pretrain()

        self.copy_model_over(from_model=self.q_network_local,
                             to_model=self.q_network_target)

        self.q_network_local.to(self.q_network_local.device)
        self.q_network_target.to(self.q_network_target.device)
Example #4
    def run(self):
        """Starts the worker"""
        for ep_ix in range(self.episodes_to_run):
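            # pull the latest shared-model weights into this worker's local copy before the episode starts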
            with self.optimizer_lock:
                Base_Agent.copy_model_over(self.shared_model, self.local_model)
            epsilon_exploration = self.calculate_new_exploration()
            state = self.reset_game_for_worker()
            done = False
            self.episode_states = []
            self.episode_actions = []
            self.episode_rewards = []
            self.episode_log_action_probabilities = []
            self.critic_outputs = []

            while not done:
                action, action_log_prob, critic_outputs = self.pick_action_and_get_critic_values(
                    self.local_model, state, epsilon_exploration)
                next_state, reward, done, _ = self.environment.step(action)
                self.episode_states.append(state)
                self.episode_actions.append(action)
                self.episode_rewards.append(reward)
                self.episode_log_action_probabilities.append(action_log_prob)
                self.critic_outputs.append(critic_outputs)
                state = next_state

            total_loss = self.calculate_total_loss()
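            # push the gradients from this episode's loss onto the shared gradient queue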
            self.put_gradients_in_queue(total_loss)
            self.episode_number += 1
            with self.counter.get_lock():
                self.counter.value += 1
                self.results_queue.put(np.sum(self.episode_rewards))
Example #5
 def __init__(self, config):
     Base_Agent.__init__(self, config)
     self.policy_output_size = self.calculate_policy_output_size()
     self.policy_new = self.create_NN(input_dim=self.state_size,
                                      output_dim=self.policy_output_size)
     model_path = self.config.model_path if self.config.model_path else 'Models'
     self.policy_new_path = os.path.join(
         model_path, "{}_policy_new.pt".format(self.agent_name))
     if self.config.load_model: self.locally_load_policy()
     self.policy_old = self.create_NN(input_dim=self.state_size,
                                      output_dim=self.policy_output_size)
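     # start the old policy as an exact copy of the new policy's weights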
     self.policy_old.load_state_dict(
         copy.deepcopy(self.policy_new.state_dict()))
     self.policy_new_optimizer = optim.Adam(
         self.policy_new.parameters(),
         lr=self.hyperparameters["learning_rate"],
         eps=1e-4)
     self.episode_number = 0
     self.many_episode_states = []
     self.many_episode_actions = []
     self.many_episode_rewards = []
     self.experience_generator = Parallel_Experience_Generator(
         self.environment, self.policy_new, self.config.seed,
         self.hyperparameters, self.action_size)
     self.exploration_strategy = Epsilon_Greedy_Exploration(self.config)
Example #6
    def __init__(self, config, agent_name_=agent_name):
        DDQN.__init__(self, config, agent_name_=agent_name_)
        self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
        self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
        Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)

        self.wandb_watch(self.q_network_local, log_freq=self.config.wandb_model_log_freq)
Example #7
 def __init__(self, config):
     Base_Agent.__init__(self, config)
     self.policy = self.create_NN(
         input_dim=self.state_size, output_dim=self.action_size)
     self.optimizer = optim.Adam(
         self.policy.parameters(), lr=self.hyperparameters["learning_rate"])
     self.episode_rewards = []
     self.episode_log_probabilities = []
Example #8
    def __init__(self, config):
        DQN.__init__(self, config)
        self.q_network_target = self.create_NN(input_dim=self.state_size,
                                               output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)

        if config.resume:
            self.load_resume(config.resume_path)
Example #9
    def __init__(self, config):
        DDQN.__init__(self, config)

        model_path = self.config.model_path if self.config.model_path else 'Models'
        self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
        self.q_network_local_path = os.path.join(model_path, "{}_q_network_local.pt".format(self.agent_name))

        if self.config.load_model: self.locally_load_policy()
        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
        self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
        Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
Example #10
 def __init__(self, config):
     DDQN.__init__(self, config)
     self.q_network_local = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size + 1)
     self.q_network_optimizer = optim.Adam(
         self.q_network_local.parameters(),
         lr=self.hyperparameters["learning_rate"])
     self.q_network_target = self.create_NN(input_dim=self.state_size,
                                            output_dim=self.action_size + 1)
     Base_Agent.copy_model_over(from_model=self.q_network_local,
                                to_model=self.q_network_target)
Example #11
 def __init__(self, config):
     Base_Agent.__init__(self, config)
     self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                 self.hyperparameters["batch_size"],
                                 config.seed)
     self.q_network_local = Policy(self.state_size,
                                   self.action_size).to("cuda")
     self.q_network_optimizer = optim.Adam(
         self.q_network_local.parameters(),
         lr=self.hyperparameters["learning_rate"],
         eps=1e-4)
     self.exploration_strategy = Epsilon_Greedy_Exploration(config)
Example #12
 def __init__(self, config):
     Base_Agent.__init__(self, config)
     self.policy_output_size = self.calculate_policy_output_size()
     self.policy_new = self.create_NN(input_dim=self.state_size, output_dim=self.policy_output_size)
     self.policy_old = self.create_NN(input_dim=self.state_size, output_dim=self.policy_output_size)
     self.policy_old.load_state_dict(copy.deepcopy(self.policy_new.state_dict()))
     self.policy_new_optimizer = optim.Adam(self.policy_new.parameters(), lr=self.hyperparameters["learning_rate"])
     self.episode_number = 0
     self.many_episode_states = []
     self.many_episode_actions = []
     self.many_episode_rewards = []
     self.experience_generator = Parallel_Experience_Generator(self.environment, self.policy_new, self.config.seed,
                                                               self.hyperparameters, self.action_size)
     self.exploration_strategy = Epsilon_Greedy_Exploration(self.config)
Example #13
 def __init__(self, config):
     DDPG.__init__(self, config)
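      # add a second (twin) critic on top of the networks created by the DDPG parent constructor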
     self.critic_local_2 = self.create_NN(
         input_dim=self.state_size + self.action_size,
         output_dim=1,
         key_to_use="Critic",
         override_seed=self.config.seed + 1)
     self.critic_target_2 = self.create_NN(input_dim=self.state_size +
                                           self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
     Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
     self.critic_optimizer_2 = optim.Adam(
         self.critic_local_2.parameters(),
         lr=self.hyperparameters["Critic"]["learning_rate"])
     self.exploration_strategy_critic = Gaussian_Exploration(self.config)
Example #14
    def __init__(self, config, agent_name_=agent_name):
        Base_Agent.__init__(self, config, agent_name=agent_name_)
        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"],
                                    config.seed, self.device)
        self.q_network_local = self.create_NN(
            input_dim=self.state_size,
            output_dim=self.action_size)  # TODO: Change NN
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"],
            eps=1e-4)
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)

        self.wandb_watch(self.q_network_local,
                         log_freq=self.config.wandb_model_log_freq)
Example #15
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.controller_config = copy.deepcopy(config)
        self.controller_config.hyperparameters = self.controller_config.hyperparameters[
            "CONTROLLER"]
        self.controller = DDQN(self.controller_config)
        self.controller.q_network_local = self.create_NN(
            input_dim=self.state_size * 2,
            output_dim=self.action_size,
            key_to_use="CONTROLLER")
        self.controller.q_network_target = self.create_NN(
            input_dim=self.state_size * 2,
            output_dim=self.action_size,
            key_to_use="CONTROLLER")

        self.meta_controller_config = copy.deepcopy(config)
        self.meta_controller_config.hyperparameters = self.meta_controller_config.hyperparameters[
            "META_CONTROLLER"]

        # self.meta_controller = DDQN(self.meta_controller_config)
        # self.meta_controller.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=config.environment.observation_space.n,
        #                                                       key_to_use="META_CONTROLLER")
        # self.meta_controller.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=config.environment.observation_space.n,
        #                                                       key_to_use="META_CONTROLLER")

        self.list_meta_controller = [
            DDQN(self.meta_controller_config) for _ in range(5)
        ]
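        # give each meta-controller in the ensemble its own local and target network over the goal (observation) space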
        self.lq_network_local = []
        self.lq_network_target = []
        for m in self.list_meta_controller:
            m.q_network_local = self.create_NN(
                input_dim=self.state_size,
                output_dim=config.environment.observation_space.n,
                key_to_use="META_CONTROLLER")
            self.lq_network_local.append(m.q_network_local)
            m.q_network_target = self.create_NN(
                input_dim=self.state_size,
                output_dim=config.environment.observation_space.n,
                key_to_use="META_CONTROLLER")
            self.lq_network_target.append(m.q_network_target)

        self.rolling_intrinsic_rewards = []
        self.goals_seen = []
        self.controller_learnt_enough = False
        self.controller_actions = []
Example #16
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        model_path = self.config.model_path if self.config.model_path else 'Models'
        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"],
                                    config.seed)
        self.q_network_local = self.create_NN(input_dim=self.state_size,
                                              output_dim=self.action_size)
        self.q_network_local_path = os.path.join(
            model_path, "{}_q_network_local.pt".format(self.agent_name))

        if self.config.load_model: self.locally_load_policy()
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"],
            eps=1e-4)
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)
Example #17
 def append_to_final_layers(self, num_new_actions):
     """Appends to the end of a network to allow it to choose from the new actions. It does not change the weights
     for the other actions"""
     print("Appending options to final layer")
     assert num_new_actions > 0
     self.q_network_local.output_layers.append(
         nn.Linear(
             in_features=self.q_network_local.output_layers[0].in_features,
             out_features=num_new_actions))
     self.q_network_target.output_layers.append(
         nn.Linear(
             in_features=self.q_network_local.output_layers[0].in_features,
             out_features=num_new_actions))
     Base_Agent.copy_model_over(from_model=self.q_network_local,
                                to_model=self.q_network_target)
     self.q_network_optimizer = optim.Adam(
         self.q_network_local.parameters(),
         lr=self.hyperparameters["learning_rate"])
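The append_to_final_layers method above grows the action space by adding a fresh output head while leaving the weights already learned for the existing actions untouched; only the optimizer is rebuilt so that the new parameters get registered with it. Below is a minimal standalone sketch of the same idea, assuming a toy network whose head is an nn.ModuleList (TinyQNet and n_new are illustrative names, not taken from the agent code):

import torch
import torch.nn as nn

class TinyQNet(nn.Module):
    """Toy Q-network with an appendable output head."""
    def __init__(self, state_dim, n_actions):
        super().__init__()
        self.body = nn.Linear(state_dim, 32)
        self.output_layers = nn.ModuleList([nn.Linear(32, n_actions)])

    def forward(self, x):
        h = torch.relu(self.body(x))
        # concatenate every head so old and new actions share one Q-value vector
        return torch.cat([layer(h) for layer in self.output_layers], dim=-1)

net = TinyQNet(state_dim=4, n_actions=3)
old_head_weights = net.output_layers[0].weight.clone()
n_new = 2  # number of newly added macro-actions
net.output_layers.append(nn.Linear(net.output_layers[0].in_features, n_new))
assert torch.equal(old_head_weights, net.output_layers[0].weight)  # existing weights untouched
print(net(torch.zeros(1, 4)).shape)  # torch.Size([1, 5])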
Example #18
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        initial_state = self.environment.reset()
        assert isinstance(initial_state, (int, np.int64)) or initial_state.dtype == np.int64, \
            "only works for discrete states currently"
        self.num_skills = self.hyperparameters["SKILL_AGENT"]["num_skills"]
        self.episodes_for_pretraining = self.hyperparameters[
            "SKILL_AGENT"]["episodes_for_pretraining"]
        self.timesteps_before_changing_skill = self.hyperparameters[
            "MANAGER"]["timesteps_before_changing_skill"]

        self.skill_agent_config = copy.deepcopy(config)
        self.skill_agent_config.hyperparameters = self.skill_agent_config.hyperparameters[
            "SKILL_AGENT"]
        self.skill_agent_config.num_episodes_to_run = self.episodes_for_pretraining

        self.manager_config = copy.deepcopy(config)
        self.manager_config.hyperparameters = self.manager_config.hyperparameters["MANAGER"]
        self.manager_config.num_episodes_to_run = self.config.num_episodes_to_run - \
            self.skill_agent_config.num_episodes_to_run
Example #19
    def __init__(self,
                 config,
                 global_action_id_to_primitive_action,
                 end_of_episode_symbol="/"):
        super().__init__(config)
        self.state_size += 1

        self.q_network_local = self.create_NN(input_dim=self.state_size,
                                              output_dim=self.action_size)
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])
        self.q_network_target = self.create_NN(input_dim=self.state_size,
                                               output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)

        self.min_episode_score_seen = float("inf")
        self.end_of_episode_symbol = end_of_episode_symbol
        self.global_action_id_to_primitive_action = global_action_id_to_primitive_action
        self.action_id_to_stepping_stone_action_id = {}
        self.calculate_q_values_as_increments = self.config.hyperparameters[
            "calculate_q_values_as_increments"]
        self.abandon_ship = self.config.hyperparameters["abandon_ship"]
        self.pre_training_learning_iterations_multiplier = self.hyperparameters[
            "pre_training_learning_iterations_multiplier"]
        self.copy_over_hidden_layers = self.hyperparameters[
            "copy_over_hidden_layers"]
        self.action_balanced_replay_buffer = self.hyperparameters[
            "action_balanced_replay_buffer"]
        self.original_primitive_actions = list(range(self.action_size))
        self.memory_shaper = Memory_Shaper(
            self.hyperparameters["buffer_size"],
            self.hyperparameters["batch_size"], config.seed,
            self.update_reward_to_encourage_longer_macro_actions,
            self.action_balanced_replay_buffer)
        self.action_length_reward_bonus = self.hyperparameters[
            "action_length_reward_bonus"]
        self.only_train_new_actions = self.hyperparameters[
            "only_train_new_actions"]
        self.only_train_final_layer = self.hyperparameters[
            "only_train_final_layer"]
Example #20
 def change_final_layer_q_network(self, copy_over_hidden_layers):
     """Completely changes the final layer of the q network to accomodate the new action space"""
     print("Completely changing final layer")
     assert len(self.q_network_local.output_layers) == 1
     if copy_over_hidden_layers:
         self.q_network_local.output_layers[0] = nn.Linear(
             in_features=self.q_network_local.output_layers[0].in_features,
             out_features=self.action_size)
         self.q_network_target.output_layers[0] = nn.Linear(
             in_features=self.q_network_target.output_layers[0].in_features,
             out_features=self.action_size)
     else:
         self.q_network_local = self.create_NN(input_dim=self.state_size,
                                               output_dim=self.action_size)
         self.q_network_target = self.create_NN(input_dim=self.state_size,
                                                output_dim=self.action_size)
     Base_Agent.copy_model_over(from_model=self.q_network_local,
                                to_model=self.q_network_target)
     self.q_network_optimizer = optim.Adam(
         self.q_network_local.parameters(),
         lr=self.hyperparameters["learning_rate"])
Example #21
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions"
        assert self.config.hyperparameters["Actor"]["final_layer_activation"] == "Softmax", "Final actor layer must be softmax"
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic")
        self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                           key_to_use="Critic", override_seed=self.config.seed + 1)
        self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
                                                 lr=self.hyperparameters["Critic"]["learning_rate"])
        self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
                                                   lr=self.hyperparameters["Critic"]["learning_rate"])
        self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                           key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                            key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                    self.config.seed)

        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                          lr=self.hyperparameters["Actor"]["learning_rate"])
        self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            # target the near-maximum entropy of a discrete policy: log(action_size), scaled by 0.98
            self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"])
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]
        assert not self.hyperparameters["add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at the moment"
        self.add_extra_noise = False
        self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]
Example #22
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size +
                                           self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size +
                                            self.action_size,
                                            output_dim=1,
                                            key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Actor")
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
        self.exploration_strategy = OU_Noise_Exploration(self.config)
Example #23
 def __init__(self, config):
     Base_Agent.__init__(self, config)
     self.controller_config = copy.deepcopy(config)
     self.controller_config.hyperparameters = self.controller_config.hyperparameters[
         "CONTROLLER"]
     self.controller = DDQN(self.controller_config)
     self.controller.q_network_local = self.create_NN(
         input_dim=self.state_size * 2,
         output_dim=self.action_size,
         key_to_use="CONTROLLER")
     self.meta_controller_config = copy.deepcopy(config)
     self.meta_controller_config.hyperparameters = self.meta_controller_config.hyperparameters[
         "META_CONTROLLER"]
     self.meta_controller = DDQN(self.meta_controller_config)
     self.meta_controller.q_network_local = self.create_NN(
         input_dim=self.state_size,
         output_dim=config.environment.observation_space.n,
         key_to_use="META_CONTROLLER")
     self.rolling_intrinsic_rewards = []
     self.goals_seen = []
     self.controller_learnt_enough = False
     self.controller_actions = []
Example #24
    def __init__(self, config, agent_name_=agent_name):
        Base_Agent.__init__(self, config, agent_name=agent_name_)

        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"],
                                    config.seed, self.device)

        # If model is not provided, create one. TODO Add this mechanism to all agents.
        if not "model" in self.hyperparameters or self.hyperparameters[
                "model"] is None:
            self.q_network_local = self.create_NN(input_dim=self.state_size,
                                                  output_dim=self.action_size)
        else:
            self.q_network_local = self.hyperparameters["model"]

        self.wandb_watch(self.q_network_local,
                         log_freq=self.config.wandb_model_log_freq)

        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"],
            eps=1e-4)
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)
Example #25
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size +
                                           self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size +
                                            self.action_size,
                                            output_dim=1,
                                            key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Actor")
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
        self.exploration_strategy = OU_Noise_Exploration(self.config)

        if self.video_mode:
            self.file_name = self.environment_title + "_" + self.agent_name + "_videos"
            # create the video output directory once if it does not already exist
            if not os.path.exists(self.file_name):
                os.mkdir(self.file_name)
            # f = tables.open_file(self.file_name, mode = 'w')
            # f.close()
            # datainfo = "DDPG_"+ self.environment_title + "_info.txt"
            # f = open(self.file_name, 'w')
            # f.close()
            # f = open(datainfo, 'w')
            # f.write(str(self.height))
            # f.write(str(self.width))
            # f.write(str(self.channel))
            # f.write(str(config.max_step))
            # f.write(str(config.num_episodes_to_run))
            # f.close()
        self.save_max_result_list_list = []
Example #26
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size +
                                           self.action_size,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.critic_target = self.create_NN(input_dim=self.state_size +
                                            self.action_size,
                                            output_dim=1,
                                            key_to_use="Critic")
        model_path = self.config.model_path if self.config.model_path else 'Models'
        self.critic_local_path = os.path.join(
            model_path, "{}_critic_local.pt".format(self.agent_name))
        self.critic_local_2_path = os.path.join(
            model_path, "{}_critic_local_2.pt".format(self.agent_name))
        self.actor_local_path = os.path.join(
            model_path, "{}_actor_local.pt".format(self.agent_name))
        if self.config.load_model: self.locally_load_policy()
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        self.actor_target = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Actor")

        if self.config.load_model: self.locally_load_policy()
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
        self.exploration_strategy = OU_Noise_Exploration(self.config)
Example #27
    def __init__(self, config, agent_name_=agent_name):
        Base_Agent.__init__(self, config, agent_name_=agent_name_)
        assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions"
        assert self.config.hyperparameters["Actor"]["final_layer_activation"] != "Softmax", "Final actor layer must not be softmax"
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                           key_to_use="Critic", override_seed=self.config.seed + 1)
        self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
                                                 lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
        self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
                                                   lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
        self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                           key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                            key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                    self.config.seed)
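        # the actor outputs action_size * 2 values, commonly interpreted as the mean and log-std of a Gaussian policy head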
        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size * 2, key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                          lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
        self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(torch.Tensor(self.environment.action_space.shape).to(self.device)).item() # heuristic value from the paper
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]

        self.add_extra_noise = self.hyperparameters["add_extra_noise"]
        if self.add_extra_noise:
            self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"],
                                  self.hyperparameters["theta"], self.hyperparameters["sigma"])

        self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]

        self.wandb_watch(self.actor_local, log_freq=self.config.wandb_model_log_freq)
Example #28
 def reset_game(self):
     """Resets the game information so we are ready to play a new episode"""
     Base_Agent.reset_game(self)
     if self.add_extra_noise: self.noise.reset()
Example #29
    def __init__(self, config, agent_name_=agent_name):
        Base_Agent.__init__(self, config, agent_name_=agent_name_)
        assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions"
        assert self.config.hyperparameters["Actor"][
            "final_layer_activation"] == "Softmax", "Final actor layer must be softmax"
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Critic")
        self.critic_local_2 = self.create_NN(input_dim=self.state_size,
                                             output_dim=self.action_size,
                                             key_to_use="Critic",
                                             override_seed=self.config.seed +
                                             1)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.critic_optimizer_2 = torch.optim.Adam(
            self.critic_local_2.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.critic_target = self.create_NN(input_dim=self.state_size,
                                            output_dim=self.action_size,
                                            key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size,
                                              output_dim=self.action_size,
                                              key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"],
            self.config.seed,
            device=self.device)

        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
        self.automatic_entropy_tuning = self.hyperparameters[
            "automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            # we set the max possible entropy as the target entropy
            self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98
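            # log(action_size) is the entropy of a uniform policy over the discrete actions; the 0.98 factor keeps the target just below that maximum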
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam(
                [self.log_alpha],
                lr=self.hyperparameters["Actor"]["learning_rate"],
                eps=1e-4)
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]
        assert not self.hyperparameters[
            "add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at the moment"
        self.add_extra_noise = False
        self.do_evaluation_iterations = self.hyperparameters[
            "do_evaluation_iterations"]

        self.wandb_watch(self.actor_local,
                         log_freq=self.config.wandb_model_log_freq)
Example #30
 def __init__(self, config, agent_name_=agent_name):
     DQN.__init__(self, config, agent_name_=agent_name_)
     self.q_network_target = self.create_NN(input_dim=self.state_size,
                                            output_dim=self.action_size)
     Base_Agent.copy_model_over(from_model=self.q_network_local,
                                to_model=self.q_network_target)
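The last snippet above turns the DQN agent into Double DQN simply by creating a target network and hard-copying the local network's weights into it; the part that actually distinguishes the two algorithms is how the bootstrap target is computed during learning. Below is a minimal sketch of that update rule for a sampled batch, assuming plain tensors and hypothetical q_local / q_target callables rather than the repository's actual learning code:

import torch

def double_dqn_targets(q_local, q_target, rewards, next_states, dones, gamma=0.99):
    """Double DQN bootstrap: the local network selects the next action,
    the target network evaluates it."""
    with torch.no_grad():
        best_next_actions = q_local(next_states).argmax(dim=1, keepdim=True)  # action selection
        next_q_values = q_target(next_states).gather(1, best_next_actions)    # action evaluation
        return rewards + gamma * next_q_values * (1 - dones)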