class DQN(Base_Agent):
    """A deep Q learning agent"""
    agent_name = "DQN"

    def __init__(self, config):
        """Set up the replay buffer, local Q-network, optimiser and exploration strategy.

        The checkpoint path is "<agent_name>_q_network_local.pt" inside
        config.model_path, falling back to the 'Models' directory when that
        setting is falsy. If config.load_model is truthy, saved weights are
        loaded before the optimiser is created.
        """
        Base_Agent.__init__(self, config)
        model_dir = self.config.model_path or 'Models'
        hyperparams = self.hyperparameters
        self.memory = Replay_Buffer(hyperparams["buffer_size"],
                                    hyperparams["batch_size"], config.seed)
        self.q_network_local = self.create_NN(input_dim=self.state_size,
                                              output_dim=self.action_size)
        checkpoint_name = "{}_q_network_local.pt".format(self.agent_name)
        self.q_network_local_path = os.path.join(model_dir, checkpoint_name)

        if self.config.load_model:
            self.locally_load_policy()
        learning_rate = self.hyperparameters["learning_rate"]
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(), lr=learning_rate, eps=1e-4)
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)

    def reset_game(self):
        """Reset via the parent class, then hand the configured learning rate and optimiser to update_learning_rate."""
        super(DQN, self).reset_game()
        configured_lr = self.hyperparameters["learning_rate"]
        self.update_learning_rate(configured_lr, self.q_network_optimizer)

    def step(self):
        """Plays the current episode until it is done, learning along the way when due.

        Each iteration picks and conducts an action, runs
        "learning_iterations" learning updates if it is time to learn, stores
        the experience and advances the state. The episode counter is
        incremented once the loop ends.
        """
        while not self.done:
            self.action = self.pick_action()
            self.conduct_action(self.action)
            if self.time_for_q_network_to_learn():
                updates_to_run = range(
                    self.hyperparameters["learning_iterations"])
                for _ in updates_to_run:
                    self.learn()
            self.save_experience()
            # Carry the observed next state into the next loop iteration.
            self.state = self.next_state
            self.global_step_number += 1
        self.episode_number += 1

    def pick_action(self, state=None):
        """Uses the local Q network and an epsilon greedy policy to pick an action.

        Args:
            state: observation to act on; defaults to self.state. Python ints
                and any NumPy integer scalar are wrapped in a 1-element array.

        Returns:
            The action returned by the exploration strategy.
        """
        if state is None:
            state = self.state
        # np.integer covers np.int64 as well as narrower integer scalars
        # (e.g. np.int32), which torch.from_numpy would otherwise reject.
        if isinstance(state, (int, np.integer)):
            state = np.array([state])
        # PyTorch only accepts mini-batches, so add a "fake" batch dimension
        # to turn the single observation into a batch of one.
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        if len(state.shape) < 2:
            state = state.unsqueeze(0)
        self.q_network_local.eval()  # evaluation mode for the forward pass
        try:
            with torch.no_grad():
                action_values = self.q_network_local(state)
        finally:
            # Always restore training mode, even if the forward pass raised.
            self.q_network_local.train()
        action = self.exploration_strategy.perturb_action_for_exploration_purposes(
            {
                "action_values": action_values,
                "turn_off_exploration": self.turn_off_exploration,
                "episode_number": self.episode_number
            })
        self.logger.info("Q values {} -- Action chosen {}".format(
            action_values, action))
        return action

    def learn(self, experiences=None):
        """Runs one optimisation step of the Q network.

        Args:
            experiences: optional (states, actions, rewards, next_states,
                dones) tuple; a batch is sampled when omitted.
        """
        if experiences is None:
            experiences = self.sample_experiences()
        states, actions, rewards, next_states, dones = experiences
        loss = self.compute_loss(states, next_states, rewards, actions, dones)

        # Log how often each action appears in the batch.
        action_counts = Counter(action.item() for action in actions)
        self.logger.info("Action counts {}".format(action_counts))

        clip_norm = self.hyperparameters["gradient_clipping_norm"]
        self.take_optimisation_step(self.q_network_optimizer,
                                    self.q_network_local, loss, clip_norm)

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Returns the MSE between the predicted Q values and the Q targets.

        The targets are computed under torch.no_grad().
        """
        with torch.no_grad():
            targets = self.compute_q_targets(next_states, rewards, dones)
        predictions = self.compute_expected_q_values(states, actions)
        return F.mse_loss(predictions, targets)

    def compute_q_targets(self, next_states, rewards, dones):
        """Builds the Q targets from the next-state values, rewards and done flags."""
        next_state_values = self.compute_q_values_for_next_states(next_states)
        return self.compute_q_values_for_current_states(
            rewards, next_state_values, dones)

    def compute_q_values_for_next_states(self, next_states):
        """Returns the largest local-network Q value per next state, with a trailing dimension of size 1.

        The output is detached from the autograd graph.
        """
        q_all_actions = self.q_network_local(next_states).detach()
        best_values, _ = q_all_actions.max(1)
        return best_values.unsqueeze(1)

    def compute_q_values_for_current_states(self, rewards, Q_targets_next,
                                            dones):
        """Returns rewards + discount_rate * Q_targets_next * (1 - dones).

        The (1 - dones) factor removes the bootstrapped term wherever dones is 1.
        """
        discount = self.hyperparameters["discount_rate"]
        bootstrapped = discount * Q_targets_next * (1 - dones)
        return rewards + bootstrapped

    def compute_expected_q_values(self, states, actions):
        """Returns the local network's Q value for the action taken in each state."""
        q_all_actions = self.q_network_local(states)
        # gather needs long (integer) indices, so convert the actions first.
        action_indices = actions.long()
        return q_all_actions.gather(1, action_indices)

    def time_for_q_network_to_learn(self):
        """Tells whether a learning update is due.

        The step-count check runs first; the buffer check is only consulted
        when the step-count check passes.
        """
        steps_ok = self.right_amount_of_steps_taken()
        if not steps_ok:
            return steps_ok
        return self.enough_experiences_to_learn_from()

    def right_amount_of_steps_taken(self):
        """Returns True when global_step_number is a multiple of "update_every_n_steps"."""
        update_interval = self.hyperparameters["update_every_n_steps"]
        remainder = self.global_step_number % update_interval
        return remainder == 0

    def sample_experiences(self):
        """Samples from the memory buffer and returns (states, actions, rewards, next_states, dones)."""
        (states, actions, rewards, next_states,
         dones) = self.memory.sample()
        return states, actions, rewards, next_states, dones

    def locally_save_policy(self):
        """Saves the local Q network's state dict to self.q_network_local_path.

        The parent directory is created first when it does not exist, so that
        saving into a fresh model directory does not fail.
        """
        # Original author's note (translated): saving the policy, still to be extended.
        target_dir = os.path.dirname(self.q_network_local_path)
        if target_dir:  # an empty dirname means the current directory
            os.makedirs(target_dir, exist_ok=True)
        torch.save(self.q_network_local.state_dict(),
                   self.q_network_local_path)

    def locally_load_policy(self):
        """Loads saved weights into the local Q network when a checkpoint file exists.

        Loading stays best-effort: a missing file is skipped silently and a
        failed load is reported on stdout instead of raising, leaving the
        network with its current weights.
        """
        print("locall_load_policy")
        if not os.path.isfile(self.q_network_local_path):
            return
        try:
            # Deserialise onto the CPU so checkpoints written on a GPU machine
            # also load on CPU-only hosts; load_state_dict then copies the
            # values into the network's existing parameters.
            state_dict = torch.load(self.q_network_local_path,
                                    map_location="cpu")
            self.q_network_local.load_state_dict(state_dict)
        except Exception as error:
            # Report the failure instead of swallowing it silently.
            print("Failed to load {}: {}".format(self.q_network_local_path,
                                                 error))
        else:
            print("load critic_local_path")


# Example #2
class HER_Base(object):
    """Contains methods needed to turn an algorithm into a hindsight experience replay (HER) algorithm"""
    def __init__(self, buffer_size, batch_size, HER_sample_proportion):
        """Creates the HER replay buffer and splits the batch size between the two buffers.

        Reads self.config.seed, which this method does not set itself.

        Args:
            buffer_size: capacity passed to the HER replay buffer.
            batch_size: total number of experiences per sampled batch.
            HER_sample_proportion: share of each batch taken from the HER
                buffer; the ordinary share is truncated to an int and the HER
                buffer gets the remainder.
        """
        self.HER_memory = Replay_Buffer(buffer_size, batch_size,
                                        self.config.seed)
        ordinary_share = 1.0 - HER_sample_proportion
        self.ordinary_buffer_batch_size = int(batch_size * ordinary_share)
        self.HER_buffer_batch_size = batch_size - self.ordinary_buffer_batch_size

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        self.state_dict = self.environment.reset()
        self.observation = self.state_dict["observation"]
        self.desired_goal = self.state_dict["desired_goal"]
        self.achieved_goal = self.state_dict["achieved_goal"]

        self.state = self.create_state_from_observation_and_desired_goal(
            self.observation, self.desired_goal)
        self.next_state = None
        self.action = None
        self.reward = None
        self.done = False

        self.episode_states = []
        self.episode_rewards = []
        self.episode_actions = []
        self.episode_next_states = []
        self.episode_dones = []

        self.episode_desired_goals = []
        self.episode_achieved_goals = []
        self.episode_observations = []

        self.episode_next_desired_goals = []
        self.episode_next_achieved_goals = []
        self.episode_next_observations = []

        self.total_episode_score_so_far = 0

    def track_changeable_goal_episodes_data(self):
        """Appends the latest transition to the per-episode history lists.

        Records the reward, action, done flag, state and next state, plus the
        desired goal, achieved goal and observation from both the current and
        the next state dict.
        """
        self.episode_rewards.append(self.reward)
        self.episode_actions.append(self.action)
        self.episode_dones.append(self.done)

        self.episode_states.append(self.state)
        self.episode_next_states.append(self.next_state)

        current = self.state_dict
        for key, history in (
                ("desired_goal", self.episode_desired_goals),
                ("achieved_goal", self.episode_achieved_goals),
                ("observation", self.episode_observations)):
            history.append(current[key])

        upcoming = self.next_state_dict
        for key, history in (
                ("desired_goal", self.episode_next_desired_goals),
                ("achieved_goal", self.episode_next_achieved_goals),
                ("observation", self.episode_next_observations)):
            history.append(upcoming[key])

    def conduct_action_in_changeable_goal_envs(self, action):
        """Adapts conduct_action from base agent so that can handle changeable goal environments"""
        self.next_state_dict, self.reward, self.done, _ = self.environment.step(
            action)
        self.total_episode_score_so_far += self.reward
        self.reward = max(min(self.reward, 1.0), -1.0)
        self.observation = self.next_state_dict["observation"]
        self.desired_goal = self.next_state_dict["desired_goal"]
        self.achieved_goal = self.next_state_dict["achieved_goal"]
        self.next_state = self.create_state_from_observation_and_desired_goal(
            self.observation, self.desired_goal)

    def create_state_from_observation_and_desired_goal(self, observation,
                                                       desired_goal):
        return np.concatenate((observation, desired_goal))

    def save_alternative_experience(self):
        """Saves the experiences as if the final state visited in the episode was the goal state"""
        new_goal = self.achieved_goal
        new_states = [
            self.create_state_from_observation_and_desired_goal(
                observation, new_goal)
            for observation in self.episode_observations
        ]
        new_next_states = [
            self.create_state_from_observation_and_desired_goal(
                observation, new_goal)
            for observation in self.episode_next_observations
        ]
        new_rewards = [
            self.environment.compute_reward(next_achieved_goal, new_goal, None)
            for next_achieved_goal in self.episode_next_achieved_goals
        ]
        self.HER_memory.add_experience(new_states, self.episode_actions,
                                       new_rewards, new_next_states,
                                       self.episode_dones)

    def sample_from_HER_and_Ordinary_Buffer(self):
        """Samples from the ordinary replay buffer and HER replay buffer according to a proportion specified in config"""
        states, actions, rewards, next_states, dones = self.memory.sample(
            self.ordinary_buffer_batch_size)
        HER_states, HER_actions, HER_rewards, HER_next_states, HER_dones = self.HER_memory.sample(
            self.HER_buffer_batch_size)

        states = torch.cat((states, HER_states))
        actions = torch.cat((actions, HER_actions))
        rewards = torch.cat((rewards, HER_rewards))
        next_states = torch.cat((next_states, HER_next_states))
        dones = torch.cat((dones, HER_dones))
        return states, actions, rewards, next_states, dones
class SAC(Base_Agent):
    """Soft Actor-Critic model based on the 2018 paper https://arxiv.org/abs/1812.05905 and on this github implementation
      https://github.com/pranz24/pytorch-soft-actor-critic. It is an actor-critic algorithm where the agent is also trained
      to maximise the entropy of their actions as well as their cumulative reward"""
    agent_name = "SAC"

    def __init__(self, config):
        """Builds the twin critics with their targets, the actor, the replay buffer and the entropy weight.

        Requires continuous actions and a non-softmax final actor layer. The
        actor outputs 2 * action_size values. When
        "automatically_tune_entropy_hyperparameter" is set, alpha is
        exp(log_alpha) with its own optimiser and the target entropy is minus
        the product of the action-space shape; otherwise alpha is the fixed
        "entropy_term_weight".
        """
        Base_Agent.__init__(self, config)
        assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions"
        assert self.config.hyperparameters["Actor"][
            "final_layer_activation"] != "Softmax", "Final actor layer must not be softmax"
        self.hyperparameters = config.hyperparameters
        # Critics score (state, action) pairs, hence the summed input width.
        critic_input_dim = self.state_size + self.action_size

        self.critic_local = self.create_NN(input_dim=critic_input_dim,
                                           output_dim=1,
                                           key_to_use="Critic")
        # The second critic is created with a different seed.
        self.critic_local_2 = self.create_NN(input_dim=critic_input_dim,
                                             output_dim=1,
                                             key_to_use="Critic",
                                             override_seed=self.config.seed + 1)
        critic_lr = self.hyperparameters["Critic"]["learning_rate"]
        self.critic_optimizer = torch.optim.Adam(
            self.critic_local.parameters(), lr=critic_lr, eps=1e-4)
        self.critic_optimizer_2 = torch.optim.Adam(
            self.critic_local_2.parameters(), lr=critic_lr, eps=1e-4)

        self.critic_target = self.create_NN(input_dim=critic_input_dim,
                                            output_dim=1,
                                            key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=critic_input_dim,
                                              output_dim=1,
                                              key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)

        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)

        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size * 2,
                                          key_to_use="Actor")
        actor_lr = self.hyperparameters["Actor"]["learning_rate"]
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                                lr=actor_lr,
                                                eps=1e-4)

        self.automatic_entropy_tuning = self.hyperparameters[
            "automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            action_shape = torch.Tensor(
                self.environment.action_space.shape).to(self.device)
            # heuristic value from the paper
            self.target_entropy = -torch.prod(action_shape).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam([self.log_alpha], lr=actor_lr, eps=1e-4)
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]

        self.add_extra_noise = self.hyperparameters["add_extra_noise"]
        if self.add_extra_noise:
            noise_settings = [
                self.hyperparameters[name] for name in ("mu", "theta", "sigma")
            ]
            self.noise = OU_Noise(self.action_size, self.config.seed,
                                  *noise_settings)

        self.do_evaluation_iterations = self.hyperparameters[
            "do_evaluation_iterations"]

    def save_result(self):
        """Records the episode score and rolling mean, overriding the Base Agent behaviour.

        The score is recorded once when this is episode 1 or evaluation
        iterations are disabled. Otherwise it is recorded only when
        (episode_number - 1) is a multiple of
        TRAINING_EPISODES_PER_EVAL_EPISODE, repeated that many times; on any
        other episode nothing is recorded.
        """
        if self.episode_number == 1 or not self.do_evaluation_iterations:
            copies = 1
        elif (self.episode_number -
              1) % TRAINING_EPISODES_PER_EVAL_EPISODE == 0:
            copies = TRAINING_EPISODES_PER_EVAL_EPISODE
        else:
            return

        self.game_full_episode_scores.extend(
            [self.total_episode_score_so_far] * copies)
        recent_scores = self.game_full_episode_scores[
            -1 * self.rolling_score_window:]
        self.rolling_results.extend([np.mean(recent_scores)] * copies)
        self.save_max_result_seen()

    def reset_game(self):
        """Resets the game through Base_Agent and, when extra noise is enabled, resets the noise process too."""
        Base_Agent.reset_game(self)
        if not self.add_extra_noise:
            return
        self.noise.reset()

    def step(self):
        """Runs one episode, storing experiences and learning whenever it is due.

        Episodes whose number is a multiple of
        TRAINING_EPISODES_PER_EVAL_EPISODE are evaluation episodes when
        evaluation iterations are enabled; their experiences are not stored.
        The stored done flag is forced to False once the step count reaches
        environment._max_episode_steps.
        """
        on_eval_schedule = self.episode_number % TRAINING_EPISODES_PER_EVAL_EPISODE == 0
        eval_ep = on_eval_schedule and self.do_evaluation_iterations
        self.episode_step_number_val = 0
        while not self.done:
            self.episode_step_number_val += 1
            self.action = self.pick_action(eval_ep)
            self.conduct_action(self.action)
            if self.time_for_critic_and_actor_to_learn():
                updates_to_run = range(self.hyperparameters[
                    "learning_updates_per_learning_session"])
                for _ in updates_to_run:
                    self.learn()
            if self.episode_step_number_val >= self.environment._max_episode_steps:
                mask = False
            else:
                mask = self.done
            if not eval_ep:
                transition = (self.state, self.action, self.reward,
                              self.next_state, mask)
                self.save_experience(experience=transition)
            self.state = self.next_state
            self.global_step_number += 1
        print(self.total_episode_score_so_far)
        if eval_ep:
            self.print_summary_of_latest_evaluation_episode()
        self.episode_number += 1

    def pick_action(self, eval_ep, state=None):
        """Chooses an action for the given (or current) state.

        1) In an evaluation episode the actor is used in evaluation mode.
        2) Otherwise, before "min_steps_before_learning" global steps, a
           random action is sampled from the environment's action space.
        3) Otherwise the actor is used in training mode.
        If add_extra_noise is set, noise is added in place to the chosen
        action in all three cases.
        """
        if state is None:
            state = self.state
        if eval_ep:
            action = self.actor_pick_action(state=state, eval=True)
        else:
            warmup_steps = self.hyperparameters["min_steps_before_learning"]
            if self.global_step_number < warmup_steps:
                action = self.environment.action_space.sample()
                print("Picking random action ", action)
            else:
                action = self.actor_pick_action(state=state)
        if self.add_extra_noise:
            action += self.noise.sample()
        return action

    def actor_pick_action(self, state=None, eval=False):
        """Uses the actor to pick an action for a single state.

        Args:
            state: state to act on; defaults to self.state.
            eval: when truthy, return the tanh of the mean action (no random
                sampling); otherwise return the randomly sampled action.

        Returns:
            The first row of the chosen action batch as a NumPy array.
        """
        if state is None:
            state = self.state
        state = torch.FloatTensor([state]).to(self.device)
        if len(state.shape) == 1:
            state = state.unsqueeze(0)
        # The result is only converted to NumPy, so no autograd graph is
        # needed in either mode.
        with torch.no_grad():
            sampled_action, _, mean_action = self.produce_action_and_action_info(
                state)
        action = mean_action if eval else sampled_action
        return action.detach().cpu().numpy()[0]

    def pick_best_action(self, state=None):
        """Returns tanh of the actor's mean output for the given (or current) state, without sampling.

        The actor is put in evaluation mode for the forward pass and back in
        training mode afterwards.
        """
        if state is None:
            state = self.state
        state = torch.FloatTensor([state]).to(self.device)
        if state.dim() == 1:
            state = state.unsqueeze(0)
        self.actor_local.eval()
        with torch.no_grad():
            actor_output = self.actor_local(state)
            # The first action_size columns hold the mean of the action distribution.
            mean = actor_output[:, :self.action_size]
            squashed_mean = torch.tanh(mean).detach()
        self.actor_local.train()
        return squashed_mean.cpu().numpy()[0]

    def produce_action_and_action_info(self, state):
        """Samples a tanh-squashed action from the actor's Gaussian output.

        Returns:
            Tuple (action, log_prob, tanh(mean)), where log_prob is summed
            over the action dimension and includes the tanh correction term.
        """
        actor_output = self.actor_local(state)
        # First half of the columns is the mean, second half the log std.
        mean = actor_output[:, :self.action_size]
        log_std = actor_output[:, self.action_size:]
        gaussian = Normal(mean, log_std.exp())
        # rsample uses the reparameterisation trick, keeping the sample differentiable.
        pre_squash = gaussian.rsample()
        action = torch.tanh(pre_squash)
        squash_correction = torch.log(1 - action.pow(2) + EPSILON)
        per_dim_log_prob = gaussian.log_prob(pre_squash) - squash_correction
        log_prob = per_dim_log_prob.sum(1, keepdim=True)
        return action, log_prob, torch.tanh(mean)

    def time_for_critic_and_actor_to_learn(self):
        """Tells whether the actor and critics should learn now.

        Checked in order: more than "min_steps_before_learning" global steps
        taken, enough experiences available, and the global step number is a
        multiple of "update_every_n_steps".
        """
        past_warmup = self.global_step_number > self.hyperparameters[
            "min_steps_before_learning"]
        if not past_warmup:
            return past_warmup
        buffer_ready = self.enough_experiences_to_learn_from()
        if not buffer_ready:
            return buffer_ready
        update_interval = self.hyperparameters["update_every_n_steps"]
        return self.global_step_number % update_interval == 0

    def learn(self):
        """Runs one learning iteration: both critics first, then the actor and (if enabled) the temperature."""
        batch = self.sample_experiences()
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = batch

        qf1_loss, qf2_loss = self.calculate_critic_losses(
            state_batch, action_batch, reward_batch, next_state_batch,
            mask_batch)
        self.update_critic_parameters(qf1_loss, qf2_loss)

        policy_loss, log_pi = self.calculate_actor_loss(state_batch)
        # No temperature loss when the entropy weight is fixed.
        alpha_loss = (self.calculate_entropy_tuning_loss(log_pi)
                      if self.automatic_entropy_tuning else None)
        self.update_actor_parameters(policy_loss, alpha_loss)

    def sample_experiences(self):
        """Returns whatever self.memory.sample() yields; learn() unpacks it into five batches."""
        return self.memory.sample()

    def calculate_critic_losses(self, state_batch, action_batch, reward_batch,
                                next_state_batch, mask_batch):
        """Returns the MSE losses of the two local critics against a shared soft Q target.

        The target, computed without gradients, is
        reward + (1 - mask) * discount_rate * (min of the two target critics
        at the next state/action - alpha * log-probability of that action).
        """
        discount = self.hyperparameters["discount_rate"]
        with torch.no_grad():
            next_action, next_log_pi, _ = self.produce_action_and_action_info(
                next_state_batch)
            next_critic_input = torch.cat((next_state_batch, next_action), 1)
            target_q1 = self.critic_target(next_critic_input)
            target_q2 = self.critic_target_2(next_critic_input)
            # Minimum of the two targets, minus the entropy term.
            soft_next_value = torch.min(
                target_q1, target_q2) - self.alpha * next_log_pi
            next_q_value = reward_batch + (
                1.0 - mask_batch) * discount * soft_next_value

        critic_input = torch.cat((state_batch, action_batch), 1)
        qf1 = self.critic_local(critic_input)
        qf2 = self.critic_local_2(critic_input)
        return F.mse_loss(qf1, next_q_value), F.mse_loss(qf2, next_q_value)

    def calculate_actor_loss(self, state_batch):
        """Returns (policy_loss, log_pi), where policy_loss is the mean of alpha * log_pi minus the smaller critic value."""
        action, log_pi, _ = self.produce_action_and_action_info(state_batch)
        critic_input = torch.cat((state_batch, action), 1)
        qf1_pi = self.critic_local(critic_input)
        qf2_pi = self.critic_local_2(critic_input)
        entropy_term = self.alpha * log_pi
        policy_loss = (entropy_term - torch.min(qf1_pi, qf2_pi)).mean()
        return policy_loss, log_pi

    def calculate_entropy_tuning_loss(self, log_pi):
        """Returns the temperature loss; only used when self.automatic_entropy_tuning is True.

        (log_pi + target_entropy) is detached, so gradients reach log_alpha only.
        """
        entropy_gap = (log_pi + self.target_entropy).detach()
        return -(self.log_alpha * entropy_gap).mean()

    def update_critic_parameters(self, critic_loss_1, critic_loss_2):
        """Takes an optimisation step on each critic, then soft-updates both target critics."""
        clip_norm = self.hyperparameters["Critic"]["gradient_clipping_norm"]
        critic_updates = (
            (self.critic_optimizer, self.critic_local, critic_loss_1),
            (self.critic_optimizer_2, self.critic_local_2, critic_loss_2),
        )
        for optimizer, network, loss in critic_updates:
            self.take_optimisation_step(optimizer, network, loss, clip_norm)

        # Targets are updated only after both critics have stepped.
        tau = self.hyperparameters["Critic"]["tau"]
        target_pairs = (
            (self.critic_local, self.critic_target),
            (self.critic_local_2, self.critic_target_2),
        )
        for local_network, target_network in target_pairs:
            self.soft_update_of_target_network(local_network, target_network,
                                               tau)

    def update_actor_parameters(self, actor_loss, alpha_loss):
        """Steps the actor and, when alpha_loss is given, the temperature parameter.

        After a temperature step, self.alpha is refreshed from log_alpha.
        """
        actor_clip_norm = self.hyperparameters["Actor"][
            "gradient_clipping_norm"]
        self.take_optimisation_step(self.actor_optimizer, self.actor_local,
                                    actor_loss, actor_clip_norm)
        if alpha_loss is None:
            return
        self.take_optimisation_step(self.alpha_optim, None, alpha_loss, None)
        self.alpha = self.log_alpha.exp()

    def print_summary_of_latest_evaluation_episode(self):
        """Prints the latest episode score framed by divider lines."""
        divider = "----------------------------"
        score_line = "Episode score {} ".format(
            self.total_episode_score_so_far)
        for line in (" ", divider, score_line, divider):
            print(line)

    def locally_save_policy(self):
        """Saves the state dicts of both critics, both target critics and the actor.

        The checkpoint is written to
        "<save_path>/<agent_name>_<episode_number>.pt". The save directory is
        created first when it does not exist.
        """
        network_names = ('critic_local', 'critic_local_2', 'critic_target',
                         'critic_target_2', 'actor_local')
        # Checkpoint keys match the attribute names of the networks.
        save_dict = {
            name: getattr(self, name).state_dict()
            for name in network_names
        }
        save_dir = self.hyperparameters['save_path']
        if save_dir:  # an empty path means the current directory
            os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(
            save_dir, self.agent_name + '_' + str(self.episode_number) + '.pt')
        torch.save(save_dict, save_path)

    def save_model(self):
        """Saves the model by delegating to locally_save_policy()."""
        self.locally_save_policy()

    def load_model(self, model_path=None):
        """Loads the critics, target critics and actor from a checkpoint file.

        Args:
            model_path: checkpoint to load; defaults to
                "<save_path>/<agent_name>.pt".

        A missing file is reported on stdout and nothing is loaded.
        """
        # NOTE(review): locally_save_policy writes
        # "<agent_name>_<episode_number>.pt", so this default path does not
        # match the files it produces; confirm which name is intended.
        if model_path is None:
            model_path = os.path.join(self.hyperparameters['save_path'],
                                      self.agent_name + '.pt')
        if not os.path.exists(model_path):
            print('No File:', model_path)
            return

        # Without CUDA, remap tensors saved on a GPU onto the CPU.
        if torch.cuda.is_available():
            save_dict = torch.load(model_path)
        else:
            save_dict = torch.load(model_path,
                                   map_location=torch.device('cpu'))
        self.critic_local.load_state_dict(save_dict['critic_local'])
        self.critic_local_2.load_state_dict(save_dict['critic_local_2'])
        self.critic_target.load_state_dict(save_dict['critic_target'])
        self.critic_target_2.load_state_dict(save_dict['critic_target_2'])
        self.actor_local.load_state_dict(save_dict['actor_local'])
        # Report the path that was loaded (previously this printed the
        # result of os.path.exists, i.e. always True).
        print('load model from', model_path)
class DDPG(Base_Agent):
    """A DDPG agent.

    Keeps a local and a target copy of an actor network (state -> action) and
    of a critic network (concatenated state and action -> one value), one Adam
    optimizer per local network, a replay buffer and an OU-noise exploration
    strategy.
    """
    agent_name = "DDPG"

    # Attribute names of the networks stored in / restored from a checkpoint;
    # the same names are used as keys of the checkpoint dict.
    _CHECKPOINT_NETWORKS = ('critic_local', 'critic_target', 'actor_local',
                            'actor_target')

    def __init__(self, config):
        """Build networks, optimizers, replay buffer and exploration strategy.

        Args:
            config: Run configuration; ``config.hyperparameters`` must provide
                the "Critic" and "Actor" sub-dicts and "batch_size".
        """
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        critic_input_dim = self.state_size + self.action_size
        self.critic_local = self.create_NN(input_dim=critic_input_dim,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=critic_input_dim,
                                            output_dim=1,
                                            key_to_use="Critic")
        # Copy the local critic into the target critic.
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Actor")
        # Copy the local actor into the target actor.
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
        self.exploration_strategy = OU_Noise_Exploration(self.config)

    def step(self):
        """Play one episode until ``self.done``, learning whenever it is time.

        Each iteration picks and conducts an action, runs
        "learning_updates_per_learning_session" critic/actor updates when
        ``time_for_critic_and_actor_to_learn`` says so, stores the experience
        and advances the state and the global step counter.
        """
        self.episode_step_number_val = 0
        while not self.done:
            self.episode_step_number_val += 1
            self.action = self.pick_action()
            self.conduct_action(self.action)
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters[
                        "learning_updates_per_learning_session"]):
                    (states, actions, rewards, next_states,
                     dones) = self.sample_experiences()
                    self.critic_learn(states, actions, rewards, next_states,
                                      dones)
                    self.actor_learn(states)
            self.save_experience()
            # The next iteration starts from the state just reached.
            self.state = self.next_state
            self.global_step_number += 1
        self.episode_number += 1

    def sample_experiences(self):
        """Return one sample drawn from the replay buffer."""
        return self.memory.sample()

    def _actor_output(self, state=None):
        """Return the local actor's output for ``state`` as a numpy array.

        When ``state`` is ``None`` the agent's current ``self.state`` is
        converted to a float tensor with a leading batch dimension. The batch
        dimension is kept in the returned array.
        """
        if state is None:
            state = torch.from_numpy(self.state).float().unsqueeze(0).to(
                self.device)
        self.actor_local.eval()
        try:
            with torch.no_grad():
                action = self.actor_local(state).cpu().data.numpy()
        finally:
            # Always return to training mode, even if the forward pass fails.
            self.actor_local.train()
        return action

    def pick_action(self, state=None):
        """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
        action = self._actor_output(state)
        action = self.exploration_strategy.perturb_action_for_exploration_purposes(
            {
                "action": action,
                "device": self.device
            })
        return action.squeeze(0)

    def pick_best_action(self, state=None):
        """Picks an action using the actor network without noise"""
        return self._actor_output(state).squeeze(0)

    def critic_learn(self, states, actions, rewards, next_states, dones):
        """Run one optimisation step on the local critic, then soft-update the target critic."""
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        critic_params = self.hyperparameters["Critic"]
        self.take_optimisation_step(self.critic_optimizer, self.critic_local,
                                    loss,
                                    critic_params["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.critic_local,
                                           self.critic_target,
                                           critic_params["tau"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Return the MSE between the local critic's values and the critic targets."""
        # Targets are constants for the loss, so no gradient is tracked.
        with torch.no_grad():
            critic_targets = self.compute_critic_targets(
                next_states, rewards, dones)
        critic_expected = self.compute_expected_critic_values(states, actions)
        return functional.mse_loss(critic_expected, critic_targets)

    def compute_critic_targets(self, next_states, rewards, dones):
        """Computes the critic target values to be used in the loss for the critic"""
        critic_targets_next = self.compute_critic_values_for_next_states(
            next_states)
        return self.compute_critic_values_for_current_states(
            rewards, critic_targets_next, dones)

    def compute_critic_values_for_next_states(self, next_states):
        """Return the target critic's value of the target actor's action in each next state."""
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            critic_targets_next = self.critic_target(
                torch.cat((next_states, actions_next), 1))
        return critic_targets_next

    def compute_critic_values_for_current_states(self, rewards,
                                                 critic_targets_next, dones):
        """Return ``rewards + discount_rate * critic_targets_next * (1 - dones)``."""
        discount_rate = self.hyperparameters["discount_rate"]
        return rewards + (discount_rate * critic_targets_next *
                          (1.0 - dones))

    def compute_expected_critic_values(self, states, actions):
        """Return the local critic's value for each (state, action) pair."""
        return self.critic_local(torch.cat((states, actions), 1))

    def time_for_critic_and_actor_to_learn(self):
        """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
        actor and critic"""
        on_update_step = (self.global_step_number %
                          self.hyperparameters["update_every_n_steps"] == 0)
        return self.enough_experiences_to_learn_from() and on_update_step

    def actor_learn(self, states):
        """Run one optimisation step on the local actor, then soft-update the target actor."""
        actor_params = self.hyperparameters["Actor"]
        if self.done:  # we only update the learning rate at end of each episode
            self.update_learning_rate(actor_params["learning_rate"],
                                      self.actor_optimizer)
        actor_loss = self.calculate_actor_loss(states)
        self.take_optimisation_step(self.actor_optimizer, self.actor_local,
                                    actor_loss,
                                    actor_params["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.actor_local,
                                           self.actor_target,
                                           actor_params["tau"])

    def calculate_actor_loss(self, states):
        """Return the negated mean local-critic value of the local actor's actions."""
        actions_pred = self.actor_local(states)
        critic_values = self.critic_local(torch.cat((states, actions_pred),
                                                    1))
        return -critic_values.mean()

    def locally_save_policy(self):
        """Save all four networks to ``<save_path>/<agent_name>_<episode_number>.pt``.

        ``save_path`` comes from ``self.hyperparameters['save_path']``; the
        directory is created if it does not exist yet.
        """
        save_dict = {
            name: getattr(self, name).state_dict()
            for name in self._CHECKPOINT_NETWORKS
        }
        save_dir = self.hyperparameters['save_path']
        # torch.save does not create missing parent directories; an empty
        # save_dir means "current directory" and needs no creation.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(
            save_dir,
            self.agent_name + '_' + str(self.episode_number) + '.pt')
        torch.save(save_dict, save_path)

    def save_model(self):
        """Save the current networks by delegating to ``locally_save_policy``."""
        self.locally_save_policy()

    def load_model(self, model_path=None):
        """Restore all four networks from a checkpoint file.

        Args:
            model_path: Checkpoint to read. When ``None`` the path
                ``<save_path>/<agent_name>.pt`` is used.

        A missing file is reported on stdout and the networks are left
        untouched.
        """
        if model_path is None:
            model_path = os.path.join(self.hyperparameters['save_path'],
                                      self.agent_name + '.pt')
        if not os.path.exists(model_path):
            print('No File:', model_path)
            return

        # Without CUDA, tensors stored from a GPU run must be remapped to CPU.
        map_location = (None if torch.cuda.is_available() else
                        torch.device('cpu'))
        save_dict = torch.load(model_path, map_location=map_location)
        for name in self._CHECKPOINT_NETWORKS:
            getattr(self, name).load_state_dict(save_dict[name])
        # Report the path itself; os.path.exists(model_path) is always True
        # at this point and carries no information.
        print('load model from', model_path)
class DDPG(Base_Agent):
    """A DDPG agent that can also dump rendered frames of its episodes.

    Keeps a local and a target copy of an actor network (state -> action) and
    of a critic network (concatenated state and action -> one value), one Adam
    optimizer per local network, a replay buffer and an OU-noise exploration
    strategy. Rendered frames are written as ``.npy`` files into the
    directory named by ``self.file_name``.
    """
    agent_name = "DDPG"

    def __init__(self, config):
        """Build networks, optimizers, replay buffer and exploration strategy.

        Args:
            config: Run configuration; ``config.hyperparameters`` must provide
                the "Critic" and "Actor" sub-dicts and "batch_size".
        """
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        critic_input_dim = self.state_size + self.action_size
        self.critic_local = self.create_NN(input_dim=critic_input_dim,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=critic_input_dim,
                                            output_dim=1,
                                            key_to_use="Critic")
        # Copy the local critic into the target critic.
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hyperparameters["Critic"]["learning_rate"],
            eps=1e-4)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"], self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size,
                                          key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size,
                                           output_dim=self.action_size,
                                           key_to_use="Actor")
        # Copy the local actor into the target actor.
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=self.hyperparameters["Actor"]["learning_rate"],
            eps=1e-4)
        self.exploration_strategy = OU_Noise_Exploration(self.config)

        # Output directory for frame dumps. It is defined unconditionally
        # because step() also writes the "maxscore" dumps when video_mode is
        # off.
        self.file_name = self.environment_title + "_" + self.agent_name + "_videos"
        if self.video_mode:
            os.makedirs(self.file_name, exist_ok=True)
        # Frame lists of the most recent (at most 10) episodes whose score
        # exceeded the threshold checked in step().
        self.save_max_result_list_list = []

    def _save_frames(self, stem, frames):
        """Write ``frames`` as a numpy array to ``<self.file_name>/<stem>.npy``."""
        os.makedirs(self.file_name, exist_ok=True)
        np.save(os.path.join(self.file_name, stem), np.array(frames))

    def step(self):
        """Play one episode until ``self.done``, learning whenever it is time.

        Besides the usual act / learn / store loop this method:

        * records every rendered frame of the episode when ``video_mode`` is
          on and the episode is among the last ten of the run, and saves them
          as ``episode<N>.npy``;
        * keeps the frames of the last ten episodes scoring above -0.2 and
          writes them as ``maxscore<i>.npy`` at the end of the final episode.

        For the "CartPole" environment nothing is rendered, so its frame
        lists stay empty.
        """
        record_video = (self.video_mode and
                        self.config.num_episodes_to_run - 10 <=
                        self.episode_number)
        render_list = []
        save_max_score_list = []
        is_cart_pole = self.get_environment_title() == "CartPole"

        while not self.done:
            self.action = self.pick_action()
            if is_cart_pole:
                # The environment is stepped with the index of the largest
                # actor output, while self.action becomes the matching
                # one-hot vector.
                go_action = np.argmax(self.action)
                self.action = np.zeros(2)
                self.action[go_action] = 1
                self.conduct_action(go_action)
            else:
                self.conduct_action(self.action)
                img = self.environment.render('rgb_array')
                if record_video:
                    render_list.append(img)
                save_max_score_list.append(img)

            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters[
                        "learning_updates_per_learning_session"]):
                    (states, actions, rewards, next_states,
                     dones) = self.sample_experiences()
                    self.critic_learn(states, actions, rewards, next_states,
                                      dones)
                    self.actor_learn(states)

            self.save_experience()
            # The next iteration starts from the state just reached.
            self.state = self.next_state
            self.global_step_number += 1

        if record_video:
            self._save_frames('episode' + str(self.episode_number + 1),
                              render_list)

        if self.total_episode_score_so_far > -0.2:
            # Keep only the ten most recent qualifying episodes.
            if len(self.save_max_result_list_list) == 10:
                self.save_max_result_list_list.pop(0)
            self.save_max_result_list_list.append(save_max_score_list)

        if self.config.num_episodes_to_run == self.episode_number + 1:
            # Final episode of the run: dump the retained frame lists.
            for i, frames in enumerate(self.save_max_result_list_list,
                                       start=1):
                self._save_frames('maxscore' + str(i), frames)

        self.episode_number += 1

    def sample_experiences(self):
        """Return one sample drawn from the replay buffer."""
        return self.memory.sample()

    def pick_action(self, state=None):
        """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
        if state is None:
            state = torch.from_numpy(self.state).float().unsqueeze(0).to(
                self.device)
        self.actor_local.eval()
        try:
            with torch.no_grad():
                action = self.actor_local(state).cpu().data.numpy()
        finally:
            # Always return to training mode, even if the forward pass fails.
            self.actor_local.train()
        action = self.exploration_strategy.perturb_action_for_exploration_purposes(
            {"action": action})
        return action.squeeze(0)

    def critic_learn(self, states, actions, rewards, next_states, dones):
        """Run one optimisation step on the local critic, then soft-update the target critic."""
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        critic_params = self.hyperparameters["Critic"]
        self.take_optimisation_step(self.critic_optimizer, self.critic_local,
                                    loss,
                                    critic_params["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.critic_local,
                                           self.critic_target,
                                           critic_params["tau"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Return the MSE between the local critic's values and the critic targets."""
        # Targets are constants for the loss, so no gradient is tracked.
        with torch.no_grad():
            critic_targets = self.compute_critic_targets(
                next_states, rewards, dones)
        critic_expected = self.compute_expected_critic_values(states, actions)
        return functional.mse_loss(critic_expected, critic_targets)

    def compute_critic_targets(self, next_states, rewards, dones):
        """Computes the critic target values to be used in the loss for the critic"""
        critic_targets_next = self.compute_critic_values_for_next_states(
            next_states)
        return self.compute_critic_values_for_current_states(
            rewards, critic_targets_next, dones)

    def compute_critic_values_for_next_states(self, next_states):
        """Return the target critic's value of the target actor's action in each next state."""
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            critic_targets_next = self.critic_target(
                torch.cat((next_states, actions_next), 1))
        return critic_targets_next

    def compute_critic_values_for_current_states(self, rewards,
                                                 critic_targets_next, dones):
        """Return ``rewards + discount_rate * critic_targets_next * (1 - dones)``."""
        discount_rate = self.hyperparameters["discount_rate"]
        return rewards + (discount_rate * critic_targets_next *
                          (1.0 - dones))

    def compute_expected_critic_values(self, states, actions):
        """Return the local critic's value for each (state, action) pair."""
        return self.critic_local(torch.cat((states, actions), 1))

    def time_for_critic_and_actor_to_learn(self):
        """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
        actor and critic"""
        on_update_step = (self.global_step_number %
                          self.hyperparameters["update_every_n_steps"] == 0)
        return self.enough_experiences_to_learn_from() and on_update_step

    def actor_learn(self, states):
        """Run one optimisation step on the local actor, then soft-update the target actor."""
        actor_params = self.hyperparameters["Actor"]
        if self.done:  # we only update the learning rate at end of each episode
            self.update_learning_rate(actor_params["learning_rate"],
                                      self.actor_optimizer)
        actor_loss = self.calculate_actor_loss(states)
        self.take_optimisation_step(self.actor_optimizer, self.actor_local,
                                    actor_loss,
                                    actor_params["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.actor_local,
                                           self.actor_target,
                                           actor_params["tau"])

    def calculate_actor_loss(self, states):
        """Return the negated mean local-critic value of the local actor's actions."""
        actions_pred = self.actor_local(states)
        critic_values = self.critic_local(torch.cat((states, actions_pred),
                                                    1))
        return -critic_values.mean()
class DQN(Base_Agent):
    """A deep Q learning agent.

    Uses a single Q network (no separate target network) trained with SGD, a
    replay buffer and an epsilon-greedy exploration strategy.
    """
    agent_name = "DQN"

    def __init__(self, config):
        """Build the replay buffer, the Q network, its optimizer and the exploration strategy."""
        Base_Agent.__init__(self, config)
        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"],
                                    config.seed)
        self.q_network_local = self.create_NN(input_dim=self.state_size,
                                              output_dim=self.action_size)
        self.q_network_optimizer = optim.SGD(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"],
            weight_decay=5e-4)
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)

    def reset_game(self):
        """Reset the game via the base class and refresh the optimizer's learning rate."""
        # Zero-argument super() binds to this very class even when another
        # class named DQN is defined in the same module.
        super().reset_game()
        self.update_learning_rate(self.hyperparameters["learning_rate"],
                                  self.q_network_optimizer)

    def _learn_with_environment_paused(self):
        """Run exactly one learning iteration, pausing the environment if it supports that.

        Environments exposing callable ``pause`` and ``resume`` attributes
        are paused for the duration of the update and always resumed
        afterwards; all other environments are left alone.
        """
        pause = getattr(self.environment, "pause", None)
        resume = getattr(self.environment, "resume", None)
        if not (callable(pause) and callable(resume)):
            self.learn()
            return
        pause()
        try:
            self.learn()
        finally:
            resume()

    def step(self):
        """Runs a step within a game including a learning step if required"""
        while not self.done:
            self.action = self.pick_action()
            self.conduct_action(self.action)
            if self.time_for_q_network_to_learn():
                for _ in range(self.hyperparameters["learning_iterations"]):
                    self._learn_with_environment_paused()
            self.save_experience()
            # The next iteration starts from the state just reached.
            self.state = self.next_state
            self.global_step_number += 1
        self.episode_number += 1

    def pick_action(self, state=None):
        """Uses the local Q network and an epsilon greedy policy to pick an action"""
        if state is None:
            state = self.state
        if isinstance(state, (np.int64, int)):
            state = np.array([state])
        # PyTorch only accepts mini-batches and not single observations so we
        # use unsqueeze to add a "fake" batch dimension.
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        if len(state.shape) < 2:
            state = state.unsqueeze(0)
        self.q_network_local.eval()  # puts network in evaluation mode
        try:
            with torch.no_grad():
                action_values = self.q_network_local(state)
        finally:
            # puts network back in training mode, even if the forward pass fails
            self.q_network_local.train()

        force_explore = (self.config.force_explore_mode and
                         self.need_to_force_explore())
        if force_explore:
            print('explore...')

        action = self.exploration_strategy.perturb_action_for_exploration_purposes(
            {
                "action_values": action_values,
                "turn_off_exploration": self.turn_off_exploration,
                "episode_number": self.episode_number,
                "force_explore": force_explore
            })
        return action

    def learn(self, experiences=None):
        """Run one optimisation step on the Q network.

        Args:
            experiences: Optional ``(states, actions, rewards, next_states,
                dones)`` tuple; when ``None`` a batch is sampled from the
                replay buffer.
        """
        if experiences is None:
            experiences = self.sample_experiences()
        states, actions, rewards, next_states, dones = experiences
        loss = self.compute_loss(states, next_states, rewards, actions, dones)

        actions_list = [action_X.item() for action_X in actions]
        self.logger.info("Action counts {}".format(Counter(actions_list)))
        self.take_optimisation_step(
            self.q_network_optimizer, self.q_network_local, loss,
            self.hyperparameters["gradient_clipping_norm"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Return the summed squared error between expected Q values and Q targets."""
        with torch.no_grad():
            Q_targets = self.compute_q_targets(next_states, rewards, dones)
        Q_expected = self.compute_expected_q_values(states, actions)
        # reduction='sum' is the supported spelling of the deprecated
        # size_average=False.
        return nn.MSELoss(reduction='sum')(Q_expected, Q_targets)

    def compute_q_targets(self, next_states, rewards, dones):
        """Computes the q_targets we will compare to predicted q values to create the loss to train the Q network"""
        Q_targets_next = self.compute_q_values_for_next_states(next_states)
        return self.compute_q_values_for_current_states(
            rewards, Q_targets_next, dones)

    def compute_q_values_for_next_states(self, next_states):
        """Return the largest local-network Q value of each next state as a column."""
        next_q_values = self.q_network_local(next_states).detach()
        return next_q_values.max(1)[0].unsqueeze(1)

    def compute_q_values_for_current_states(self, rewards, Q_targets_next,
                                            dones):
        """Return ``rewards + discount_rate * Q_targets_next * (1 - dones)``."""
        discount_rate = self.hyperparameters["discount_rate"]
        return rewards + (discount_rate * Q_targets_next * (1 - dones))

    def compute_expected_q_values(self, states, actions):
        """Return the local network's Q value of the action taken in each state."""
        # Actions must be long so they can be used as an index by gather.
        return self.q_network_local(states).gather(1, actions.long())

    def time_for_q_network_to_learn(self):
        """Returns boolean indicating whether enough steps have been taken for learning to begin and there are
        enough experiences in the replay buffer to learn from"""
        return (self.right_amount_of_steps_taken() and
                self.enough_experiences_to_learn_from())

    def right_amount_of_steps_taken(self):
        """Returns boolean indicating whether enough steps have been taken for learning to begin"""
        return (self.global_step_number %
                self.hyperparameters["update_every_n_steps"] == 0)

    def sample_experiences(self):
        """Draws a random sample of experience from the memory buffer"""
        states, actions, rewards, next_states, dones = self.memory.sample()
        return states, actions, rewards, next_states, dones

    def locally_save_policy(self, best=True, episode=None):
        """Write a checkpoint below ``Models/<env_title>/<agent_name>/<log_base>``.

        The checkpoint holds the episode number and the local Q network; for
        agents whose ``agent_name`` is not "DQN" it also holds the target
        network.

        Args:
            best: When true, every existing ``rolling_score*`` file in the
                directory is removed and the checkpoint is named after the
                latest rolling score. Otherwise it is named
                ``<agent_name>_<episode_number>.model``.
            episode: Unused; kept for interface compatibility.
        """
        state = {
            'episode': self.episode_number,
            'q_network_local': self.q_network_local.state_dict(),
        }
        if self.agent_name != "DQN":
            state['q_network_target'] = self.q_network_target.state_dict()

        model_root = os.path.join('Models', self.config.env_title,
                                  self.agent_name, self.config.log_base)
        os.makedirs(model_root, exist_ok=True)

        if best:
            # Only one "best" checkpoint is kept per directory.
            for stale in glob.glob(os.path.join(model_root,
                                                'rolling_score*')):
                os.remove(stale)
            save_name = model_root + "/rolling_score_%.4f.model" % (
                self.rolling_results[-1])
        else:
            save_name = model_root + "/%s_%d.model" % (self.agent_name,
                                                       self.episode_number)
        torch.save(state, save_name)
        self.logger.info('Model-%s save success...' % (save_name))

    def load_resume(self, resume_path):
        """Load network weights from ``resume_path`` and restore the episode counter.

        The episode number is parsed from the first number in the file name:
        an integer (as in ``<agent_name>_<episode>.model``) is taken as the
        episode, while a decimal number (as in ``rolling_score_<score>.model``)
        or a name without digits yields 0. With ``config.retrain`` set the
        counter is reset to 0 regardless.
        """
        # Without CUDA, tensors stored from a GPU run must be remapped to CPU.
        map_location = (None if torch.cuda.is_available() else
                        torch.device('cpu'))
        save = torch.load(resume_path, map_location=map_location)
        self.q_network_local.load_state_dict(save['q_network_local'],
                                             strict=True)
        if self.agent_name != "DQN":
            self.q_network_target.load_state_dict(save['q_network_target'],
                                                  strict=True)
        self.logger.info('load resume model success...')

        episode = 0
        file_name = os.path.basename(resume_path)
        number = re.search(r"\d+\.?\d*", file_name)
        if number is not None:
            whole, _, fraction = number.group(0).partition('.')
            if not fraction:
                # Stored as int so that later arithmetic and "%d" formatting
                # on episode_number keep working.
                episode = int(whole)

        self.episode_number = 0 if self.config.retrain else episode
# Example #7
# 0
class DDQN_Wrapper(Base_Agent):
    """Double DQN agent whose action set is extended with macro actions.

    ``global_action_id_to_primitive_actions`` maps every action id to the
    sequence of primitive actions it stands for.  Ids below ``action_size``
    are scored directly by the Q network; ids at or above ``action_size`` are
    macro actions whose value is estimated by rolling an "oracle" network
    (which predicts next state and reward) forward through the macro action
    and then bootstrapping with the Q network.

    The state fed to the networks is the environment observation with one
    extra element appended: the step counter divided by 200.
    """

    def __init__(self,
                 config,
                 global_action_id_to_primitive_actions,
                 action_length_reward_bonus,
                 end_of_episode_symbol="/"):
        """Builds the replay buffer, exploration strategy, oracle and Q networks.

        Args:
            config: agent configuration; forwarded to ``Base_Agent``.
            global_action_id_to_primitive_actions: mapping from action id to
                the sequence of primitive actions that id expands to.
            action_length_reward_bonus: bonus added to every predicted reward
                while estimating the value of a macro action.
            end_of_episode_symbol: marker appended to the recorded action list
                of each episode.
        """
        super().__init__(config)
        self.end_of_episode_symbol = end_of_episode_symbol
        self.global_action_id_to_primitive_actions = global_action_id_to_primitive_actions
        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"],
                                    self.hyperparameters["batch_size"],
                                    config.seed)
        self.exploration_strategy = Epsilon_Greedy_Exploration(config)

        self.oracle = self.create_oracle()
        self.oracle_optimizer = optim.Adam(
            self.oracle.parameters(), lr=self.hyperparameters["learning_rate"])

        # +1 on the input because the normalised step counter is appended to
        # every state (see step()).
        self.q_network_local = self.create_NN(input_dim=self.state_size + 1,
                                              output_dim=self.action_size)
        self.q_network_local.print_model_summary()
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])
        self.q_network_target = self.create_NN(input_dim=self.state_size + 1,
                                               output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)

        self.action_length_reward_bonus = action_length_reward_bonus
        self.abandon_ship = config.hyperparameters["abandon_ship"]

    def create_oracle(self):
        """Creates the network we will use to predict the next state and reward.

        The input is the augmented state plus one action value
        (``state_size + 2`` features); the two output heads have
        ``state_size + 1`` and ``1`` units respectively.
        """
        oracle_hyperparameters = copy.deepcopy(self.hyperparameters)
        oracle_hyperparameters["columns_of_data_to_be_embedded"] = []
        oracle_hyperparameters["embedding_dimensions"] = []
        oracle_hyperparameters["linear_hidden_units"] = [5, 5]
        oracle_hyperparameters["final_layer_activation"] = [None, "tanh"]
        oracle = self.create_NN(input_dim=self.state_size + 2,
                                output_dim=[self.state_size + 1, 1],
                                hyperparameters=oracle_hyperparameters)
        oracle.print_model_summary()
        return oracle

    def run_n_episodes(self, num_episodes,
                       episodes_to_run_with_no_exploration):
        """Plays ``num_episodes`` episodes and returns what was recorded.

        Exploration is switched on at the start and switched off once exactly
        ``episodes_to_run_with_no_exploration`` episodes remain.

        Returns:
            A tuple ``(episode_records, macro_actions)`` where
            ``episode_records`` holds one ``[score, actions, exploration_off]``
            entry per episode and ``macro_actions`` is the flat list of macro
            actions chosen over the whole round.
        """
        self.turn_on_any_epsilon_greedy_exploration()
        self.round_of_macro_actions = []
        self.episode_actions_scores_and_exploration_status = []
        num_episodes_to_get_to = self.episode_number + num_episodes
        while self.episode_number < num_episodes_to_get_to:
            self.reset_game()
            self.step()
            self.save_and_print_result()
            if num_episodes_to_get_to - self.episode_number == episodes_to_run_with_no_exploration:
                self.turn_off_any_epsilon_greedy_exploration()

        # Sanity checks on the shape of the recorded data.
        records = self.episode_actions_scores_and_exploration_status
        assert len(records) == num_episodes, "{} vs. {}".format(
            len(records), num_episodes)
        assert len(records[0]) == 3
        assert records[0][2] in [True, False]
        assert isinstance(records[0][1], list)
        assert isinstance(records[0][1][0], int)
        assert isinstance(records[0][0], int) or isinstance(
            records[0][0], float)
        return self.episode_actions_scores_and_exploration_status, self.round_of_macro_actions

    def step(self):
        """Runs one full episode, learning along the way when it is time to.

        Each loop iteration picks a (macro) action and executes its primitive
        actions one by one.  A macro action is cut short when the episode
        ends, when the oracle is "surprised" by the observed next state, or
        (if ``abandon_ship`` is set) when the Q network rates the upcoming
        primitive action too poorly.
        """
        step_number = 0.0
        self.state = np.append(
            self.state, step_number /
            200.0)  #Divide by 200 because there are 200 steps in cart pole

        self.total_episode_score_so_far = 0
        episode_macro_actions = []
        while not self.done:
            surprised = False
            macro_action = self.pick_action()
            primitive_actions = self.global_action_id_to_primitive_actions[
                macro_action]
            primitive_actions_conducted = 0
            for action in primitive_actions:
                # The first primitive action is always executed; later ones
                # may be abandoned.
                if self.abandon_ship and primitive_actions_conducted > 0:
                    if self.abandon_macro_action(action):
                        break

                step_number += 1
                self.action = action
                self.next_state, self.reward, self.done, _ = self.environment.step(
                    action)
                self.next_state = np.append(
                    self.next_state, step_number / 200.0
                )  #Divide by 200 because there are 200 steps in cart pole

                # The episode score uses the raw reward; clipping only affects
                # the reward that is stored for learning.
                self.total_episode_score_so_far += self.reward
                if self.hyperparameters["clip_rewards"]:
                    self.reward = max(min(self.reward, 1.0), -1.0)
                primitive_actions_conducted += 1
                self.track_episodes_data()
                self.save_experience()

                # Surprise is only checked inside multi-step macro actions.
                if len(primitive_actions) > 1:
                    surprised = self.am_i_surprised()

                self.state = self.next_state
                # NOTE(review): global_step_number is not incremented in this
                # method; unless a base-class helper does it, the
                # "update_every_n_steps" check always sees the same value --
                # confirm this is intended.
                if self.time_for_q_network_to_learn():
                    for _ in range(
                            self.hyperparameters["learning_iterations"]):
                        self.q_network_learn()
                        self.oracle_learn()
                if self.done or surprised: break
            episode_macro_actions.append(macro_action)
            self.round_of_macro_actions.append(macro_action)
        if random.random() < 0.1: print(Counter(episode_macro_actions))
        self.save_episode_actions_with_score()
        self.episode_number += 1
        self.logger.info("END OF EPISODE")

    def am_i_surprised(self):
        """Returns boolean indicating whether the next_state was a surprise or not.

        The oracle predicts the next state from ``self.state`` and
        ``self.action``; the transition counts as a surprise when the mean
        squared error against ``self.next_state`` exceeds 0.5.
        """
        with torch.no_grad():
            state = torch.from_numpy(self.state).float().unsqueeze(0).to(
                self.device)
            # Build the helper tensors on the same device as the state so the
            # concatenation and the loss also work when running on a GPU.
            action = torch.tensor([[self.action]],
                                  dtype=torch.float,
                                  device=self.device)

            states_and_actions = torch.cat(
                (state, action),
                dim=1)  #must change this for all games besides cart pole
            predictions = self.oracle(states_and_actions)
            # The last output is the predicted reward; everything before it is
            # the predicted next state.
            predicted_next_state = predictions[0, :-1]

            actual_next_state = torch.as_tensor(self.next_state,
                                                dtype=torch.float,
                                                device=self.device)
            difference = F.mse_loss(predicted_next_state, actual_next_state)
            if difference > 0.5:
                print("Surprise! Loss {} -- {} vs. {}".format(
                    difference, predicted_next_state, self.next_state))
                return True
            else:
                return False

    def abandon_macro_action(self, action):
        """Returns boolean indicating whether to abandon macro action or not.

        The macro action is abandoned when the Q value of the upcoming
        primitive ``action`` is below a fraction of the best primitive Q value
        in the current state: 0.7x when the best value is positive, 1.3x
        otherwise.
        """
        state = torch.from_numpy(self.state).float().unsqueeze(0).to(
            self.device)
        with torch.no_grad():
            primitive_q_values = self.calculate_q_values(
                state, local=True, primitive_actions_only=True)
        q_value_highest = torch.max(primitive_q_values)
        q_values_action = primitive_q_values[:, action]
        if q_value_highest > 0.0: multiplier = 0.7
        else: multiplier = 1.3
        if q_values_action < multiplier * q_value_highest:
            print("BREAKING Action {} -- Q Values {}".format(
                action, primitive_q_values))
            return True
        else:
            return False

    def pick_action(self, state=None):
        """Uses the local Q network and an epsilon greedy policy to pick an action.

        Args:
            state: state to act in; defaults to ``self.state``.

        Returns:
            The action id selected by the exploration strategy; this may be a
            macro action id.
        """
        if state is None: state = self.state
        if isinstance(state, (np.int64, int)):
            state = np.array([state])
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        if len(state.shape) < 2: state = state.unsqueeze(0)
        self.q_network_local.eval()  #puts network in evaluation mode
        with torch.no_grad():
            action_values = self.calculate_q_values(
                state, local=True, primitive_actions_only=False)
        self.q_network_local.train()  #puts network back in training mode
        action = self.exploration_strategy.perturb_action_for_exploration_purposes(
            {
                "action_values": action_values,
                "turn_off_exploration": self.turn_off_exploration,
                "episode_number": self.episode_number
            })
        self.logger.info("Q values {} -- Action chosen {}".format(
            action_values, action))
        return action

    def calculate_q_values(self, states, local, primitive_actions_only):
        """Calculates the q values using the local or the target q network.

        Args:
            states: batch of states to evaluate.
            local: use the local network when true, the target network
                otherwise.
            primitive_actions_only: when false and macro actions exist, their
                estimated values are appended after the primitive ones (this
                path requires a batch of exactly one state).

        Returns:
            Tensor of Q values with one row per state.
        """
        if local:
            primitive_q_values = self.q_network_local(states)
        else:
            primitive_q_values = self.q_network_target(states)

        num_actions = len(self.global_action_id_to_primitive_actions)
        if primitive_actions_only or num_actions <= self.action_size:
            return primitive_q_values

        extra_q_values = self.calculate_macro_action_q_values(
            states, num_actions)
        # Assemble the macro-action values on the same device/dtype as the
        # primitive values so the concatenation below is valid on a GPU too.
        # They are detached: they serve action selection only.
        extra_q_values = torch.stack([
            torch.as_tensor(q_value,
                            dtype=primitive_q_values.dtype,
                            device=primitive_q_values.device)
            for q_value in extra_q_values
        ]).detach().unsqueeze(0)
        all_q_values = torch.cat((primitive_q_values, extra_q_values), dim=1)

        return all_q_values

    def calculate_macro_action_q_values(self, state, num_actions):
        """Estimates a Q value for every macro action in a single state.

        For each macro action the oracle is rolled forward through all but the
        last primitive action, accumulating the discounted predicted rewards
        (plus ``action_length_reward_bonus`` per step); the local Q network
        then supplies the discounted value of the final primitive action in
        the predicted state.

        Args:
            state: batch containing exactly one state.
            num_actions: total number of action ids (primitive and macro).

        Returns:
            List with one value per action id in
            ``range(self.action_size, num_actions)``.
        """
        assert state.shape[0] == 1
        discount_rate = self.hyperparameters["discount_rate"]
        q_values = []
        for action_id in range(self.action_size, num_actions):
            macro_action = self.global_action_id_to_primitive_actions[
                action_id]
            predicted_next_state = state
            cumulated_reward = 0
            action_ix = 0
            for action in macro_action[:-1]:
                # Keep the action tensor on the state's device so torch.cat
                # does not fail when the networks live on a GPU.
                action_tensor = torch.tensor([[action]],
                                             dtype=torch.float,
                                             device=predicted_next_state.device)
                predictions = self.oracle(
                    torch.cat((predicted_next_state, action_tensor), dim=1))
                rewards = predictions[:, -1]
                predicted_next_state = predictions[:, :-1]
                cumulated_reward += (
                    rewards.item() + self.action_length_reward_bonus
                ) * discount_rate**(action_ix)
                action_ix += 1
            final_action = macro_action[-1]
            final_q_value = self.q_network_local(predicted_next_state)[
                0, final_action]
            total_q_value = cumulated_reward + final_q_value * discount_rate**(
                action_ix)
            q_values.append(total_q_value)
        return q_values

    def time_for_q_network_to_learn(self):
        """Returns boolean indicating whether enough steps have been taken for learning to begin and there are
        enough experiences in the replay buffer to learn from"""
        return self.right_amount_of_steps_taken(
        ) and self.enough_experiences_to_learn_from()

    def right_amount_of_steps_taken(self):
        """Returns boolean indicating whether the global step count is a multiple of "update_every_n_steps"."""
        return self.global_step_number % self.hyperparameters[
            "update_every_n_steps"] == 0

    def q_network_learn(self, experiences=None):
        """Runs a learning iteration for the Q network.

        Args:
            experiences: optional ``(states, actions, rewards, next_states,
                dones)`` batch; a batch is sampled from memory when omitted.
        """
        if experiences is None:
            states, actions, rewards, next_states, dones = self.sample_experiences(
            )  #Sample experiences
        else:
            states, actions, rewards, next_states, dones = experiences
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        self.take_optimisation_step(
            self.q_network_optimizer, self.q_network_local, loss,
            self.hyperparameters["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.q_network_local,
                                           self.q_network_target,
                                           self.hyperparameters["tau"])

    def sample_experiences(self):
        """Draws a random sample of experience from the memory buffer"""
        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences
        return states, actions, rewards, next_states, dones

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Computes the double-DQN loss required to train the Q network.

        The local network selects the best next action and the target network
        evaluates it; the loss is the MSE between that bootstrapped target and
        the local network's value for the action actually taken.
        """
        with torch.no_grad():
            max_action_indexes = self.calculate_q_values(
                next_states, local=True,
                primitive_actions_only=True).detach().argmax(1)
            Q_targets_next = self.calculate_q_values(
                next_states, local=False, primitive_actions_only=True).gather(
                    1, max_action_indexes.unsqueeze(1))
            Q_targets = rewards + (self.hyperparameters["discount_rate"] *
                                   Q_targets_next * (1 - dones))
        Q_expected = self.calculate_q_values(
            states, local=True,
            primitive_actions_only=True).gather(1, actions.long(
            ))  # must convert actions to long so can be used as index
        loss = F.mse_loss(Q_expected, Q_targets)
        return loss

    def save_episode_actions_with_score(self):
        """Records ``[score, actions + end symbol, exploration-off flag]`` for the episode just played."""
        self.episode_actions_scores_and_exploration_status.append([
            self.total_episode_score_so_far,
            self.episode_actions + [self.end_of_episode_symbol],
            self.turn_off_exploration
        ])

    def oracle_learn(self):
        """Runs a learning iteration for the oracle.

        The oracle is trained on a sampled batch to predict the concatenation
        of next state and reward from the concatenation of state and action.
        """
        states, actions, rewards, next_states, _ = self.sample_experiences(
        )  # Sample experiences
        states_and_actions = torch.cat(
            (states, actions),
            dim=1)  #must change this for all games besides cart pole
        predictions = self.oracle(states_and_actions)
        loss = F.mse_loss(torch.cat((next_states, rewards), dim=1),
                          predictions) / float(next_states.shape[1] + 1.0)
        self.take_optimisation_step(
            self.oracle_optimizer, self.oracle, loss,
            self.hyperparameters["gradient_clipping_norm"])
        self.logger.info("Oracle Loss {}".format(loss))
# Example #8
# 0
# File: SAC.py Project: ai4ce/SNAC
class SAC(Base_Agent):
    """Soft Actor-Critic model based on the 2018 paper https://arxiv.org/abs/1812.05905 and on this github implementation
      https://github.com/pranz24/pytorch-soft-actor-critic. It is an actor-critic algorithm where the agent is also trained
      to maximise the entropy of their actions as well as their cumulative reward"""
    agent_name = "SAC"

    def __init__(self, config):
        """Builds the critics, actor, replay buffer and entropy-temperature setup.

        Two local critics (the second with a different seed) and two target
        critics are created; each target starts as a copy of its local critic.
        The actor outputs ``action_size * 2`` values.  When
        ``"automatically_tune_entropy_hyperparameter"`` is set, ``log_alpha``
        is a learnable parameter with its own optimiser; otherwise ``alpha``
        is the fixed ``"entropy_term_weight"`` hyperparameter.

        Args:
            config: agent configuration; forwarded to ``Base_Agent``.
        """
        Base_Agent.__init__(self, config)
        assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions"
        assert self.config.hyperparameters["Actor"][
            "final_layer_activation"] != "Softmax", "Final actor layer must not be softmax"
        self.hyperparameters = config.hyperparameters

        # The critics score (state, action) pairs, hence the summed input size.
        critic_input_dim = self.state_size + self.action_size
        critic_learning_rate = self.hyperparameters["Critic"]["learning_rate"]
        actor_learning_rate = self.hyperparameters["Actor"]["learning_rate"]

        self.critic_local = self.create_NN(input_dim=critic_input_dim,
                                           output_dim=1,
                                           key_to_use="Critic")
        self.critic_local_2 = self.create_NN(
            input_dim=critic_input_dim,
            output_dim=1,
            key_to_use="Critic",
            override_seed=self.config.seed + 1)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_local.parameters(),
            lr=critic_learning_rate,
            eps=1e-4)
        self.critic_optimizer_2 = torch.optim.Adam(
            self.critic_local_2.parameters(),
            lr=critic_learning_rate,
            eps=1e-4)
        self.critic_target = self.create_NN(input_dim=critic_input_dim,
                                            output_dim=1,
                                            key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=critic_input_dim,
                                              output_dim=1,
                                              key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(
            self.hyperparameters["Critic"]["buffer_size"],
            self.hyperparameters["batch_size"],
            self.config.seed,
            device=self.device)
        self.actor_local = self.create_NN(input_dim=self.state_size,
                                          output_dim=self.action_size * 2,
                                          key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(
            self.actor_local.parameters(),
            lr=actor_learning_rate,
            eps=1e-4)
        self.automatic_entropy_tuning = self.hyperparameters[
            "automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.environment.action_space.shape).to(
                    self.device)).item()  # heuristic value from the paper
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha = self.log_alpha.exp()
            # Same optimiser class as the critic/actor optimisers above, so the
            # method does not rely on a separately imported ``Adam`` name.
            self.alpha_optim = torch.optim.Adam(
                [self.log_alpha],
                lr=actor_learning_rate,
                eps=1e-4)
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]

        self.add_extra_noise = self.hyperparameters["add_extra_noise"]
        if self.add_extra_noise:
            self.noise = OU_Noise(self.action_size, self.config.seed,
                                  self.hyperparameters["mu"],
                                  self.hyperparameters["theta"],
                                  self.hyperparameters["sigma"])

        self.do_evaluation_iterations = self.hyperparameters[
            "do_evaluation_iterations"]

    def save_result(self):
        """Records the score of the episode that just finished.

        Overrides the Base Agent version so that, when evaluation iterations
        are enabled, only evaluation episodes contribute to the results.  The
        first episode (or every episode when evaluation is disabled) is stored
        once.  Otherwise, whenever ``episode_number - 1`` is a multiple of
        TRAINING_EPISODES_PER_EVAL_EPISODE, the score and the resulting rolling
        mean are each stored TRAINING_EPISODES_PER_EVAL_EPISODE times; on all
        other episodes nothing is recorded.
        """
        latest_score = self.total_episode_score_so_far
        window = self.rolling_score_window

        if self.episode_number == 1 or not self.do_evaluation_iterations:
            self.game_full_episode_scores.append(latest_score)
            self.rolling_results.append(
                np.mean(self.game_full_episode_scores[-window:]))
            self.save_max_result_seen()
            return

        if (self.episode_number - 1) % TRAINING_EPISODES_PER_EVAL_EPISODE != 0:
            return

        # Repeat the evaluation score so the result lists stay aligned with
        # the episode count.
        repeats = TRAINING_EPISODES_PER_EVAL_EPISODE
        self.game_full_episode_scores.extend([latest_score] * repeats)
        rolling_mean = np.mean(self.game_full_episode_scores[-window:])
        self.rolling_results.extend([rolling_mean] * repeats)
        self.save_max_result_seen()

    def reset_game(self):
        """Prepares for a new episode: base-class reset, then a noise reset when extra noise is enabled."""
        Base_Agent.reset_game(self)
        if not self.add_extra_noise:
            return
        self.noise.reset()

    def step(self, isEval=False):
        if not isEval:
            """Runs an episode on the game, saving the experience and running a learning step if appropriate"""
            eval_ep = self.episode_number % TRAINING_EPISODES_PER_EVAL_EPISODE == 0 and self.do_evaluation_iterations
            self.episode_step_number_val = 0
            while not self.done:
                self.episode_step_number_val += 1
                self.action = self.pick_action(eval_ep)

                self.conduct_action(self.action)
                if self.time_for_critic_and_actor_to_learn():
                    for _ in range(self.hyperparameters[
                            "learning_updates_per_learning_session"]):
                        self.learn()
                mask = False if self.episode_step_number_val >= 1000. else self.done
                if not eval_ep:
                    self.save_experience(experience=(self.state, self.action,
                                                     self.reward,
                                                     self.next_state, mask))
                self.state = self.next_state
                self.global_step_number += 1
            print(self.total_episode_score_so_far)
            if eval_ep: self.print_summary_of_latest_evaluation_episode()
            if eval_ep:
                iou_test = self.environment.iou()
                # iou_test=self.iou(self.environment.environment_memory,np.argmax(self.environment.one_hot)+1)
                print('\nEpodise: ', self.episode_number, '| Ep_reward_test:',
                      self.reward)
                print('\nEpodise: ', self.episode_number, '| Ep_IOU_test: ',
                      iou_test)
                self.reward_history_test.append(self.reward)
                self.iou_history_test.append(iou_test)
            self.episode_number += 1
        else:
            eval_ep = True
            '''
            ########################3d dynamic test################################          
            print("Testing")

            plan_choose = self.environment.plan_choose
            print(f"PLAN = {plan_choose}")
            iou_all_average = 0
            iou_all_min = 1
            for test_set in range(10):
                env = deep_mobile_printing_3d1r(plan_choose=plan_choose, test_set=test_set)


                def iou(environment_memory,environment_plan,HALF_WINDOW_SIZE,plan_height,plan_width):
                    component1=environment_plan[HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_height,\
                                    HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_width].astype(bool)
                    component2=environment_memory[HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_height,\
                                    HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_width].astype(bool)
                    overlap = component1*component2 # Logical AND
                    union = component1 + component2 # Logical OR
                    IOU = overlap.sum()/float(union.sum())
                    return IOU

                print(test_set)
                N_iteration_test = 200
                best_iou = 0
                iou_test_total = 0
                iou_min = 1
                reward_test_total = 0
                start_time_test = time.time()

                fig = plt.figure(figsize=[10, 5])
                ax1 = fig.add_subplot(1, 2, 1, projection='3d')
                ax2 = fig.add_subplot(1, 2, 2)

                for ep in range(N_iteration_test):
                    obs = env.reset()
                    reward_test = 0
                    self.state = obs
                    while True:
                        self.action = self.pick_action(eval_ep)
                        self.conduct_action(self.action)
                        obs, r, done = env.step(self.action)
                        self.state = obs
                        reward_test += r
                        if done:
                            break

                    iou_test = iou(env.environment_memory,env.plan,env.HALF_WINDOW_SIZE,env.plan_height,env.plan_width)
                    iou_min = min(iou_min, iou_test)

                    if iou_test > best_iou:
                        best_iou = iou_test
                        best_plan = env.plan
                        best_step = env.count_step
                        best_brick = env.count_brick
                        best_tb = env.total_brick
                        best_env = env.environment_memory
                        env.render(ax1, ax2)
                        save_path = "plots/"
                        plt.savefig(save_path+"SAC_Plan"+str(test_set)+'_'+str(self.environment.plan_choose)+'_good.png')

                    iou_test_total += iou_test
                    reward_test_total += reward_test

                reward_test_total = reward_test_total / N_iteration_test
                iou_test_total = iou_test_total / N_iteration_test
                secs = int(time.time() - start_time_test)
                mins = secs // 60
                secs = secs % 60
                print(f"time = {mins} min {secs} sec")
                print(f"iou = {iou_test_total}")
                print(f"reward_test = {reward_test_total}")
                if best_iou>0:
                    env.render(ax1,ax2,iou_average=iou_test_total,iou_min=iou_min,iter_times=N_iteration_test,best_env=best_env,best_iou=best_iou,best_step=best_step,best_brick=best_brick)
                else:
                    env.render(ax1,ax2,iou_average=iou_test_total,iou_min=iou_min,iter_times=N_iteration_test)
                save_path = "plots/"
                plt.savefig(save_path+"SAC_Plan"+str(test_set)+'_'+str(self.environment.plan_choose)+'_summary.png')

                iou_all_average += iou_test_total
                iou_all_min = min(iou_min,iou_all_min)

            iou_all_average = iou_all_average/10
            print('iou_all_average',iou_all_average)
            print('iou_all_min',iou_all_min)
            '''
            '''
            ########################3d static test################################
            print(f"Testing plan {self.environment.plan_choose}")

            env = deep_mobile_printing_3d1r(plan_choose=self.environment.plan_choose)


            def iou(environment_memory,environment_plan,HALF_WINDOW_SIZE,plan_height,plan_width):
                component1=environment_plan[HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_height,\
                                HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_width].astype(bool)
                component2=environment_memory[HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_height,\
                                HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_width].astype(bool)
                overlap = component1*component2 # Logical AND
                union = component1 + component2 # Logical OR
                IOU = overlap.sum()/float(union.sum())
                return IOU

            N_iteration_test = 500 
            best_iou = 0
            iou_test_total = 0
            iou_min = 1
            reward_test_total = 0
            start_time_test = time.time()

            fig = plt.figure(figsize=[10, 5])
            ax1 = fig.add_subplot(1, 2, 1, projection='3d')
            ax2 = fig.add_subplot(1, 2, 2)

            for ep in range(N_iteration_test):
                obs = env.reset()
                reward_test = 0
                self.state = obs
                
                while True:

                    self.action = self.pick_action(eval_ep)
                    self.conduct_action(self.action)
                    # action, _ = test_agent.predict(obs)
                    obs, r, done = env.step(self.action)
                    self.state = obs

                    # action, _ = test_agent.predict(obs)
                    # obs, r, done, info = env.step(action)
                    reward_test += r
                    if done:
                        break

                iou_test = iou(env.environment_memory,env.plan,env.HALF_WINDOW_SIZE,env.plan_height,env.plan_width)
                iou_min = min(iou_min, iou_test)

                if iou_test > best_iou:
                    best_iou = iou_test
                    best_plan = env.plan
                    best_step = env.count_step
                    best_brick = env.count_brick
                    best_tb = env.total_brick
                    best_env = env.environment_memory
                    env.render(ax1, ax2)
                    save_path = "plots/"
                    plt.savefig(save_path+"SAC_Plan"+'_'+str(self.environment.plan_choose)+'_good.png')
                iou_test_total += iou_test
                reward_test_total += reward_test

            reward_test_total = reward_test_total / N_iteration_test
            iou_test_total = iou_test_total / N_iteration_test
            secs = int(time.time() - start_time_test)
            mins = secs // 60
            secs = secs % 60
            print(f"time = {mins} min {secs} sec")
            print(f"iou = {iou_test_total}")
            print(f"reward_test = {reward_test_total}")

            if best_iou>0:
                env.render(ax1,ax2,iou_average=iou_test_total,iou_min=iou_min,iter_times=N_iteration_test,best_env=best_env,best_iou=best_iou,best_step=best_step,best_brick=best_brick)
            else:
                env.render(ax1,ax2,iou_average=iou_test_total,iou_min=iou_min,iter_times=N_iteration_test)
            save_path = "plots/"
            plt.savefig(save_path+"SAC_Plan"+'_'+str(self.environment.plan_choose)+'_summary.png')
            '''

            # ########################2d dynamic test################################
            # print("Testing")

            # print(f"PLAN = {self.environment.plan_choose}")
            # iou_all_average = 0
            # iou_all_min = 1
            # for test_set in range(10):
            #     test_set = 6
            #     env = deep_mobile_printing_2d1r(plan_choose=self.environment.plan_choose, test_set=test_set)

            #     def iou(environment_memory,environment_plan,HALF_WINDOW_SIZE,plan_height,plan_width):
            #         component1=environment_plan[HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_height,\
            #                             HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_width].astype(bool)
            #         component2=environment_memory[HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_height,\
            #                             HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_width].astype(bool)
            #         overlap = component1*component2 # Logical AND
            #         union = component1 + component2 # Logical OR
            #         IOU = overlap.sum()/float(union.sum())
            #         return IOU

            #     print(test_set)
            #     N_iteration_test = 200
            #     best_iou = 0
            #     iou_test_total = 0
            #     iou_min = 1
            #     reward_test_total = 0
            #     start_time_test = time.time()

            #     fig = plt.figure(figsize=(5, 5))
            #     ax = fig.add_subplot(1, 1, 1)
            #     for ep in range(N_iteration_test):
            #         obs = env.reset()
            #         reward_test = 0
            #         self.state = obs
            #         while True:
            #             self.action = self.pick_action(eval_ep)
            #             self.conduct_action(self.action)
            #             # action, _ = test_agent.predict(obs)
            #             obs, r, done, info = env.step(self.action)
            #             self.state = obs
            #             reward_test += r
            #             if done:
            #                 break

            #         iou_test = iou(env.environment_memory,env.plan,env.HALF_WINDOW_SIZE,env.plan_height,env.plan_width)
            #         iou_min = min(iou_min, iou_test)

            #         if iou_test > best_iou:
            #             best_iou = iou_test
            #             best_plan = env.plan
            #             best_tb = env.total_brick
            #             env.render(ax)
            #             save_path = "plots/"
            #             plt.savefig(save_path+"SAC_Plan"+str(test_set)+'_'+str(self.environment.plan_choose)+'_good.png')
            #         iou_test_total += iou_test
            #         reward_test_total += reward_test

            #     reward_test_total = reward_test_total / N_iteration_test
            #     iou_test_total = iou_test_total / N_iteration_test
            #     secs = int(time.time() - start_time_test)
            #     mins = secs // 60
            #     secs = secs % 60
            #     print(f"time = {mins} min {secs} sec")
            #     print(f"iou = {iou_test_total}")
            #     print(f"reward_test = {reward_test_total}")
            #     env.render(ax,iou_average=iou_test_total,iou_min=iou_min,iter_times=N_iteration_test)
            #     iou_all_average += iou_test_total
            #     iou_all_min = min(iou_min,iou_all_min)
            #     save_path = "plots/"
            #     plt.savefig(save_path+"SAC_Plan"+str(test_set)+'_'+str(self.environment.plan_choose)+'_summary.png')

            # iou_all_average = iou_all_average/10
            # print('iou_all_average',iou_all_average)
            # print('iou_all_min',iou_all_min)
            '''
            ########################2d static test################################
            def iou(environment_memory,environment_plan,HALF_WINDOW_SIZE,plan_height,plan_width):
                component1=environment_plan[HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_height,\
                                HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_width].astype(bool)
                component2=environment_memory[HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_height,\
                                HALF_WINDOW_SIZE:HALF_WINDOW_SIZE+plan_width].astype(bool)
                overlap = component1*component2 # Logical AND
                union = component1 + component2 # Logical OR
                IOU = overlap.sum()/float(union.sum())
                return IOU

            env = deep_mobile_printing_2d1r(plan_choose=self.environment.plan_choose)

            N_iteration_test = 500
            best_iou = 0
            iou_test_total = 0
            iou_min = 1
            reward_test_total = 0
            start_time_test = time.time()

            fig = plt.figure(figsize=(5, 5))
            ax = fig.add_subplot(1, 1, 1)
            for ep in range(N_iteration_test):
                obs = env.reset()
                reward_test = 0
                self.state = obs
                while True:
                    self.action = self.pick_action(eval_ep)
                    self.conduct_action(self.action)
                    obs, r, done = env.step(self.action)
                    self.state = obs

                    # action, _ = test_agent.predict(obs)
                    # obs, r, done, info = env.step(action)
                    reward_test += r
                    if done:
                        break

                iou_test = iou(env.environment_memory,env.plan,env.HALF_WINDOW_SIZE,env.plan_height,env.plan_width)
                iou_min = min(iou_min, iou_test)

                if iou_test > best_iou:
                    best_iou = iou_test
                    best_plan = env.plan
                    best_tb = env.total_brick
                    env.render(ax)
                    save_path = "plots/"
                    plt.savefig(save_path+"SAC_Plan"+str(self.environment.plan_choose)+'_good.png')
                iou_test_total += iou_test
                reward_test_total += reward_test

            reward_test_total = reward_test_total / N_iteration_test
            iou_test_total = iou_test_total / N_iteration_test
            secs = int(time.time() - start_time_test)
            mins = secs // 60
            secs = secs % 60
            print(f"time = {mins} min {secs} sec")
            print(f"iou = {iou_test_total}")
            print(f"reward_test = {reward_test_total}")
            env.render(ax,iou_average=iou_test_total,iou_min=iou_min,iter_times=N_iteration_test)

            save_path = "plots/"
            plt.savefig(save_path+"SAC_Plan"+str(self.environment.plan_choose)+'_summary.png')
            '''
            '''
            ########################1d dynamic test################################

            iou_all_average = 0
            iou_all_min = 1
            for plan_choose in range(10):
                plan_choose = 8
                print(plan_choose)
                env = deep_mobile_printing_1d1r(plan_choose=plan_choose)
                N_iteration_test = 200
                best_iou = 0
                iou_test_total = 0
                iou_min = 1
                reward_test_total = 0
                start_time_test = time.time()

                fig = plt.figure(figsize=(5, 5))
                ax = fig.add_subplot(1, 1, 1)
                for ep in range(N_iteration_test):
                    obs = env.reset()
                    reward_test = 0
                    self.state = obs
                    while True:
                        self.action = self.pick_action(eval_ep)
                        self.conduct_action(self.action)
                        obs, r, done  = env.step(self.action)
                        self.state = obs
                        reward_test += r
                        if done:
                            break

                    iou_test = env.iou()
                    iou_min = min(iou_min, iou_test)

                    if iou_test > best_iou:
                        best_iou = iou_test
                        best_plan = env.plan
                        best_tb = env.total_brick
                        env.render(ax)
                        save_path = "plots/"
                        plt.savefig(save_path+"SAC_Plan"+str(plan_choose)+'_good.png')
                    iou_test_total += iou_test
                    reward_test_total += reward_test

                reward_test_total = reward_test_total / N_iteration_test
                iou_test_total = iou_test_total / N_iteration_test
                secs = int(time.time() - start_time_test)
                mins = secs // 60
                secs = secs % 60
                print(f"time = {mins} min {secs} sec")
                print(f"iou = {iou_test_total}")
                print(f"reward_test = {reward_test_total}")
                env.render(ax,iou_average=iou_test_total,iou_min=iou_min,iter_times=N_iteration_test)
                iou_all_average += iou_test_total
                iou_all_min = min(iou_min,iou_all_min)
                save_path = "plots/"
                plt.savefig(save_path+"SAC_Plan"+str(plan_choose)+'_summary.png')

            iou_all_average = iou_all_average/10
            print('iou_all_average',iou_all_average)
            print('iou_all_min',iou_all_min)
            '''

            ########################1d static test################################
            plan = self.environment.plan_choose

            print(f"Testing plan {plan}")

            N_iteration_test = 1
            best_iou = 0
            iou_test_total = 0
            iou_min = 1
            reward_test_total = 0
            start_time_test = time.time()

            fig = plt.figure(figsize=(5, 5))
            ax = fig.add_subplot(1, 1, 1)
            for ep in range(N_iteration_test):
                obs = self.environment.reset()
                reward_test = 0

                while True:
                    self.action = self.pick_action(eval_ep)
                    self.conduct_action(self.action)
                    self.state = self.next_state
                    obs, r, done = self.environment.step(self.action)
                    reward_test += r
                    if done:
                        break
                # self.environment.render(ax)
                # plt.show()
                iou_test = self.environment.iou()
                iou_min = min(iou_min, iou_test)

                if iou_test > best_iou:
                    best_iou = iou_test
                    best_plan = self.environment.plan
                    best_tb = self.environment.total_brick
                    self.environment.render(ax)
                    save_path = self.config.save_model_path
                    plt.savefig(save_path + "SAC_Plan" + str(plan) +
                                '_good.png')
                iou_test_total += iou_test
                reward_test_total += reward_test

            reward_test_total = reward_test_total / N_iteration_test
            iou_test_total = iou_test_total / N_iteration_test
            secs = int(time.time() - start_time_test)
            mins = secs // 60
            secs = secs % 60
            print(f"time = {mins} min {secs} sec")
            print(f"iou = {iou_test_total}")
            print(f"reward_test = {reward_test_total}")

            self.environment.render(ax,
                                    iou_average=iou_test_total,
                                    iou_min=iou_min,
                                    iter_times=N_iteration_test)

            save_path = self.config.save_model_path
            plt.savefig(save_path + "SAC_Plan" + str(plan) + '_summary.png')

            ########################Initial test################################
            # fig = plt.figure(figsize=(5, 5))

            # ax = fig.add_subplot(1, 1, 1)

            # print(self.environment.total_brick)

            # print(self.environment.one_hot)

            # step = self.environment.total_step

            # ax.clear()
            # while not self.done:

            #     self.action = self.pick_action(eval_ep)

            #     self.conduct_action(self.action)

            #     self.state = self.next_state

            #     self.environment.render(ax)

            #     self.environment.iou()

            #     plt.pause(1e-6)

            # plt.show()
            exit()

    def pick_action(self, eval_ep, state=None):
        """Chooses the next action for `state` (falls back to self.state when no state is given).

        The branch is decided in this order:
          1) eval_ep is truthy: ask the actor with eval=True.
          2) fewer than hyperparameters["min_steps_before_learning"] global steps have elapsed:
             draw a random integer in [0, self.environment.action_dim) and print it.
          3) otherwise: ask the actor in its default (sampling) mode.

        If self.add_extra_noise is set, a sample from self.noise is added to the chosen action,
        whichever branch produced it."""
        observation = self.state if state is None else state

        if eval_ep:
            chosen = self.actor_pick_action(state=observation, eval=True)
        else:
            # The warm-up threshold is only consulted outside of evaluation episodes
            still_warming_up = self.global_step_number < self.hyperparameters[
                "min_steps_before_learning"]
            if still_warming_up:
                chosen = np.random.randint(0, self.environment.action_dim)
                print("Picking random action ", chosen)
            else:
                chosen = self.actor_pick_action(state=observation)

        if self.add_extra_noise:
            chosen += self.noise.sample()
        return chosen

    def actor_pick_action(self, state=None, eval=False):
        """Uses the actor to pick an action for a single state.

        Args:
            state: a single observation; defaults to self.state. It is wrapped in a list and
                converted to a float tensor on self.device before being given to the actor.
            eval: if truthy, the third output of produce_action_and_action_info is used (the tanh
                of the mean, so no random sampling is involved); otherwise its first output (the
                randomly sampled action) is used.

        Returns:
            The first row of the selected output, as a NumPy array."""
        if state is None:
            state = self.state
        state = torch.FloatTensor([state]).to(self.device)
        # A scalar state yields a 1-D tensor here, so add the missing batch dimension
        if len(state.shape) == 1:
            state = state.unsqueeze(0)
        # The result is detached and turned into NumPy straight away, so no autograd graph is
        # needed in either mode; building one for the sampling branch was wasted work.
        with torch.no_grad():
            sampled_action, _, mean_action = self.produce_action_and_action_info(
                state)
        action = mean_action if eval else sampled_action
        return action.detach().cpu().numpy()[0]

    def produce_action_and_action_info(self, state):
        """Runs the actor on `state` and returns (action, log_prob, tanh(mean)).

        The first self.action_size columns of the actor output are used as the mean and the
        remaining columns as the log standard deviation of a Normal distribution. The action is
        the tanh of a reparameterised sample from it, and log_prob is that sample's log
        probability, adjusted for the tanh squashing and summed over dimension 1 (keepdim=True)."""
        policy_out = self.actor_local(state)
        mean = policy_out[:, :self.action_size]
        log_std = policy_out[:, self.action_size:]
        gaussian = Normal(mean, log_std.exp())

        # rsample uses the reparameterisation trick, so the sample stays differentiable
        pre_squash = gaussian.rsample()
        action = torch.tanh(pre_squash)

        # Adjust the Gaussian log probability for the tanh squashing;
        # EPSILON keeps the log argument positive when |action| approaches 1.
        squash_correction = torch.log(1 - action.pow(2) + EPSILON)
        log_prob = (gaussian.log_prob(pre_squash) - squash_correction).sum(
            1, keepdim=True)
        return action, log_prob, torch.tanh(mean)

    def time_for_critic_and_actor_to_learn(self):
        """Tells whether the actor and critics should run a learning iteration on this step.

        All three of the following must hold: strictly more than min_steps_before_learning global
        steps have elapsed, self.enough_experiences_to_learn_from() is truthy, and the global step
        number is a multiple of update_every_n_steps."""
        settings = self.hyperparameters
        step = self.global_step_number
        # The conditions short-circuit in this order, so later ones are skipped once one fails
        return (step > settings["min_steps_before_learning"]
                and self.enough_experiences_to_learn_from()
                and step % settings["update_every_n_steps"] == 0)

    def learn(self):
        """Performs one learning iteration: samples a batch, computes the two critic losses and the
        actor loss, computes the temperature loss when self.automatic_entropy_tuning is set (None
        otherwise), and passes all four losses to update_all_parameters"""
        states, actions, rewards, next_states, masks = self.sample_experiences()
        critic_1_loss, critic_2_loss = self.calculate_critic_losses(
            states, actions, rewards, next_states, masks)
        actor_loss, log_probs = self.calculate_actor_loss(states)
        temperature_loss = (self.calculate_entropy_tuning_loss(log_probs)
                            if self.automatic_entropy_tuning else None)
        self.update_all_parameters(critic_1_loss, critic_2_loss, actor_loss,
                                   temperature_loss)

    def sample_experiences(self):
        """Returns whatever self.memory.sample() produces"""
        replay_memory = self.memory
        return replay_memory.sample()

    def calculate_critic_losses(self, state_batch, action_batch, reward_batch,
                                next_state_batch, mask_batch):
        """Computes the MSE losses of the two local critics against a shared target.

        The target is reward + (1 - mask) * discount_rate * (min of the two target critics at the
        next state and a freshly produced next action, minus alpha * log-probability of that
        action). It is computed under torch.no_grad(). Where mask_batch equals 1 the bootstrapped
        term is removed (presumably terminal transitions - confirm against the replay buffer).

        Returns the pair (qf1_loss, qf2_loss)."""
        with torch.no_grad():
            next_action, next_log_pi, _ = self.produce_action_and_action_info(
                next_state_batch)
            # Both target critics score the same (next state, next action) pairs
            target_input = torch.cat((next_state_batch, next_action), 1)
            target_q1 = self.critic_target(target_input)
            target_q2 = self.critic_target_2(target_input)
            soft_value = torch.min(target_q1,
                                   target_q2) - self.alpha * next_log_pi
            discount = self.hyperparameters["discount_rate"]
            next_q_value = reward_batch + (
                1.0 - mask_batch) * discount * soft_value

        # Both local critics score the same (state, action) pairs from the batch
        critic_input = torch.cat((state_batch, action_batch), 1)
        q1_estimate = self.critic_local(critic_input)
        q2_estimate = self.critic_local_2(critic_input)
        return (F.mse_loss(q1_estimate, next_q_value),
                F.mse_loss(q2_estimate, next_q_value))

    def calculate_actor_loss(self, state_batch):
        """Computes the actor loss: the mean of alpha * log_pi minus the smaller of the two local
        critics' values for freshly produced actions. Returns (policy_loss, log_pi)."""
        action, log_pi, _ = self.produce_action_and_action_info(state_batch)
        # Both local critics score the same (state, action) pairs
        critic_input = torch.cat((state_batch, action), 1)
        q1_value = self.critic_local(critic_input)
        q2_value = self.critic_local_2(critic_input)
        entropy_term = self.alpha * log_pi
        policy_loss = (entropy_term - torch.min(q1_value, q2_value)).mean()
        return policy_loss, log_pi

    def calculate_entropy_tuning_loss(self, log_pi):
        """Computes the loss for the entropy temperature parameter. It is only called by learn()
        when self.automatic_entropy_tuning is True."""
        # Detached so that the gradient of this loss reaches only self.log_alpha
        entropy_gap = (log_pi + self.target_entropy).detach()
        return -(self.log_alpha * entropy_gap).mean()

    def update_all_parameters(self, critic_loss_1, critic_loss_2, actor_loss,
                              alpha_loss):
        """Applies one optimisation step to each critic and then the actor, soft-updates both
        target critics, and finally (only when alpha_loss is not None) steps the temperature
        optimiser and refreshes self.alpha from self.log_alpha"""
        critic_clip = self.hyperparameters["Critic"]["gradient_clipping_norm"]
        critic_updates = (
            (self.critic_optimizer, self.critic_local, critic_loss_1),
            (self.critic_optimizer_2, self.critic_local_2, critic_loss_2),
        )
        for optimizer, network, loss in critic_updates:
            self.take_optimisation_step(optimizer, network, loss, critic_clip)

        self.take_optimisation_step(
            self.actor_optimizer, self.actor_local, actor_loss,
            self.hyperparameters["Actor"]["gradient_clipping_norm"])

        tau = self.hyperparameters["Critic"]["tau"]
        target_pairs = (
            (self.critic_local, self.critic_target),
            (self.critic_local_2, self.critic_target_2),
        )
        for local_net, target_net in target_pairs:
            self.soft_update_of_target_network(local_net, target_net, tau)

        if alpha_loss is None:
            return
        # The temperature step is given no network and no clipping norm
        self.take_optimisation_step(self.alpha_optim, None, alpha_loss, None)
        self.alpha = self.log_alpha.exp()

    def print_summary_of_latest_evaluation_episode(self):
        """Prints self.total_episode_score_so_far framed by two divider lines"""
        divider = "----------------------------"
        score_line = "Episode score {} ".format(self.total_episode_score_so_far)
        for line in (" ", divider, score_line, divider):
            print(line)