Example #1
def objective(self, x):
    # Apply the candidate parameter offset x to a fresh copy of the initial
    # GMM, roll it out for one episode, and return the negated mean return
    # so that a minimizer effectively maximizes task return.
    model = GMM()
    model.copy_model(self.initial_model)
    model.update_gaussians(np.asarray(x))
    accuracy, mean_return, mean_length = model.evaluate(
        self.env, max_steps=600, num_episodes=1)
    print("Accuracy:", accuracy, "mean_return:", mean_return)
    return -mean_return
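
This objective is meant to be handed to a gradient-free optimizer. Below is a minimal driver sketch; the GMMObjective wrapper name, the pretrained_gmm and env variables, and the choice of Nelder-Mead are illustrative assumptions, not taken from the example itself.

import numpy as np
from scipy.optimize import minimize

# Hypothetical wrapper holding the attributes the objective expects
# (initial_model and env); its objective method is the one shown above.
opt = GMMObjective(initial_model=pretrained_gmm, env=env)

# One offset per GMM prior plus one per mean entry, starting from zero change.
x0 = np.zeros(pretrained_gmm.priors.size + pretrained_gmm.mu.size)

# Gradient-free search; single-episode returns are noisy, so keep iterations modest.
result = minimize(opt.objective, x0, method="Nelder-Mead",
                  options={"maxiter": 100})
print("Best offset:", result.x, "estimated mean return:", -result.fun)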
Example #2
class SAC_GMM_Agent(SAC_Agent):
    def __init__(self, model, window_size=32, *args, **kwargs):
        self.initial_model = model  # Initial model provided
        self.model = GMM()
        self.model.copy_model(self.initial_model)  # Model used for training
        self.window_size = window_size
        self.burn_in_steps = 1000  # steps with zero GMM change before the SAC policy is used
        super(SAC_GMM_Agent, self).__init__(*args, **kwargs)

    def get_action_space(self):
        if not hasattr(self, 'action_space'):
            # one offset per GMM prior and per mean entry, each bounded in [-1, 1]
            priors_high = np.ones(self.model.priors.size)
            mu_high = np.ones(self.model.mu.size)
            action_high = np.concatenate((priors_high, mu_high), axis=-1)
            action_low = -action_high
            self.action_space = gym.spaces.Box(action_low, action_high)
        return self.action_space

    def update_gaussians(self, gmm_change):
        # change of priors range: [-0.1, 0.1]
        priors = gmm_change[:self.model.priors.size]
        priors = priors.reshape(self.model.priors.shape) * 0.1
        # change of mus range: [-0.01, 0.01]
        mu = gmm_change[self.model.priors.size:]
        mu = mu.reshape(self.model.mu.shape) * 0.01
        change_dict = {"mu": mu, "prior": priors}
        self.model.update_gaussians(change_dict)

    def evaluate(self, num_episodes=5, render=False):
        successful_episodes, episodes_returns, episodes_lengths = 0, [], []
        for episode in range(1, num_episodes + 1):
            observation = self.env.reset()
            episode_return, episode_length, left_steps = 0, 0, self.env.max_episode_steps
            while left_steps > 0:
                # start each window from the unmodified initial model
                self.model.copy_model(self.initial_model)
                gmm_change = self.get_action_from_observation(
                    observation, deterministic=True)
                self.update_gaussians(gmm_change)
                model_reward = 0
                for step in range(self.window_size):
                    vel = self.model.predict_velocity_from_observation(
                        observation)
                    observation, reward, done, info = self.env.step(vel)
                    model_reward += reward
                    episode_length += 1
                    left_steps -= 1
                    if render:
                        self.env.render()
                    if done or left_steps <= 0:
                        break
                episode_return += model_reward
                if done:
                    break
                if render:
                    self.env.render()
            if ("success" in info) and info['success']:
                successful_episodes += 1
            episodes_returns.append(episode_return)
            episodes_lengths.append(episode_length)
        accuracy = successful_episodes / num_episodes
        return accuracy, np.mean(episodes_returns), np.mean(episodes_lengths)

    def train_episode(self, episode, exploration_episodes, log, render):
        sac_steps = 0
        episode_return, episode_length, left_steps = 0, 0, self.env.max_episode_steps
        ep_critic_loss, ep_actor_loss, ep_alpha_loss = 0, 0, 0
        observation = self.env.reset()
        while left_steps > 0:
            self.model.copy_model(self.initial_model)
            if self.training_step < self.burn_in_steps:
                # during burn-in, keep the initial GMM unchanged (zero action)
                gmm_change = np.zeros(self.action_space.shape)
            else:
                gmm_change = self.get_action_from_observation(
                    observation, deterministic=False)
            self.update_gaussians(gmm_change)
            model_reward = 0
            curr_observation = observation
            for step in range(self.window_size):
                vel = self.model.predict_velocity_from_observation(
                    curr_observation)
                curr_observation, reward, done, info = self.env.step(vel)
                model_reward += reward
                episode_length += 1
                left_steps -= 1
                if render:
                    self.env.render()
                if done or left_steps <= 0:
                    break
            critic_loss, actor_loss, alpha_loss = self.update(
                observation, gmm_change, curr_observation, model_reward, done,
                log)
            observation = curr_observation
            episode_return += model_reward
            ep_critic_loss += critic_loss
            ep_actor_loss += actor_loss
            ep_alpha_loss += alpha_loss
            self.training_step += 1  # total SAC steps across training
            sac_steps += 1  # SAC steps in this episode

            if render:
                self.env.render()
            if done:
                break

        if log:
            self.log_scalar('Train/Episode/critic_loss',
                            ep_critic_loss / sac_steps, episode)
            self.log_scalar('Train/Episode/actor_loss',
                            ep_actor_loss / sac_steps, episode)
            self.log_scalar('Train/Episode/alpha_loss',
                            ep_alpha_loss / sac_steps, episode)
            self.log_episode_information(episode_return, episode_length,
                                         episode, "Train")

        return episode_return, episode_length
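
To make the action scaling in update_gaussians concrete, the following self-contained sketch reproduces the mapping from an SAC action in [-1, 1] to GMM parameter offsets; the component count and state dimension are assumptions chosen only for illustration.

import numpy as np

# Assume a GMM with 3 components in a 3-D state space (illustrative only):
# priors.size == 3 and mu.size == 9, so the SAC action has 12 entries in [-1, 1].
n_priors, mu_shape = 3, (3, 3)
action = np.random.uniform(-1.0, 1.0, size=12)

# update_gaussians() splits and rescales the action in exactly this way:
prior_change = action[:n_priors] * 0.1                  # at most +/-0.1 per prior
mu_change = action[n_priors:].reshape(mu_shape) * 0.01  # at most +/-0.01 per mean entry
print(prior_change)
print(mu_change)

Both evaluate() and train_episode() then roll the refined GMM out for up to window_size environment steps before the next refinement is sampled.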