def test():
    env = gym.make(args.env_name)

    env.seed(10)
    torch.manual_seed(10)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    actor = Actor(state_size, action_size, args)
    # Load the trained actor weights and switch to evaluation mode
    actor.load_state_dict(torch.load(args.load_model))
    actor.eval()

    for ep in range(args.test_iter):
        score = 0
        done = False
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        while not done:
            mu, std = actor(torch.Tensor(state))
            # Deterministic evaluation: step with the policy mean instead of sampling
            action = mu.detach().numpy()
            next_state, reward, done, info = env.step(action)
            env.render()

            score += reward
            next_state = np.reshape(next_state, [1, state_size])
            state = next_state
        if ep % args.log_interval == 0:
            print(f"{ep} ep | score {score:.2f}")
    env.close()
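
The test() routine above assumes a module-level args namespace and an Actor class defined elsewhere in the project. As a rough sketch only, the script could be wired up with argparse flags matching the attribute names used above; the defaults here are illustrative placeholders, not the project's settings:

import argparse

# Hypothetical entry point for test(); flag names mirror the args.* attributes
# used above, default values are placeholders only.
parser = argparse.ArgumentParser()
parser.add_argument("--env_name", type=str, default="Pendulum-v0")
parser.add_argument("--load_model", type=str, required=True)
parser.add_argument("--test_iter", type=int, default=10)
parser.add_argument("--log_interval", type=int, default=1)
args = parser.parse_args()

if __name__ == "__main__":
    test()
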
class TD3Agent:
    def __init__(self, env, render, config_info):
        self.env = env
        self._reset_env()
        self.render = render

        # Create run folder to store parameters, figures, and tensorboard logs
        self.path_runs = create_run_folder(config_info)

        # Extract training parameters from yaml config file
        param = load_training_parameters(config_info["config_param"])
        self.train_param = param["training"]

        # Define device
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(f"Device in use : {self.device}")

        # Define state and action dimension spaces
        state_dim = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]
        self.max_action = float(env.action_space.high[0])

        # Define models
        hidden_size = param["model"]["hidden_size"]
        self.critic = Critic(state_dim, self.num_actions,
                             hidden_size).to(self.device)
        self.target_critic = Critic(state_dim, self.num_actions,
                                    hidden_size).to(self.device)
        self.target_critic.load_state_dict(self.critic.state_dict())

        self.policy = Actor(state_dim, self.num_actions, hidden_size,
                            self.max_action).to(self.device)
        self.target_policy = Actor(state_dim, self.num_actions, hidden_size,
                                   self.max_action).to(self.device)
        self.target_policy.load_state_dict(self.policy.state_dict())

        # Define loss criterion
        self.criterion = nn.MSELoss()

        # Define optimizers
        lr = float(param["optimizer"]["learning_rate"])
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=lr)
        self.policy_opt = optim.Adam(self.policy.parameters(), lr=lr)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.train_param["replay_max_size"])

        self.transition = namedtuple(
            "transition",
            field_names=["state", "action", "reward", "done", "next_state"],
        )

        # Useful variables (all read from the "training" section of the config)
        self.batch_size = self.train_param["batch_size"]
        self.gamma = self.train_param["gamma"]
        self.tau = self.train_param["tau"]
        self.start_step = self.train_param["start_step"]
        self.max_timesteps = self.train_param["max_timesteps"]
        self.noise_policy = self.train_param["noise_policy"]
        self.noise_clip = self.train_param["noise_clip"]
        self.noise_explor = self.train_param["noise_explor"]
        self.update_freq_policy = self.train_param["update_freq_policy"]
        self.eval_freq = self.train_param["eval_freq"]
        self.num_eval_episodes = self.train_param["num_eval_episodes"]

    def _reset_env(self):
        # Reset the environment and initialize episode reward
        self.state, self.done = self.env.reset(), False
        self.episode_reward = 0.0
        self.episode_step = 0

    def train(self):
        # Main training loop
        total_timestep = 0
        all_episode_rewards = []
        all_mean_rewards = []
        update = 0

        # Create tensorboard writer
        writer = SummaryWriter(log_dir=self.path_runs, comment="-td3")

        for episode in itertools.count(1, 1):
            self._reset_env()

            while not self.done:
                # Improve exploration early in training: act randomly until
                # start_step environment steps have been collected
                if total_timestep < self.start_step:
                    action = self.env.action_space.sample()
                else:
                    policy_action = self.policy.get_action(
                        self.state, self.device)
                    add_noise_action = np.random.normal(
                        loc=0, scale=self.noise_explor, size=self.num_actions)
                    noisy_action = policy_action + add_noise_action
                    action = np.clip(noisy_action, -self.max_action,
                                     self.max_action)

                # Once the buffer holds more than a batch of transitions and the
                # random-exploration phase is over, update the networks
                if (len(self.replay_buffer) > self.batch_size
                        and total_timestep > self.start_step):

                    update += 1
                    batch = self.replay_buffer.sample_buffer(self.batch_size)

                    # Update parameters of all the networks
                    self.train_on_batch(batch, update, writer)

                if self.render:
                    self.env.render()

                # Perform one step in the environment
                next_state, reward, self.done, _ = self.env.step(action)
                total_timestep += 1
                self.episode_step += 1
                self.episode_reward += reward

                # Create a tuple for the new transition
                new_transition = self.transition(self.state, action, reward,
                                                 self.done, next_state)

                # Append transition to the replay buffer
                self.replay_buffer.store_transition(new_transition)

                self.state = next_state

            if total_timestep > self.max_timesteps:
                break

            all_episode_rewards.append(self.episode_reward)
            mean_reward = np.mean(all_episode_rewards[-50:])
            all_mean_rewards.append(mean_reward)

            print("Episode n°{} ; total timestep [{}/{}] ; episode steps {} ; "
                  "reward {} ; mean reward {}".format(
                      episode,
                      total_timestep,
                      self.max_timesteps,
                      self.episode_step,
                      round(self.episode_reward, 2),
                      round(mean_reward, 2),
                  ))

            writer.add_scalar("reward", self.episode_reward, episode)
            writer.add_scalar("mean reward", mean_reward, episode)

            # Let's evaluate TD3
            if episode % self.eval_freq == 0:
                avg_eval_return = self.eval()
                writer.add_scalar("eval/reward", avg_eval_return, episode)

        # End of training: save the networks' weights
        path_critic = os.path.join(self.path_runs, "critic.pth")
        path_actor = os.path.join(self.path_runs, "actor.pth")
        torch.save(self.critic.state_dict(), path_critic)
        torch.save(self.policy.state_dict(), path_actor)

        # Plot reward
        self.plot_reward(all_episode_rewards, all_mean_rewards)

        # Close all
        writer.close()
        self.env.close()

    def train_on_batch(self, batch_samples, update, writer):
        # Unpack a batch of transitions randomly drawn from the replay buffer
        # (the update counter is incremented by the caller in train())

        (
            state_batch,
            action_batch,
            reward_batch,
            done_int_batch,
            next_state_batch,
        ) = batch_samples

        # Convert numpy arrays into float32 tensors and move them to the device
        state_batch = torch.as_tensor(
            state_batch, dtype=torch.float32, device=self.device)
        next_state_batch = torch.as_tensor(
            next_state_batch, dtype=torch.float32, device=self.device)
        action_batch = torch.as_tensor(
            action_batch, dtype=torch.float32, device=self.device)
        reward_batch = torch.as_tensor(
            reward_batch, dtype=torch.float32, device=self.device).unsqueeze(1)
        done_int_batch = torch.as_tensor(
            done_int_batch, dtype=torch.float32, device=self.device).unsqueeze(1)

        ### Update Q
        with torch.no_grad():
            add_noise = torch.clamp(
                torch.randn_like(action_batch) * self.noise_policy,
                min=-self.noise_clip,
                max=self.noise_clip,
            )
            next_action = torch.clamp(
                self.target_policy(next_state_batch) + add_noise,
                min=-self.max_action,
                max=self.max_action,
            )

            target_q1, target_q2 = self.target_critic(next_state_batch,
                                                      next_action)
            target_q = torch.min(target_q1, target_q2)
            target_q = reward_batch + self.gamma * (1 -
                                                    done_int_batch) * target_q

        # Estimated state-action values
        q1, q2 = self.critic(state_batch, action_batch)
        q1_loss = self.criterion(q1, target_q)
        q2_loss = self.criterion(q2, target_q)

        # Optimize both Q-networks jointly on the summed loss
        critic_loss = q1_loss + q2_loss
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        writer.add_scalar("loss/q1", q1_loss.item(), update)
        writer.add_scalar("loss/q2", q2_loss.item(), update)

        # Delayed policy update (TD3): update the actor and the target networks
        # only every update_freq_policy critic updates
        if update % self.update_freq_policy == 0:
            action = self.policy(state_batch)
            q1, _ = self.critic(state_batch, action)
            policy_loss = -q1.mean()

            self.policy_opt.zero_grad()
            policy_loss.backward()
            self.policy_opt.step()

            writer.add_scalar("loss/policy", policy_loss.item(), update)

            # update target networks
            soft_update(self.target_critic, self.critic, self.tau)
            soft_update(self.target_policy, self.policy, self.tau)

    def eval(self):
        # Run the current policy for num_eval_episodes episodes
        # and return the average episode reward
        avg_reward = 0.0
        for _ in range(self.num_eval_episodes):
            self._reset_env()
            while not self.done:
                action = self.policy.get_action(np.array(self.state),
                                                self.device)
                next_state, reward, self.done, _ = self.env.step(action)
                avg_reward += reward
                self.state = next_state

        avg_reward /= self.num_eval_episodes

        print(
            f"Evaluation over {self.num_eval_episodes} episodes: {avg_reward:.3f}"
        )
        return avg_reward

    def plot_reward(self, data, mean_data):
        plt.plot(data, label="reward")
        plt.plot(mean_data, label="mean reward")
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.title(
            f"Reward evolution for {self.env.unwrapped.spec.id} Gym environment"
        )
        plt.tight_layout()
        plt.legend()

        path_fig = os.path.join(self.path_runs, "figure.png")
        plt.savefig(path_fig)
        print(f"Figure saved to {path_fig}")

        plt.show()
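
TD3Agent relies on several helpers defined elsewhere in the repository (create_run_folder, load_training_parameters, the Actor/Critic modules, soft_update, and ReplayBuffer). For the two most mechanical ones, a minimal sketch written against the interface used above; the repository's own implementations may differ:

import random
from collections import deque

import numpy as np


def soft_update(target_net, source_net, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for target_param, param in zip(target_net.parameters(),
                                   source_net.parameters()):
        target_param.data.copy_(tau * param.data +
                                (1.0 - tau) * target_param.data)


class ReplayBuffer:
    # Fixed-size FIFO buffer of transition namedtuples
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def __len__(self):
        return len(self.buffer)

    def store_transition(self, transition):
        self.buffer.append(transition)

    def sample_buffer(self, batch_size):
        # Return stacked arrays in the order unpacked by train_on_batch:
        # states, actions, rewards, done flags, next states
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, dones, next_states = map(np.array, zip(*batch))
        return states, actions, rewards, dones, next_states
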
Example #3
def main(seed_num):
    env = gym.make("Pendulum-v0")
    #env = RedEnv()
    env.seed(seed_num)
    torch.manual_seed(seed_num)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    print()
    print("env(PPO) : ", args.env_name)
    print("state_size : ", state_size)
    print("action_size : ", action_size)
    print()
    actor = Actor(state_size, action_size, args)
    critic = Critic(state_size, args)

    actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr)

    writer = SummaryWriter(args.logdir)

    recent_rewards = deque(maxlen=100)
    episodes = 0

    for n_iter in range(args.max_iter_num):
        trajectories = deque()
        steps = 0

        while steps < args.total_sample_size:
            done = False
            score = 0
            episodes += 1

            state = env.reset()  # start distribution
            state = np.reshape(state, [1, state_size])

            while not done:

                steps += 1

                env.render()

                mu, std = actor(torch.Tensor(state))
                action = actor.get_action(mu, std)
                next_state, reward, done, info = env.step(action)
                #print(reward)
                mask = 0 if done else 1
                trajectories.append((state, action, reward, mask))
                score += reward

                next_state = np.reshape(next_state, [1, state_size])
                state = next_state

                if done:
                    recent_rewards.append(score)
                    if n_iter % args.log_interval == 0:
                        writer.add_scalar('log/ep_len', steps, episodes)

        actor.train()
        critic.train()
        update(actor, critic, actor_optimizer, critic_optimizer, trajectories,
               state_size, action_size)

        writer.add_scalar('log/score', float(score), episodes)

        if n_iter % args.log_interval == 0:
            tmp = np.array(recent_rewards)
            mean_reward = tmp.mean()
            std_reward = tmp.std()
            min_reward = tmp.min()
            max_reward = tmp.max()
            print(
                '{} iter | {} episode | score_avg: {:.2f} | score_std: {:.2f} | score_min: {:.2f} | score_max: {:.2f}'
                .format(n_iter, episodes, mean_reward, std_reward, min_reward,
                        max_reward))

        if np.mean(recent_rewards) > args.goal_score:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)

            ckpt_path_a = os.path.join(args.save_path,
                                       f"{seed_num}th_model_a.pth.tar")
            ckpt_path_c = os.path.join(args.save_path,
                                       f"{seed_num}th_model_c.pth.tar")
            torch.save(actor.state_dict(), ckpt_path_a)
            torch.save(critic.state_dict(), ckpt_path_c)
            print('Learning Terminated')
            break
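
main() leaves the PPO update() helper and the argparse setup to other parts of the project. Whatever update() does internally, it needs per-timestep returns or advantages computed from the stored (state, action, reward, mask) tuples; below is a generic discounted-return helper over that trajectory format, shown for illustration only (gamma is an assumed hyperparameter name), plus a hypothetical entry point:

import torch


def compute_returns(rewards, masks, gamma=0.99):
    # Discounted return G_t = r_t + gamma * mask_t * G_{t+1};
    # mask = 0 at episode ends, so returns do not leak across episodes
    returns = torch.zeros(len(rewards))
    running_return = 0.0
    for t in reversed(range(len(rewards))):
        running_return = rewards[t] + gamma * masks[t] * running_return
        returns[t] = running_return
    return returns


if __name__ == "__main__":
    # Hypothetical: train PPO with a handful of random seeds
    for seed in (1, 2, 3):
        main(seed)
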