    def train(self, epochs):
        best_eval = -1e6
        for epoch in range(epochs):
            s = self.env.reset()
            s = self.state_normalize(s)
            # Placeholder losses so logging works before learning starts.
            policy_loss, critic_loss, alpha_loss = 0, 0, 0
            while True:
                self.env.render()
                a, _ = self.select_action(s)
                s_, r, done, _ = self.env.step(a)
                s_ = self.state_normalize(s_)
                self.memory.push(s, a, r, s_, done)
                self.total_step += 1
                # Update only after the warmup phase and once the replay
                # buffer holds more than one full batch.
                if (len(self.memory) > self.batch_size
                        and self.total_step > self.warmup_step):
                    policy_loss, critic_loss, alpha_loss = self.learn()

                s = s_
                if done:
                    break

            if (epoch + 1) % self.save_log_frequency == 0:
                self.writer.add_scalar('loss/critic_loss', critic_loss,
                                       self.total_step)
                self.writer.add_scalar('loss/policy_loss', policy_loss,
                                       self.total_step)
                self.writer.add_scalar('alpha',
                                       self.log_alpha.exp().item(),
                                       self.total_step)
                self.writer.add_scalar('loss/alpha_loss', alpha_loss,
                                       self.total_step)

            if (epoch + 1) % self.save_model_frequency == 0:
                save_model(
                    self.critic,
                    'model/{}_model/critic_{}'.format(self.env_name, epoch))
                save_model(
                    self.actor,
                    'model/{}_model/actor_{}'.format(self.env_name, epoch))
                ZFilter.save(
                    self.state_normalize,
                    'model/{}_model/rs_{}'.format(self.env_name, epoch))

            if (epoch + 1) % self.eval_frequency == 0:
                eval_r = self.evaluate()
                print('epoch', epoch, 'evaluate reward', eval_r)
                self.writer.add_scalar('reward', eval_r, self.total_step)
                if eval_r > best_eval:
                    best_eval = eval_r
                    save_model(
                        self.critic,
                        'model/{}_model/best_critic'.format(self.env_name))
                    save_model(
                        self.actor,
                        'model/{}_model/best_actor'.format(self.env_name))
                    ZFilter.save(
                        self.state_normalize,
                        'model/{}_model/best_rs'.format(self.env_name))
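The SAC loop above assumes a Memory replay buffer exposing push and __len__ (and, inside learn(), some way to draw a batch). The class itself is not shown in this listing; a minimal uniform-replay sketch under that assumption, with a hypothetical sample() method, could look like this:

import random
from collections import deque

import numpy as np


class Memory:
    """Minimal uniform replay buffer (sketch; the actual Memory class is not shown above)."""

    def __init__(self, capacity, batch_size):
        self.buffer = deque(maxlen=capacity)
        self.batch_size = batch_size

    def push(self, s, a, r, s_, done):
        # Append one transition; the deque evicts the oldest entry when full.
        self.buffer.append((s, a, r, s_, done))

    def sample(self):
        # Uniformly sample a batch and stack each field into a NumPy array.
        batch = random.sample(self.buffer, self.batch_size)
        s, a, r, s_, done = map(np.stack, zip(*batch))
        return s, a, r, s_, done

    def __len__(self):
        return len(self.buffer)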
    def __init__(self,
                 env_name,
                 env,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 sample_size=2048,
                 gamma=0.99,
                 lam=0.95,
                 is_test=False,
                 save_model_frequency=200,
                 eval_frequency=10):
        self.env_name = env_name
        self.env = env
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.sample_size = sample_size
        self.gamma = gamma
        self.lam = lam
        self.save_model_frequency = save_model_frequency
        self.eval_frequency = eval_frequency

        self.total_step = 0
        self.state_normalize = ZFilter(env.observation_space.shape[0])
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print('Train on device:', self.device)
        if not is_test:
            self.writer = SummaryWriter('./logs_epoch_update/A2C_{}'.format(
                self.env_name))
        self.loss_fn = F.smooth_l1_loss

        self.trace = Trace()
        self.actor = GaussianActor(
            env.observation_space.shape[0],
            env.action_space.shape[0],
            action_scale=int(env.action_space.high[0])).to(self.device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic = Critic(env.observation_space.shape[0]).to(self.device)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=self.critic_lr)
        print(self.actor)
        print(self.critic)
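Every agent in this listing wraps observations in a ZFilter from common.utils, whose implementation is not reproduced here. A rough sketch of the usual running mean/std normalizer, with save/load helpers matching how it is called in the training loops (an assumption, not the original class):

import pickle

import numpy as np


class ZFilter:
    """Running z-score observation normalizer (sketch of the assumed helper)."""

    def __init__(self, shape, clip=10.0):
        self.n = 0
        self.mean = np.zeros(shape)
        self.var = np.zeros(shape)
        self.clip = clip

    def __call__(self, x):
        # Welford-style running update of mean and variance, then normalize.
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.var += (delta * (x - self.mean) - self.var) / self.n
        z = (x - self.mean) / (np.sqrt(self.var) + 1e-8)
        return np.clip(z, -self.clip, self.clip)

    @staticmethod
    def save(rs, path):
        with open(path, 'wb') as f:
            pickle.dump(rs, f)

    @staticmethod
    def load(path):
        with open(path, 'rb') as f:
            return pickle.load(f)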
    def __init__(self,
                 env_name,
                 env,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 alpha_lr=3e-4,
                 gamma=0.99,
                 batch_size=64,
                 replay_memory_size=1e6,
                 update_frequency=2,
                 warmup_step=1e3,
                 tau=0.005,
                 alpha=None,
                 is_test=False,
                 save_model_frequency=200,
                 eval_frequency=10,
                 save_log_frequency=10):
        self.env_name = env_name
        self.env = env
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.alpha_lr = alpha_lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.replay_memory_size = replay_memory_size
        self.update_frequency = update_frequency
        self.warmup_step = warmup_step
        self.tau = tau
        self.save_model_frequency = save_model_frequency
        self.eval_frequency = eval_frequency
        self.save_log_frequency = save_log_frequency

        self.total_step = 0
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print('Train on device:', self.device)
        if not is_test:
            self.writer = SummaryWriter('./logs/SAC_{}'.format(self.env_name))
        self.loss_fn = F.mse_loss
        self.memory = Memory(int(replay_memory_size), batch_size)

        n_state = env.observation_space.shape[0]
        n_action = env.action_space.shape[0]
        self.state_normalize = ZFilter(n_state)
        if alpha is None:
            self.auto_tune_alpha = True
            # Standard SAC heuristic: target entropy = -dim(action space).
            self.target_entropy = -torch.prod(
                torch.Tensor(env.action_space.shape)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_opt = optim.Adam([self.log_alpha], lr=self.alpha_lr)
            print('Auto adjust alpha')
        else:
            self.auto_tune_alpha = False
            self.log_alpha = torch.log(torch.tensor(
                alpha, dtype=torch.float)).to(self.device)
            print('Fixed alpha')

        self.actor = SACGaussianActor(
            n_state, n_action, 256,
            action_scale=int(env.action_space.high[0])).to(self.device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic = TwinCritic(n_state + n_action, 256).to(self.device)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=self.critic_lr)
        self.target_critic = TwinCritic(n_state + n_action,
                                        256).to(self.device)
        # Start the target critic from the online critic's weights.
        update_model(self.target_critic, self.critic)

        print(self.actor)
        print(self.critic)
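update_model(self.target_critic, self.critic) is used here to start the target critic from the online critic's weights, and the tau=0.005 / update_frequency arguments point to a Polyak-averaged soft update inside learn(). Neither helper appears in this excerpt; a sketch under those assumptions:

import torch


def update_model(target, source):
    # Hard update: copy every parameter of the online network into the target.
    target.load_state_dict(source.state_dict())


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)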
import gym
from td3_algorithm import TD3

import sys
sys.path.append('..')
from common.utils import load_model, ZFilter

env_names = ['HalfCheetah-v2', 'Hopper-v2', 'Ant-v2']
env_name = env_names[2]
env = gym.make(env_name)
td3 = TD3(env_name, env, is_test=True)

# Restore the best checkpoint and the matching observation-normalizer statistics.
load_model(td3.actor, 'model/{}_model/best_actor'.format(env_name))
load_model(td3.critic, 'model/{}_model/best_critic'.format(env_name))
td3.state_normalize = ZFilter.load('model/{}_model/best_rs'.format(env_name))

# Roll out 10 rendered evaluation episodes with the restored policy.
for _ in range(10):
    eval_r = td3.evaluate(1, is_render=True)
    print('evaluate reward', eval_r)
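The training loops call save_model and the test script restores checkpoints with load_model from common.utils; neither function is included in this listing. They presumably just serialize and restore state_dicts, along these lines:

import os

import torch


def save_model(model, path):
    # Create the checkpoint directory if needed, then store only the weights.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(model.state_dict(), path)


def load_model(model, path):
    # Restore weights in place; map_location lets GPU checkpoints load on CPU.
    model.load_state_dict(torch.load(path, map_location='cpu'))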
    def train(self, epochs):
        best_eval = -1e6
        for epoch in range(epochs):
            num_sample = 0
            self.trace.clear()
            s = self.env.reset()
            s = self.state_normalize(s)
            while True:
                # self.env.render()
                a, log_prob = self.select_action(s)
                # Sum per-dimension log-probabilities into one log-probability per action.
                log_prob = torch.sum(log_prob, dim=1, keepdim=True)
                v = self.critic(
                    torch.tensor(s, dtype=torch.float).unsqueeze(0).to(
                        self.device))
                s_, r, done, _ = self.env.step(a)
                s_ = self.state_normalize(s_)
                # Store `not done` as a continuation mask for bootstrapping in learn().
                self.trace.push(s, a,
                                log_prob.cpu().detach().numpy()[0], r, s_,
                                not done, v)
                num_sample += 1
                self.total_step += 1
                s = s_
                # Stop collecting only at an episode boundary, once at least
                # sample_size transitions have been gathered.
                if done and num_sample >= self.sample_size:
                    break
                if done:
                    s = self.env.reset()
                    s = self.state_normalize(s)

            policy_loss, critic_loss = self.learn()

            if (epoch + 1) % self.save_log_frequency == 0:
                self.writer.add_scalar('loss/critic_loss', critic_loss,
                                       self.total_step)
                self.writer.add_scalar('loss/policy_loss', policy_loss,
                                       self.total_step)

            if (epoch + 1) % self.save_model_frequency == 0:
                save_model(
                    self.critic,
                    'model/{}_model/critic_{}'.format(self.env_name, epoch))
                save_model(
                    self.actor,
                    'model/{}_model/actor_{}'.format(self.env_name, epoch))
                ZFilter.save(
                    self.state_normalize,
                    'model/{}_model/rs_{}'.format(self.env_name, epoch))

            if (epoch + 1) % self.eval_frequency == 0:
                eval_r = self.evaluate()
                print('epoch', epoch, 'evaluate reward', eval_r)
                self.writer.add_scalar('reward', eval_r, self.total_step)
                if eval_r > best_eval:
                    best_eval = eval_r
                    save_model(
                        self.critic,
                        'model/{}_model/best_critic'.format(self.env_name))
                    save_model(
                        self.actor,
                        'model/{}_model/best_actor'.format(self.env_name))
                    ZFilter.save(
                        self.state_normalize,
                        'model/{}_model/best_rs'.format(self.env_name))
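The on-policy loop above stores rewards, value estimates, and `not done` continuation masks in self.trace, and the gamma/lam hyperparameters suggest generalized advantage estimation inside learn(). A minimal GAE(lambda) sketch under that assumption (masks are 1.0 while an episode continues, 0.0 at termination):

import numpy as np


def compute_gae(rewards, values, masks, last_value, gamma=0.99, lam=0.95):
    """Generalized advantage estimation over one collected batch (sketch)."""
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        # TD residual; the mask removes the bootstrap term at episode ends.
        delta = rewards[t] + gamma * next_value * masks[t] - values[t]
        gae = delta + gamma * lam * masks[t] * gae
        advantages[t] = gae
        next_value = values[t]
    returns = advantages + np.asarray(values, dtype=np.float32)
    return advantages, returns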
    def __init__(self,
                 env_name,
                 env,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 sample_size=2048,
                 batch_size=64,
                 sample_reuse=1,
                 train_iters=5,
                 clip=0.2,
                 gamma=0.99,
                 lam=0.95,
                 is_test=False,
                 save_model_frequency=200,
                 eval_frequency=5,
                 save_log_frequency=1):
        self.env_name = env_name
        self.env = env
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.sample_size = sample_size
        self.batch_size = batch_size
        self.sample_reuse = sample_reuse
        self.train_iters = train_iters
        self.clip = clip
        self.gamma = gamma
        self.lam = lam
        self.save_model_frequency = save_model_frequency
        self.eval_frequency = eval_frequency
        self.save_log_frequency = save_log_frequency

        self.total_step = 0
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print('Train on device:', self.device)
        if not is_test:
            self.writer = SummaryWriter('./logs/PPO_{}'.format(self.env_name))
        self.loss_fn = F.mse_loss

        n_state = env.observation_space.shape[0]
        n_action = env.action_space.shape[0]
        self.state_normalize = ZFilter(n_state)
        self.actor = GaussianActor(n_state,
                                   n_action,
                                   128,
                                   action_scale=int(env.action_space.high[0]),
                                   weights_init_=orthogonal_weights_init_).to(
                                       self.device)
        self.critic = Critic(n_state, 128,
                             orthogonal_weights_init_).to(self.device)

        # self.optimizer = optim.Adam([
        #     {'params': self.critic.parameters(), 'lr': self.critic_lr},
        #     {'params': self.actor.parameters(), 'lr': self.actor_lr}
        # ])
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=self.critic_lr)
        self.trace = Trace()

        print(self.actor)
        print(self.critic)
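orthogonal_weights_init_ is passed to both PPO networks but is not defined in this excerpt. The usual pattern is orthogonal initialization of the linear layers with zero biases, roughly as sketched below (an assumption about the helper, not its actual definition):

import numpy as np
import torch.nn as nn


def orthogonal_weights_init_(module, gain=np.sqrt(2)):
    # Orthogonal weights and zero biases for every linear layer (sketch).
    if isinstance(module, nn.Linear):
        nn.init.orthogonal_(module.weight, gain=gain)
        nn.init.constant_(module.bias, 0.0)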