def train(self, epochs):
    best_eval = -1e6
    for epoch in range(epochs):
        s = self.env.reset()
        s = self.state_normalize(s)
        policy_loss, critic_loss, alpha_loss = 0, 0, 0
        while True:
            self.env.render()
            a, _ = self.select_action(s)
            s_, r, done, _ = self.env.step(a)
            s_ = self.state_normalize(s_)
            # Store the transition in the replay buffer.
            self.memory.push(s, a, r, s_, done)
            self.total_step += 1
            # Start updating once the buffer holds more than one batch and the
            # warm-up period has passed.
            if len(self.memory) > self.batch_size and self.total_step > self.warmup_step:
                policy_loss, critic_loss, alpha_loss = self.learn()
            s = s_
            if done:
                break
        # Periodic TensorBoard logging of the latest losses and temperature.
        if (epoch + 1) % self.save_log_frequency == 0:
            self.writer.add_scalar('loss/critic_loss', critic_loss, self.total_step)
            self.writer.add_scalar('loss/policy_loss', policy_loss, self.total_step)
            self.writer.add_scalar('alpha', self.log_alpha.exp().item(), self.total_step)
            self.writer.add_scalar('loss/alpha_loss', alpha_loss, self.total_step)
        # Periodic checkpoints of the networks and the state normalizer.
        if (epoch + 1) % self.save_model_frequency == 0:
            save_model(self.critic,
                       'model/{}_model/critic_{}'.format(self.env_name, epoch))
            save_model(self.actor,
                       'model/{}_model/actor_{}'.format(self.env_name, epoch))
            ZFilter.save(self.state_normalize,
                         'model/{}_model/rs_{}'.format(self.env_name, epoch))
        # Periodic evaluation; keep the best-performing weights separately.
        if (epoch + 1) % self.eval_frequency == 0:
            eval_r = self.evaluate()
            print('epoch', epoch, 'evaluate reward', eval_r)
            self.writer.add_scalar('reward', eval_r, self.total_step)
            if eval_r > best_eval:
                best_eval = eval_r
                save_model(self.critic,
                           'model/{}_model/best_critic'.format(self.env_name))
                save_model(self.actor,
                           'model/{}_model/best_actor'.format(self.env_name))
                ZFilter.save(self.state_normalize,
                             'model/{}_model/best_rs'.format(self.env_name))
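
# A note on the replay buffer used in the loop above: `Memory` is not shown in
# this snippet. Below is a minimal sketch of a uniform-sampling buffer with the
# push/sample/__len__ interface the loop relies on; the class layout here is an
# assumption, not the repo's actual implementation.
import random
from collections import deque

import numpy as np


class Memory:
    """Fixed-capacity FIFO replay buffer with uniform random sampling."""

    def __init__(self, capacity, batch_size):
        self.buffer = deque(maxlen=capacity)
        self.batch_size = batch_size

    def push(self, s, a, r, s_, done):
        # Append one transition; the oldest one is evicted once capacity is hit.
        self.buffer.append((s, a, r, s_, done))

    def sample(self):
        # Draw a random mini-batch and stack each field into an array.
        batch = random.sample(self.buffer, self.batch_size)
        s, a, r, s_, done = map(np.stack, zip(*batch))
        return s, a, r, s_, done

    def __len__(self):
        return len(self.buffer)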
def __init__(self,
             env_name,
             env,
             actor_lr=3e-4,
             critic_lr=3e-4,
             sample_size=2048,
             gamma=0.99,
             lam=0.95,
             is_test=False,
             save_model_frequency=200,
             eval_frequency=10):
    self.env_name = env_name
    self.env = env
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.sample_size = sample_size
    self.gamma = gamma
    self.lam = lam
    self.save_model_frequency = save_model_frequency
    self.eval_frequency = eval_frequency
    self.total_step = 0
    self.state_normalize = ZFilter(env.observation_space.shape[0])
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Train on device:', self.device)
    if not is_test:
        self.writer = SummaryWriter('./logs_epoch_update/A2C_{}'.format(self.env_name))
    self.loss_fn = F.smooth_l1_loss
    self.trace = Trace()
    self.actor = GaussianActor(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        action_scale=int(env.action_space.high[0])).to(self.device)
    self.actor_opt = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
    self.critic = Critic(env.observation_space.shape[0]).to(self.device)
    self.critic_opt = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
    print(self.actor)
    print(self.critic)
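
# `select_action` is referenced by the training loops but not shown here. For a
# diagonal Gaussian policy such as GaussianActor it typically samples from a
# Normal distribution parameterized by the network and returns the action
# together with its per-dimension log-probability. Below is a standalone sketch
# under that assumption; the actor's forward signature and the clamping to the
# action range are guesses, not the repo's API.
import torch
from torch.distributions import Normal


def select_action(actor, state, device, action_scale=1.0):
    state_t = torch.tensor(state, dtype=torch.float).unsqueeze(0).to(device)
    mean, std = actor(state_t)            # assumed: actor returns mean and std
    dist = Normal(mean, std)
    raw_action = dist.sample()
    log_prob = dist.log_prob(raw_action)  # shape (1, n_action); summed later by the caller
    action = torch.clamp(raw_action, -action_scale, action_scale)
    return action.detach().cpu().numpy()[0], log_prob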
def __init__(self,
             env_name,
             env,
             actor_lr=3e-4,
             critic_lr=3e-4,
             alpha_lr=3e-4,
             gamma=0.99,
             batch_size=64,
             replay_memory_size=1e6,
             update_frequency=2,
             warmup_step=1e3,
             tau=0.005,
             alpha=None,
             is_test=False,
             save_model_frequency=200,
             eval_frequency=10,
             save_log_frequency=10):
    self.env_name = env_name
    self.env = env
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.alpha_lr = alpha_lr
    self.gamma = gamma
    self.batch_size = batch_size
    self.replay_memory_size = replay_memory_size
    self.update_frequency = update_frequency
    self.warmup_step = warmup_step
    self.tau = tau
    self.save_model_frequency = save_model_frequency
    self.eval_frequency = eval_frequency
    self.save_log_frequency = save_log_frequency
    self.total_step = 0
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Train on device:', self.device)
    if not is_test:
        self.writer = SummaryWriter('./logs/SAC_{}'.format(self.env_name))
    self.loss_fn = F.mse_loss
    self.memory = Memory(int(replay_memory_size), batch_size)
    n_state, n_action = env.observation_space.shape[0], env.action_space.shape[0]
    self.state_normalize = ZFilter(n_state)
    if alpha is None:
        # Automatic entropy temperature tuning: the target entropy is set to
        # -dim(action space) and log(alpha) is learned with its own optimizer.
        self.auto_tune_alpha = True
        self.target_entropy = -torch.prod(
            torch.Tensor(env.action_space.shape)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_opt = optim.Adam([self.log_alpha], lr=self.alpha_lr)
        print('Auto adjust alpha')
    else:
        # Fixed temperature: log(alpha) is a constant, no alpha optimizer.
        self.auto_tune_alpha = False
        self.log_alpha = torch.log(
            torch.tensor(alpha, dtype=torch.float)).to(self.device)
        print('Fixed alpha')
    self.actor = SACGaussianActor(
        n_state, n_action, 256,
        action_scale=int(env.action_space.high[0])).to(self.device)
    self.actor_opt = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
    self.critic = TwinCritic(n_state + n_action, 256).to(self.device)
    self.critic_opt = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
    self.target_critic = TwinCritic(n_state + n_action, 256).to(self.device)
    # Initialize the target critic with the online critic's weights.
    update_model(self.target_critic, self.critic)
    print(self.actor)
    print(self.critic)
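
# `update_model(self.target_critic, self.critic)` above copies the online
# critic's weights into the target critic. A minimal sketch of such a helper,
# including the Polyak-averaged (soft) variant that SAC normally applies during
# training with a small tau; the signature is an assumption, not the repo's API.
def update_model(target, source, tau=1.0):
    # tau = 1.0 -> hard copy; 0 < tau < 1 -> soft (Polyak) update.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)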
import gym

from td3_algorithm import TD3

import sys
sys.path.append('..')
from common.utils import load_model, ZFilter

env_names = ['HalfCheetah-v2', 'Hopper-v2', 'Ant-v2']
env_name = env_names[2]
env = gym.make(env_name)

# Rebuild the agent in test mode and restore the best checkpoint plus the
# state normalizer that was fitted during training.
td3 = TD3(env_name, env, is_test=True)
load_model(td3.actor, 'model/{}_model/best_actor'.format(env_name))
load_model(td3.critic, 'model/{}_model/best_critic'.format(env_name))
td3.state_normalize = ZFilter.load('model/{}_model/best_rs'.format(env_name))

for _ in range(10):
    eval_r = td3.evaluate(1, is_render=True)
    print('evaluate reward', eval_r)
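
# `ZFilter` (imported from common.utils) is a running mean/std observation
# normalizer; it is saved alongside the model during training and restored at
# test time so states are scaled with the same statistics. Below is a rough
# sketch of such a filter, assuming Welford-style running statistics and
# pickle-based persistence; the real implementation may differ.
import pickle

import numpy as np


class ZFilter:
    def __init__(self, shape, clip=10.0):
        self.n = 0
        self.mean = np.zeros(shape)
        self.M2 = np.zeros(shape)  # running sum of squared deviations
        self.clip = clip

    def __call__(self, x):
        # Update the running mean/variance, then return the standardized state.
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.M2 += delta * (x - self.mean)
        std = np.sqrt(self.M2 / self.n) if self.n > 1 else np.ones_like(self.mean)
        return np.clip((x - self.mean) / (std + 1e-8), -self.clip, self.clip)

    @staticmethod
    def save(zfilter, path):
        with open(path, 'wb') as f:
            pickle.dump(zfilter, f)

    @staticmethod
    def load(path):
        with open(path, 'rb') as f:
            return pickle.load(f)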
def train(self, epochs):
    best_eval = -1e6
    for epoch in range(epochs):
        num_sample = 0
        self.trace.clear()
        s = self.env.reset()
        s = self.state_normalize(s)
        while True:
            # self.env.render()
            a, log_prob = self.select_action(s)
            # Sum the per-dimension log-probabilities into a single value.
            log_prob = torch.sum(log_prob, dim=1, keepdim=True)
            v = self.critic(
                torch.tensor(s, dtype=torch.float).unsqueeze(0).to(self.device))
            s_, r, done, _ = self.env.step(a)
            s_ = self.state_normalize(s_)
            self.trace.push(s, a, log_prob.cpu().detach().numpy()[0], r, s_,
                            not done, v)
            num_sample += 1
            self.total_step += 1
            s = s_
            # Stop collecting only at an episode boundary once enough samples
            # have been gathered; otherwise start a new episode and keep going.
            if done and num_sample >= self.sample_size:
                break
            if done:
                s = self.env.reset()
                s = self.state_normalize(s)
        policy_loss, critic_loss = self.learn()
        if (epoch + 1) % self.save_log_frequency == 0:
            self.writer.add_scalar('loss/critic_loss', critic_loss, self.total_step)
            self.writer.add_scalar('loss/policy_loss', policy_loss, self.total_step)
        if (epoch + 1) % self.save_model_frequency == 0:
            save_model(self.critic,
                       'model/{}_model/critic_{}'.format(self.env_name, epoch))
            save_model(self.actor,
                       'model/{}_model/actor_{}'.format(self.env_name, epoch))
            ZFilter.save(self.state_normalize,
                         'model/{}_model/rs_{}'.format(self.env_name, epoch))
        if (epoch + 1) % self.eval_frequency == 0:
            eval_r = self.evaluate()
            print('epoch', epoch, 'evaluate reward', eval_r)
            self.writer.add_scalar('reward', eval_r, self.total_step)
            if eval_r > best_eval:
                best_eval = eval_r
                save_model(self.critic,
                           'model/{}_model/best_critic'.format(self.env_name))
                save_model(self.actor,
                           'model/{}_model/best_actor'.format(self.env_name))
                ZFilter.save(self.state_normalize,
                             'model/{}_model/best_rs'.format(self.env_name))
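
# `learn()` is not shown in this snippet. Since the trace above stores rewards,
# state values and a `not done` mask, and the constructor defines gamma and lam,
# the advantages are presumably computed with Generalized Advantage Estimation
# (GAE). Below is a standalone sketch of that computation; the function name and
# array layout are assumptions for illustration.
import numpy as np


def compute_gae(rewards, values, masks, last_value, gamma=0.99, lam=0.95):
    """Return GAE advantages and the corresponding discounted returns."""
    advantages = np.zeros_like(rewards)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        delta = rewards[t] + gamma * next_value * masks[t] - values[t]
        # A_t = delta_t + gamma * lam * mask_t * A_{t+1}
        gae = delta + gamma * lam * masks[t] * gae
        advantages[t] = gae
        next_value = values[t]
    returns = advantages + values
    return advantages, returns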
def __init__(self,
             env_name,
             env,
             actor_lr=3e-4,
             critic_lr=3e-4,
             sample_size=2048,
             batch_size=64,
             sample_reuse=1,
             train_iters=5,
             clip=0.2,
             gamma=0.99,
             lam=0.95,
             is_test=False,
             save_model_frequency=200,
             eval_frequency=5,
             save_log_frequency=1):
    self.env_name = env_name
    self.env = env
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.sample_size = sample_size
    self.batch_size = batch_size
    self.sample_reuse = sample_reuse
    self.train_iters = train_iters
    self.clip = clip
    self.gamma = gamma
    self.lam = lam
    self.save_model_frequency = save_model_frequency
    self.eval_frequency = eval_frequency
    self.save_log_frequency = save_log_frequency
    self.total_step = 0
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Train on device:', self.device)
    if not is_test:
        self.writer = SummaryWriter('./logs/PPO_{}'.format(self.env_name))
    self.loss_fn = F.mse_loss
    n_state, n_action = env.observation_space.shape[0], env.action_space.shape[0]
    self.state_normalize = ZFilter(n_state)
    self.actor = GaussianActor(
        n_state, n_action, 128,
        action_scale=int(env.action_space.high[0]),
        weights_init_=orthogonal_weights_init_).to(self.device)
    self.critic = Critic(n_state, 128, orthogonal_weights_init_).to(self.device)
    # self.optimizer = optim.Adam([
    #     {'params': self.critic.parameters(), 'lr': self.critic_lr},
    #     {'params': self.actor.parameters(), 'lr': self.actor_lr}
    # ])
    self.actor_opt = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
    self.critic_opt = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
    self.trace = Trace()
    print(self.actor)
    print(self.critic)
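
# The `clip` parameter above is the PPO clipping range epsilon, while
# `sample_reuse`, `train_iters` and `batch_size` control how the collected
# samples are reused in mini-batch updates. The update method itself is not
# shown; below is a minimal sketch of the standard clipped surrogate policy
# loss it presumably optimizes, with tensor names chosen for illustration.
import torch


def ppo_policy_loss(new_log_prob, old_log_prob, advantages, clip=0.2):
    # Probability ratio r_t(theta) = pi_theta(a_t|s_t) / pi_theta_old(a_t|s_t).
    ratio = torch.exp(new_log_prob - old_log_prob)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip, 1.0 + clip) * advantages
    # Maximize the clipped surrogate objective, i.e. minimize its negation.
    return -torch.min(surr1, surr2).mean()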