def __init__(self, buffer, env, act_fn, device,
             learning_starts=1000, update_period=1):
    """Init."""
    self.env = ensure_vec_env(env)
    if self.env.num_envs > 1 and not isinstance(buffer, BatchedReplayBuffer):
        raise ValueError(
            "when num_envs > 1, you must pass a BatchedReplayBuffer"
            " to the ReplayBufferDataManager.")
    if not isinstance(buffer, BatchedReplayBuffer):
        buffer = BatchedReplayBuffer(buffer)
    if self.env.num_envs != buffer.n:
        raise ValueError(
            f"Found {self.env.num_envs} envs and {buffer.n} "
            "buffers. The number of envs must be equal to the "
            "number of buffers!")
    self.act = act_fn
    self.buffer = buffer
    self.device = device
    self.learning_starts = learning_starts
    self.update_period = update_period
    self._ob = None
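# --- Illustrative example (not part of the library) -------------------------
# The check above requires one ReplayBuffer per environment. A minimal sketch
# of satisfying it, using the ReplayBuffer/BatchedReplayBuffer constructors
# shown elsewhere in this file; `_example_make_buffers` is a hypothetical
# helper used only for illustration.
def _example_make_buffers(nenv, buffer_size=10000, frame_stack=1):
    """Build one ReplayBuffer per env and batch them together."""
    return BatchedReplayBuffer(
        *[ReplayBuffer(buffer_size, frame_stack) for _ in range(nenv)])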
class SAC(Algorithm):
    """SAC algorithm."""

    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=10000,
                 frame_stack=1,
                 learning_starts=1000,
                 update_period=1,
                 batch_size=256,
                 policy_lr=1e-3,
                 qf_lr=1e-3,
                 gamma=0.99,
                 target_update_period=1,
                 policy_update_period=1,
                 target_smoothing_coef=0.005,
                 alpha=0.2,
                 automatic_entropy_tuning=True,
                 target_entropy=None,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.batch_size = batch_size
        if target_update_period < self.update_period:
            self.target_update_period = self.update_period
        else:
            self.target_update_period = target_update_period - (
                target_update_period % self.update_period)
        if policy_update_period < self.update_period:
            self.policy_update_period = self.update_period
        else:
            self.policy_update_period = policy_update_period - (
                policy_update_period % self.update_period)
        self.target_smoothing_coef = target_smoothing_coef
        self.log_period = log_period

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf1 = qf_fn(eval_env)
        self.qf2 = qf_fn(eval_env)
        self.target_qf1 = qf_fn(eval_env)
        self.target_qf2 = qf_fn(eval_env)

        self.pi.to(self.device)
        self.qf1.to(self.device)
        self.qf2.to(self.device)
        self.target_qf1.to(self.device)
        self.target_qf2.to(self.device)

        self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr)
        self.opt_qf1 = optimizer(self.qf1.parameters(), lr=qf_lr)
        self.opt_qf2 = optimizer(self.qf2.parameters(), lr=qf_lr)

        self.target_qf1.load_state_dict(self.qf1.state_dict())
        self.target_qf2.load_state_dict(self.qf2.state_dict())

        self.buffer = BatchedReplayBuffer(*[
            ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)
        ])
        self.data_manager = ReplayBufferDataManager(self.buffer,
                                                    self.env,
                                                    SACActor(self.pi),
                                                    self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.alpha = alpha
        self.automatic_entropy_tuning = automatic_entropy_tuning
        if self.automatic_entropy_tuning:
            if target_entropy:
                self.target_entropy = target_entropy
            else:
                target_entropies = nest.map_structure(
                    lambda space: -np.prod(space.shape).item(),
                    misc.unpack_space(self.env.action_space))
                self.target_entropy = sum(nest.flatten(target_entropies))

            self.log_alpha = torch.tensor(np.log([self.alpha]),
                                          requires_grad=True,
                                          device=self.device,
                                          dtype=torch.float32)
            self.opt_alpha = optimizer([self.log_alpha], lr=policy_lr)
        else:
            self.target_entropy = None
            self.log_alpha = None
            self.opt_alpha = None

        self.mse_loss = torch.nn.MSELoss()

        self.t = 0

    def loss(self, batch):
        """Loss function."""
        pi_out = self.pi(batch['obs'], reparameterization_trick=True)
        logp = pi_out.dist.log_prob(pi_out.action)
        q1 = self.qf1(batch['obs'], batch['action']).value
        q2 = self.qf2(batch['obs'], batch['action']).value

        # alpha loss
        if self.automatic_entropy_tuning:
            ent_error = logp + self.target_entropy
            alpha_loss = -(self.log_alpha * ent_error.detach()).mean()
            self.opt_alpha.zero_grad()
            alpha_loss.backward()
            self.opt_alpha.step()
            alpha = self.log_alpha.exp()
        else:
            alpha = self.alpha
            alpha_loss = 0

        # qf loss
        with torch.no_grad():
            next_pi_out = self.pi(batch['next_obs'])
            next_ac_logp = next_pi_out.dist.log_prob(next_pi_out.action)
            q1_next = self.target_qf1(batch['next_obs'],
                                      next_pi_out.action).value
            q2_next = self.target_qf2(batch['next_obs'],
                                      next_pi_out.action).value
            qnext = torch.min(q1_next, q2_next) - alpha * next_ac_logp
            qtarg = batch['reward'] + (1.0 - batch['done']) * self.gamma * qnext

        assert qtarg.shape == q1.shape
        assert qtarg.shape == q2.shape
        qf1_loss = self.mse_loss(q1, qtarg)
        qf2_loss = self.mse_loss(q2, qtarg)

        # pi loss
        pi_loss = None
        if self.t % self.policy_update_period == 0:
            q1_pi = self.qf1(batch['obs'], pi_out.action).value
            q2_pi = self.qf2(batch['obs'], pi_out.action).value
            min_q_pi = torch.min(q1_pi, q2_pi)
            assert min_q_pi.shape == logp.shape
            pi_loss = (alpha * logp - min_q_pi).mean()

            # log pi loss about as frequently as other losses
            if self.t % self.log_period < self.policy_update_period:
                logger.add_scalar('loss/pi', pi_loss, self.t, time.time())

        if self.t % self.log_period < self.update_period:
            if self.automatic_entropy_tuning:
                logger.add_scalar('alg/log_alpha',
                                  self.log_alpha.detach().cpu().numpy(),
                                  self.t, time.time())
                scalars = {
                    "target": self.target_entropy,
                    "entropy": -torch.mean(logp.detach()).cpu().numpy().item()
                }
                logger.add_scalars('alg/entropy', scalars, self.t, time.time())
            else:
                logger.add_scalar(
                    'alg/entropy',
                    -torch.mean(logp.detach()).cpu().numpy().item(),
                    self.t, time.time())
            logger.add_scalar('loss/qf1', qf1_loss, self.t, time.time())
            logger.add_scalar('loss/qf2', qf2_loss, self.t, time.time())
            logger.add_scalar('alg/qf1', q1.mean().detach().cpu().numpy(),
                              self.t, time.time())
            logger.add_scalar('alg/qf2', q2.mean().detach().cpu().numpy(),
                              self.t, time.time())
        return pi_loss, qf1_loss, qf2_loss

    def step(self):
        """Step optimization."""
        self.t += self.data_manager.step_until_update()
        if self.t % self.target_update_period == 0:
            soft_target_update(self.target_qf1, self.qf1,
                               self.target_smoothing_coef)
            soft_target_update(self.target_qf2, self.qf2,
                               self.target_smoothing_coef)

        if self.t % self.update_period == 0:
            batch = self.data_manager.sample(self.batch_size)

            pi_loss, qf1_loss, qf2_loss = self.loss(batch)

            # update
            if pi_loss is not None:
                self.opt_pi.zero_grad()
                pi_loss.backward()
                self.opt_pi.step()

            self.opt_qf1.zero_grad()
            qf1_loss.backward()
            self.opt_qf1.step()

            self.opt_qf2.zero_grad()
            qf2_loss.backward()
            self.opt_qf2.step()

        return self.t

    def evaluate(self):
        """Evaluate."""
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi.eval()
        misc.set_env_to_eval_mode(eval_env)

        # Eval policy
        os.makedirs(os.path.join(self.logdir, 'eval'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'eval',
                               self.ckptr.format.format(self.t) + '.json')
        stats = rl_evaluate(eval_env, self.pi, self.eval_num_episodes,
                            outfile, self.device)
        logger.add_scalar('eval/mean_episode_reward', stats['mean_reward'],
                          self.t, time.time())
        logger.add_scalar('eval/mean_episode_length', stats['mean_length'],
                          self.t, time.time())

        # Record policy
        os.makedirs(os.path.join(self.logdir, 'video'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'video',
                               self.ckptr.format.format(self.t) + '.mp4')
        rl_record(eval_env, self.pi, self.record_num_episodes, outfile,
                  self.device)

        self.pi.train()
        misc.set_env_to_train_mode(self.env)
        self.data_manager.manual_reset()

    def save(self):
        """Save."""
        state_dict = {
            'pi': self.pi.state_dict(),
            'qf1': self.qf1.state_dict(),
            'qf2': self.qf2.state_dict(),
            'target_qf1': self.target_qf1.state_dict(),
            'target_qf2': self.target_qf2.state_dict(),
            'opt_pi': self.opt_pi.state_dict(),
            'opt_qf1': self.opt_qf1.state_dict(),
            'opt_qf2': self.opt_qf2.state_dict(),
            'log_alpha': (self.log_alpha if self.automatic_entropy_tuning
                          else None),
            'opt_alpha': (self.opt_alpha.state_dict()
                          if self.automatic_entropy_tuning else None),
            'env': misc.env_state_dict(self.env),
            't': self.t
        }
        buffer_dict = self.buffer.state_dict()
        state_dict['buffer_format'] = nest.get_structure(buffer_dict)
        self.ckptr.save(state_dict, self.t)

        # save buffer separately and only once (because it can be huge)
        np.savez(
            os.path.join(self.ckptr.ckptdir, 'buffer.npz'),
            **{f'{i:04d}': x for i, x in enumerate(nest.flatten(buffer_dict))})

    def load(self, t=None):
        """Load."""
        state_dict = self.ckptr.load(t)
        if state_dict is None:
            self.t = 0
            return self.t
        self.pi.load_state_dict(state_dict['pi'])
        self.qf1.load_state_dict(state_dict['qf1'])
        self.qf2.load_state_dict(state_dict['qf2'])
        self.target_qf1.load_state_dict(state_dict['target_qf1'])
        self.target_qf2.load_state_dict(state_dict['target_qf2'])

        self.opt_pi.load_state_dict(state_dict['opt_pi'])
        self.opt_qf1.load_state_dict(state_dict['opt_qf1'])
        self.opt_qf2.load_state_dict(state_dict['opt_qf2'])

        if state_dict['log_alpha'] is not None:
            with torch.no_grad():
                self.log_alpha.copy_(state_dict['log_alpha'])
            self.opt_alpha.load_state_dict(state_dict['opt_alpha'])
        misc.env_load_state_dict(self.env, state_dict['env'])
        self.t = state_dict['t']

        buffer_format = state_dict['buffer_format']
        buffer_state = dict(
            np.load(os.path.join(self.ckptr.ckptdir, 'buffer.npz'),
                    allow_pickle=True))
        buffer_state = nest.flatten(buffer_state)
        self.buffer.load_state_dict(
            nest.pack_sequence_as(buffer_state, buffer_format))
        self.data_manager.manual_reset()

        return self.t

    def close(self):
        """Close environment."""
        try:
            self.env.close()
        except Exception:
            pass
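# --- Illustrative example (not part of the library) -------------------------
# A minimal, self-contained sketch of the soft Bellman target computed inside
# SAC.loss above: the target uses the minimum of the two target critics minus
# the entropy term alpha * log pi(a'|s'). `_example_sac_q_target` is a
# hypothetical helper for illustration only.
import torch


def _example_sac_q_target(reward, done, gamma, alpha,
                          q1_next, q2_next, next_logp):
    """Compute r + (1 - done) * gamma * (min(Q1', Q2') - alpha * log pi')."""
    qnext = torch.min(q1_next, q2_next) - alpha * next_logp
    return reward + (1.0 - done) * gamma * qnext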
class DDPG(Algorithm):
    """DDPG algorithm."""

    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=10000,
                 frame_stack=1,
                 learning_starts=1000,
                 update_period=1,
                 batch_size=256,
                 policy_lr=1e-4,
                 qf_lr=1e-3,
                 qf_weight_decay=0.01,
                 gamma=0.99,
                 noise_theta=0.15,
                 noise_sigma=0.2,
                 noise_sigma_final=0.01,
                 noise_decay_period=10000,
                 target_update_period=1,
                 target_smoothing_coef=0.005,
                 reward_scale=1,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.batch_size = batch_size
        if target_update_period < self.update_period:
            self.target_update_period = self.update_period
        else:
            self.target_update_period = target_update_period - (
                target_update_period % self.update_period)
        self.reward_scale = reward_scale
        self.target_smoothing_coef = target_smoothing_coef
        self.log_period = log_period

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

        self.t = 0
        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        self.policy_fn = policy_fn
        self.qf_fn = qf_fn
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf = qf_fn(eval_env)
        self.target_pi = policy_fn(eval_env)
        self.target_qf = qf_fn(eval_env)

        self.pi.to(self.device)
        self.qf.to(self.device)
        self.target_pi.to(self.device)
        self.target_qf.to(self.device)

        self.optimizer = optimizer
        self.policy_lr = policy_lr
        self.qf_lr = qf_lr
        self.qf_weight_decay = qf_weight_decay
        self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr)
        self.opt_qf = optimizer(self.qf.parameters(), lr=qf_lr,
                                weight_decay=qf_weight_decay)

        self.target_pi.load_state_dict(self.pi.state_dict())
        self.target_qf.load_state_dict(self.qf.state_dict())

        self.noise_schedule = LinearSchedule(noise_decay_period,
                                             noise_sigma_final, noise_sigma)
        self._actor = DDPGActor(self.pi, self.env.action_space, noise_theta,
                                self.noise_schedule.value(self.t))
        self.buffer = BatchedReplayBuffer(*[
            ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)
        ])
        self.data_manager = ReplayBufferDataManager(self.buffer,
                                                    self.env,
                                                    self._actor,
                                                    self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.qf_criterion = torch.nn.MSELoss()
        if self.env.action_space.__class__.__name__ == 'Discrete':
            raise ValueError("Action space must be continuous!")

    def loss(self, batch):
        """Loss function."""
        # compute QFunction loss.
        with torch.no_grad():
            target_action = self.target_pi(batch['next_obs']).action
            target_q = self.target_qf(batch['next_obs'], target_action).value
            qtarg = self.reward_scale * batch['reward'].float() + (
                (1.0 - batch['done']) * self.gamma * target_q)

        q = self.qf(batch['obs'], batch['action']).value
        assert qtarg.shape == q.shape
        qf_loss = self.qf_criterion(q, qtarg)

        # compute policy loss
        action = self.pi(batch['obs'], deterministic=True).action
        q = self.qf(batch['obs'], action).value
        pi_loss = -q.mean()

        # log losses
        if self.t % self.log_period < self.update_period:
            logger.add_scalar('loss/qf', qf_loss, self.t, time.time())
            logger.add_scalar('loss/pi', pi_loss, self.t, time.time())

        return pi_loss, qf_loss

    def step(self):
        """Step optimization."""
        self._actor.update_sigma(self.noise_schedule.value(self.t))
        self.t += self.data_manager.step_until_update()
        if self.t % self.target_update_period == 0:
            soft_target_update(self.target_pi, self.pi,
                               self.target_smoothing_coef)
            soft_target_update(self.target_qf, self.qf,
                               self.target_smoothing_coef)

        if self.t % self.update_period == 0:
            batch = self.data_manager.sample(self.batch_size)

            pi_loss, qf_loss = self.loss(batch)

            # update
            self.opt_pi.zero_grad()
            pi_loss.backward()
            self.opt_pi.step()

            self.opt_qf.zero_grad()
            qf_loss.backward()
            self.opt_qf.step()

        return self.t

    def evaluate(self):
        """Evaluate."""
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi.eval()
        misc.set_env_to_eval_mode(eval_env)

        # Eval policy
        os.makedirs(os.path.join(self.logdir, 'eval'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'eval',
                               self.ckptr.format.format(self.t) + '.json')
        stats = rl_evaluate(eval_env, self.pi, self.eval_num_episodes,
                            outfile, self.device)
        logger.add_scalar('eval/mean_episode_reward', stats['mean_reward'],
                          self.t, time.time())
        logger.add_scalar('eval/mean_episode_length', stats['mean_length'],
                          self.t, time.time())

        # Record policy
        os.makedirs(os.path.join(self.logdir, 'video'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'video',
                               self.ckptr.format.format(self.t) + '.mp4')
        rl_record(eval_env, self.pi, self.record_num_episodes, outfile,
                  self.device)

        self.pi.train()
        misc.set_env_to_train_mode(self.env)
        self.data_manager.manual_reset()

    def save(self):
        """Save."""
        state_dict = {
            'pi': self.pi.state_dict(),
            'qf': self.qf.state_dict(),
            'target_pi': self.target_pi.state_dict(),
            'target_qf': self.target_qf.state_dict(),
            'opt_pi': self.opt_pi.state_dict(),
            'opt_qf': self.opt_qf.state_dict(),
            'env': misc.env_state_dict(self.env),
            't': self.t
        }
        buffer_dict = self.buffer.state_dict()
        state_dict['buffer_format'] = nest.get_structure(buffer_dict)
        self.ckptr.save(state_dict, self.t)

        # save buffer separately and only once (because it can be huge)
        np.savez(
            os.path.join(self.ckptr.ckptdir, 'buffer.npz'),
            **{f'{i:04d}': x for i, x in enumerate(nest.flatten(buffer_dict))})

    def load(self, t=None):
        """Load."""
        state_dict = self.ckptr.load(t)
        if state_dict is None:
            self.t = 0
            return self.t
        self.pi.load_state_dict(state_dict['pi'])
        self.qf.load_state_dict(state_dict['qf'])
        self.target_pi.load_state_dict(state_dict['target_pi'])
        self.target_qf.load_state_dict(state_dict['target_qf'])
        self.opt_pi.load_state_dict(state_dict['opt_pi'])
        self.opt_qf.load_state_dict(state_dict['opt_qf'])
        misc.env_load_state_dict(self.env, state_dict['env'])
        self.t = state_dict['t']

        buffer_format = state_dict['buffer_format']
        buffer_state = dict(
            np.load(os.path.join(self.ckptr.ckptdir, 'buffer.npz')))
        buffer_state = nest.flatten(buffer_state)
        self.buffer.load_state_dict(
            nest.pack_sequence_as(buffer_state, buffer_format))
        self.data_manager.manual_reset()

        return self.t

    def close(self):
        """Close environment."""
        try:
            self.env.close()
        except Exception:
            pass
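# --- Illustrative example (not part of the library) -------------------------
# DDPG.step above calls soft_target_update(target, source, coef), whose
# implementation is not shown in this file. A common Polyak-averaging sketch
# of what such an update computes is below, assuming
# target <- (1 - coef) * target + coef * source. `_example_polyak_update` is
# a hypothetical helper for illustration only.
import torch


def _example_polyak_update(target_net, source_net, coef):
    """Blend source parameters into target parameters in place."""
    with torch.no_grad():
        for p_targ, p_src in zip(target_net.parameters(),
                                 source_net.parameters()):
            p_targ.mul_(1.0 - coef).add_(coef * p_src)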
class DQN(Algorithm):
    """DQN algorithm."""

    def __init__(self,
                 logdir,
                 env_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.RMSprop,
                 buffer_size=100000,
                 frame_stack=1,
                 learning_starts=10000,
                 update_period=1,
                 gamma=0.99,
                 huber_loss=True,
                 exploration_timesteps=1000000,
                 final_eps=0.1,
                 eval_eps=0.05,
                 target_update_period=10000,
                 batch_size=32,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=10):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.frame_stack = frame_stack
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.eval_eps = eval_eps
        self.target_update_period = target_update_period - (
            target_update_period % self.update_period)
        self.log_period = log_period

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        stacked_env = VecFrameStack(env_fn(nenv=nenv), self.frame_stack)
        self.qf = qf_fn(stacked_env).to(self.device)
        self.qf_targ = qf_fn(stacked_env).to(self.device)
        self.opt = optimizer(self.qf.parameters())
        if huber_loss:
            self.criterion = torch.nn.SmoothL1Loss(reduction='none')
        else:
            self.criterion = torch.nn.MSELoss(reduction='none')
        self.eps_schedule = LinearSchedule(exploration_timesteps, final_eps,
                                           1.0)
        self._actor = EpsilonGreedyActor(self.qf, self.eps_schedule,
                                         self.env.action_space, self.nenv)
        self.buffer = BatchedReplayBuffer(*[
            ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)
        ])
        self.data_manager = ReplayBufferDataManager(self.buffer,
                                                    self.env,
                                                    self._actor,
                                                    self.device,
                                                    self.learning_starts,
                                                    self.update_period)
        self.t = 0

    def _compute_target(self, rew, next_ob, done):
        qtarg = self.qf_targ(next_ob).max_q
        return rew + (1.0 - done) * self.gamma * qtarg

    def _get_batch(self):
        return self.data_manager.sample(self.batch_size)

    def loss(self, batch):
        """Compute loss."""
        q = self.qf(batch['obs'], batch['action']).value

        with torch.no_grad():
            target = self._compute_target(batch['reward'], batch['next_obs'],
                                          batch['done'])

        assert target.shape == q.shape
        loss = self.criterion(target, q).mean()
        if self.t % self.log_period < self.update_period:
            logger.add_scalar('alg/maxq', torch.max(q).detach().cpu().numpy(),
                              self.t, time.time())
            logger.add_scalar('alg/loss', loss.detach().cpu().numpy(), self.t,
                              time.time())
            logger.add_scalar('alg/epsilon',
                              self.eps_schedule.value(self._actor.t), self.t,
                              time.time())
        return loss

    def step(self):
        """Step."""
        self.t += self.data_manager.step_until_update()
        if self.t % self.target_update_period == 0:
            self.qf_targ.load_state_dict(self.qf.state_dict())

        self.opt.zero_grad()
        loss = self.loss(self._get_batch())
        loss.backward()
        self.opt.step()
        return self.t

    def evaluate(self):
        """Evaluate."""
        eval_env = VecEpsilonGreedy(VecFrameStack(self.env, self.frame_stack),
                                    self.eval_eps)
        self.qf.eval()
        misc.set_env_to_eval_mode(eval_env)

        # Eval policy
        os.makedirs(os.path.join(self.logdir, 'eval'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'eval',
                               self.ckptr.format.format(self.t) + '.json')
        stats = rl_evaluate(eval_env, self.qf, self.eval_num_episodes,
                            outfile, self.device)
        logger.add_scalar('eval/mean_episode_reward', stats['mean_reward'],
                          self.t, time.time())
        logger.add_scalar('eval/mean_episode_length', stats['mean_length'],
                          self.t, time.time())

        # Record policy
        os.makedirs(os.path.join(self.logdir, 'video'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'video',
                               self.ckptr.format.format(self.t) + '.mp4')
        rl_record(eval_env, self.qf, self.record_num_episodes, outfile,
                  self.device)

        self.qf.train()
        misc.set_env_to_train_mode(self.env)
        self.data_manager.manual_reset()

    def save(self):
        """Save."""
        state_dict = {
            'qf': self.qf.state_dict(),
            'qf_targ': self.qf_targ.state_dict(),
            'opt': self.opt.state_dict(),
            '_actor': self._actor.state_dict(),
            'env': misc.env_state_dict(self.env),
            't': self.t
        }
        buffer_dict = self.buffer.state_dict()
        state_dict['buffer_format'] = nest.get_structure(buffer_dict)
        self.ckptr.save(state_dict, self.t)

        # save buffer separately and only once (because it can be huge)
        np.savez(
            os.path.join(self.ckptr.ckptdir, 'buffer.npz'),
            **{f'{i:04d}': x for i, x in enumerate(nest.flatten(buffer_dict))})

    def load(self, t=None):
        """Load."""
        state_dict = self.ckptr.load(t)
        if state_dict is None:
            self.t = 0
            return self.t
        self.qf.load_state_dict(state_dict['qf'])
        self.qf_targ.load_state_dict(state_dict['qf_targ'])
        self.opt.load_state_dict(state_dict['opt'])
        self._actor.load_state_dict(state_dict['_actor'])
        misc.env_load_state_dict(self.env, state_dict['env'])
        self.t = state_dict['t']

        buffer_format = state_dict['buffer_format']
        buffer_state = dict(
            np.load(os.path.join(self.ckptr.ckptdir, 'buffer.npz')))
        buffer_state = nest.flatten(buffer_state)
        self.buffer.load_state_dict(
            nest.pack_sequence_as(buffer_state, buffer_format))
        self.data_manager.manual_reset()

        return self.t

    def close(self):
        """Close environment."""
        try:
            self.env.close()
        except Exception:
            pass
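# --- Illustrative example (not part of the library) -------------------------
# The EpsilonGreedyActor and VecEpsilonGreedy wrappers used above are defined
# elsewhere; a minimal sketch of the epsilon-greedy rule they are built
# around is below (hypothetical helper, assuming q_values has shape
# [nenv, n_actions]).
import torch


def _example_epsilon_greedy(q_values, epsilon):
    """Pick argmax actions, replacing each with a random action w.p. epsilon."""
    greedy = q_values.argmax(dim=-1)
    random = torch.randint(q_values.shape[-1], greedy.shape)
    explore = torch.rand(greedy.shape, dtype=torch.float32) < epsilon
    return torch.where(explore, random, greedy)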
class TD3(Algorithm):
    """TD3 algorithm."""

    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=int(1e6),
                 frame_stack=1,
                 learning_starts=10000,
                 update_period=1,
                 batch_size=256,
                 lr=3e-4,
                 policy_update_period=2,
                 target_smoothing_coef=0.005,
                 reward_scale=1,
                 gamma=0.99,
                 exploration_noise=0.1,
                 policy_noise=0.2,
                 policy_noise_clip=0.5,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        if policy_update_period < self.update_period:
            self.policy_update_period = self.update_period
        else:
            self.policy_update_period = policy_update_period - (
                policy_update_period % self.update_period)
        self.reward_scale = reward_scale
        self.target_smoothing_coef = target_smoothing_coef
        self.exploration_noise = exploration_noise
        self.policy_noise = policy_noise
        self.policy_noise_clip = policy_noise_clip
        self.log_period = log_period

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

        self.policy_fn = policy_fn
        self.qf_fn = qf_fn
        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf1 = qf_fn(eval_env)
        self.qf2 = qf_fn(eval_env)
        self.target_pi = policy_fn(eval_env)
        self.target_qf1 = qf_fn(eval_env)
        self.target_qf2 = qf_fn(eval_env)

        self.pi.to(self.device)
        self.qf1.to(self.device)
        self.qf2.to(self.device)
        self.target_pi.to(self.device)
        self.target_qf1.to(self.device)
        self.target_qf2.to(self.device)

        self.optimizer = optimizer
        self.lr = lr
        self.opt_pi = optimizer(self.pi.parameters(), lr=lr)
        self.opt_qf = optimizer(list(self.qf1.parameters())
                                + list(self.qf2.parameters()), lr=lr)

        self.target_pi.load_state_dict(self.pi.state_dict())
        self.target_qf1.load_state_dict(self.qf1.state_dict())
        self.target_qf2.load_state_dict(self.qf2.state_dict())

        self._actor = TD3Actor(self.pi, self.env.action_space,
                               exploration_noise)
        self.buffer = BatchedReplayBuffer(*[
            ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)
        ])
        self.data_manager = ReplayBufferDataManager(self.buffer,
                                                    self.env,
                                                    self._actor,
                                                    self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.qf_criterion = torch.nn.MSELoss()
        if self.env.action_space.__class__.__name__ == 'Discrete':
            raise ValueError("Action space must be continuous!")

        self.low = torch.from_numpy(self.env.action_space.low).to(self.device)
        self.high = torch.from_numpy(self.env.action_space.high).to(
            self.device)

        self.t = 0

    def loss(self, batch):
        """Loss function."""
        # compute QFunction loss.
        with torch.no_grad():
            target_action = self.target_pi(batch['next_obs']).action
            noise = (torch.randn_like(target_action) * self.policy_noise
                     ).clamp(-self.policy_noise_clip, self.policy_noise_clip)
            target_action = (target_action + noise).clamp(-1., 1.)
            target_q1 = self.target_qf1(batch['next_obs'],
                                        target_action).value
            target_q2 = self.target_qf2(batch['next_obs'],
                                        target_action).value
            target_q = torch.min(target_q1, target_q2)
            qtarg = self.reward_scale * batch['reward'].float() + (
                (1.0 - batch['done']) * self.gamma * target_q)

        q1 = self.qf1(batch['obs'], batch['action']).value
        q2 = self.qf2(batch['obs'], batch['action']).value
        assert qtarg.shape == q1.shape
        assert qtarg.shape == q2.shape
        qf_loss = self.qf_criterion(q1, qtarg) + self.qf_criterion(q2, qtarg)

        # compute policy loss
        if self.t % self.policy_update_period == 0:
            action = self.pi(batch['obs'], deterministic=True).action
            q = self.qf1(batch['obs'], action).value
            pi_loss = -q.mean()
        else:
            pi_loss = torch.zeros_like(qf_loss)

        # log losses
        if self.t % self.log_period < self.update_period:
            logger.add_scalar('loss/qf', qf_loss, self.t, time.time())
            if self.t % self.policy_update_period == 0:
                logger.add_scalar('loss/pi', pi_loss, self.t, time.time())

        return pi_loss, qf_loss

    def step(self):
        """Step optimization."""
        self.t += self.data_manager.step_until_update()
        batch = self.data_manager.sample(self.batch_size)

        pi_loss, qf_loss = self.loss(batch)

        # update
        self.opt_qf.zero_grad()
        qf_loss.backward()
        self.opt_qf.step()

        if self.t % self.policy_update_period == 0:
            self.opt_pi.zero_grad()
            pi_loss.backward()
            self.opt_pi.step()

            # update target networks
            soft_target_update(self.target_pi, self.pi,
                               self.target_smoothing_coef)
            soft_target_update(self.target_qf1, self.qf1,
                               self.target_smoothing_coef)
            soft_target_update(self.target_qf2, self.qf2,
                               self.target_smoothing_coef)
        return self.t

    def evaluate(self):
        """Evaluate."""
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi.eval()
        misc.set_env_to_eval_mode(eval_env)

        # Eval policy
        os.makedirs(os.path.join(self.logdir, 'eval'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'eval',
                               self.ckptr.format.format(self.t) + '.json')
        stats = rl_evaluate(eval_env, self.pi, self.eval_num_episodes,
                            outfile, self.device)
        logger.add_scalar('eval/mean_episode_reward', stats['mean_reward'],
                          self.t, time.time())
        logger.add_scalar('eval/mean_episode_length', stats['mean_length'],
                          self.t, time.time())

        # Record policy
        os.makedirs(os.path.join(self.logdir, 'video'), exist_ok=True)
        outfile = os.path.join(self.logdir, 'video',
                               self.ckptr.format.format(self.t) + '.mp4')
        rl_record(eval_env, self.pi, self.record_num_episodes, outfile,
                  self.device)

        self.pi.train()
        misc.set_env_to_train_mode(self.env)
        self.data_manager.manual_reset()

    def save(self):
        """Save."""
        state_dict = {
            'pi': self.pi.state_dict(),
            'qf1': self.qf1.state_dict(),
            'qf2': self.qf2.state_dict(),
            'target_pi': self.target_pi.state_dict(),
            'target_qf1': self.target_qf1.state_dict(),
            'target_qf2': self.target_qf2.state_dict(),
            'opt_pi': self.opt_pi.state_dict(),
            'opt_qf': self.opt_qf.state_dict(),
            'env': misc.env_state_dict(self.env),
            't': self.t
        }
        buffer_dict = self.buffer.state_dict()
        state_dict['buffer_format'] = nest.get_structure(buffer_dict)
        self.ckptr.save(state_dict, self.t)

        # save buffer separately and only once (because it can be huge)
        np.savez(
            os.path.join(self.ckptr.ckptdir, 'buffer.npz'),
            **{f'{i:04d}': x for i, x in enumerate(nest.flatten(buffer_dict))})

    def load(self, t=None):
        """Load."""
        state_dict = self.ckptr.load(t)
        if state_dict is None:
            self.t = 0
            return self.t
        self.pi.load_state_dict(state_dict['pi'])
        self.qf1.load_state_dict(state_dict['qf1'])
        self.qf2.load_state_dict(state_dict['qf2'])
        self.target_pi.load_state_dict(state_dict['target_pi'])
        self.target_qf1.load_state_dict(state_dict['target_qf1'])
        self.target_qf2.load_state_dict(state_dict['target_qf2'])
        self.opt_pi.load_state_dict(state_dict['opt_pi'])
        self.opt_qf.load_state_dict(state_dict['opt_qf'])
        misc.env_load_state_dict(self.env, state_dict['env'])
        self.t = state_dict['t']

        buffer_format = state_dict['buffer_format']
        buffer_state = dict(
            np.load(os.path.join(self.ckptr.ckptdir, 'buffer.npz')))
        buffer_state = nest.flatten(buffer_state)
        self.buffer.load_state_dict(
            nest.pack_sequence_as(buffer_state, buffer_format))
        self.data_manager.manual_reset()

        return self.t

    def close(self):
        """Close environment."""
        try:
            self.env.close()
        except Exception:
            pass
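# --- Illustrative example (not part of the library) -------------------------
# A self-contained sketch of the target-policy smoothing used in TD3.loss
# above: clipped Gaussian noise is added to the target action before it is
# clamped back into the action range. `_example_smooth_target_action` is a
# hypothetical helper; the [-1, 1] bounds mirror the clamp in the loss above.
import torch


def _example_smooth_target_action(target_action, policy_noise,
                                  policy_noise_clip):
    """Add clipped Gaussian noise to a target action and re-clamp it."""
    noise = (torch.randn_like(target_action) * policy_noise).clamp(
        -policy_noise_clip, policy_noise_clip)
    return (target_action + noise).clamp(-1.0, 1.0)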