def _initialize(self): """Initialize non-common things.""" if not self.args.test: # load demo replay memory demos = self._load_demos() if self.use_n_step: demos, demos_n_step = common_utils.get_n_step_info_from_demo( demos, self.hyper_params["N_STEP"], self.hyper_params["GAMMA"]) self.memory_n = ReplayBuffer( buffer_size=self.hyper_params["BUFFER_SIZE"], n_step=self.hyper_params["N_STEP"], gamma=self.hyper_params["GAMMA"], demo=demos_n_step, ) # replay memory self.beta = self.hyper_params["PER_BETA"] self.memory = PrioritizedReplayBuffer( self.hyper_params["BUFFER_SIZE"], self.hyper_params["BATCH_SIZE"], demo=demos, alpha=self.hyper_params["PER_ALPHA"], epsilon_d=self.hyper_params["PER_EPS_DEMO"], )
def _initialize(self): """Initialize non-common things.""" self.use_n_step = self.hyper_params["N_STEP"] > 1 if not self.args.test: # load demo replay memory with open(self.args.demo_path, "rb") as f: demos = pickle.load(f) if self.use_n_step: demos, demos_n_step = common_utils.get_n_step_info_from_demo( demos, self.hyper_params["N_STEP"], self.hyper_params["GAMMA"] ) # replay memory for multi-steps self.memory_n = ReplayBuffer( buffer_size=self.hyper_params["BUFFER_SIZE"], n_step=self.hyper_params["N_STEP"], gamma=self.hyper_params["GAMMA"], demo=demos_n_step, ) # replay memory for a single step self.beta = self.hyper_params["PER_BETA"] self.memory = PrioritizedReplayBuffer( self.hyper_params["BUFFER_SIZE"], self.hyper_params["BATCH_SIZE"], demo=demos, alpha=self.hyper_params["PER_ALPHA"], epsilon_d=self.hyper_params["PER_EPS_DEMO"], )
def _initialize(self): """Initialize non-common things.""" if not self.args.test: # replay memory self.beta = self.hyper_params["PER_BETA"] self.memory = PrioritizedReplayBuffer( self.hyper_params["BUFFER_SIZE"], self.hyper_params["BATCH_SIZE"], alpha=self.hyper_params["PER_ALPHA"], )
def _initialize(self): """Initialize non-common things.""" if not self.args.test: # replay memory for a single step self.beta = self.hyper_params["PER_BETA"] self.memory = PrioritizedReplayBuffer( self.hyper_params["BUFFER_SIZE"], self.hyper_params["BATCH_SIZE"], alpha=self.hyper_params["PER_ALPHA"], ) # replay memory for multi-steps if self.use_n_step: self.memory_n = NStepTransitionBuffer( self.hyper_params["BUFFER_SIZE"], n_step=self.hyper_params["N_STEP"], gamma=self.hyper_params["GAMMA"], )
class DQfDAgent(DQNAgent):
    """DQN agent that additionally learns from expert demonstrations.

    Attribute:
        memory (PrioritizedReplayBuffer): replay memory seeded with the demos
        memory_n (ReplayBuffer): n-step replay memory (only if use_n_step)
        beta (float): beta parameter for prioritized replay buffer

    """

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things.

        In training mode the demonstrations are loaded and used to seed the
        prioritized replay memory and, when ``self.use_n_step`` is set, the
        n-step replay memory. Nothing is created in test mode.
        """
        if not self.args.test:
            # load demo replay memory
            demos = self._load_demos()

            if self.use_n_step:
                demos, demos_n_step = common_utils.get_n_step_info_from_demo(
                    demos, self.hyper_params["N_STEP"], self.hyper_params["GAMMA"]
                )

                # replay memory for multi-steps
                self.memory_n = ReplayBuffer(
                    buffer_size=self.hyper_params["BUFFER_SIZE"],
                    n_step=self.hyper_params["N_STEP"],
                    gamma=self.hyper_params["GAMMA"],
                    demo=demos_n_step,
                )

            # replay memory for a single step
            self.beta = self.hyper_params["PER_BETA"]
            self.memory = PrioritizedReplayBuffer(
                self.hyper_params["BUFFER_SIZE"],
                self.hyper_params["BATCH_SIZE"],
                demo=demos,
                alpha=self.hyper_params["PER_ALPHA"],
                epsilon_d=self.hyper_params["PER_EPS_DEMO"],
            )

    def _load_demos(self) -> list:
        """Load expert's demonstrations from ``self.args.demo_path``.

        Returns:
            The object unpickled from the demo file.
        """
        # NOTE(review): pickle can execute arbitrary code while loading, so
        # demo_path must only ever point at a trusted file.
        with open(self.args.demo_path, "rb") as f:
            demos = pickle.load(f)

        return demos

    def update_model(self) -> Tuple[float, float, float, float, int]:
        """Run one optimisation step on a sampled minibatch.

        Returns:
            ``(total loss, dq loss, supervised loss, mean q-value, n_demo)``
            where ``n_demo`` is the number of demo transitions in the batch.
        """
        # Pass the current beta: it is annealed at the end of this method and
        # would otherwise never reach the sampler.
        experiences_1 = self.memory.sample(self.beta)
        weights, indices, eps_d = experiences_1[-3:]
        actions = experiences_1[1]

        # 1 step loss
        gamma = self.hyper_params["GAMMA"]
        dq_loss_element_wise, q_values = self._get_dqn_loss(experiences_1, gamma)
        dq_loss = torch.mean(dq_loss_element_wise * weights)

        # n step loss
        if self.use_n_step:
            # the n-step buffer is sampled at the same indices as the 1-step one
            experiences_n = self.memory_n.sample(indices)
            gamma = self.hyper_params["GAMMA"] ** self.hyper_params["N_STEP"]
            dq_loss_n_element_wise, q_values_n = self._get_dqn_loss(
                experiences_n, gamma
            )

            # to update loss and priorities
            q_values = 0.5 * (q_values + q_values_n)
            dq_loss_element_wise += (
                dq_loss_n_element_wise * self.hyper_params["LAMBDA1"]
            )
            dq_loss = torch.mean(dq_loss_element_wise * weights)

        # supervised loss using demo for only demo transitions
        # (a transition counts as a demo when its eps_d entry is non-zero)
        demo_idxs = np.where(eps_d != 0.0)
        n_demo = demo_idxs[0].size
        if n_demo != 0:  # if 1 or more demos are sampled
            # get margin for each demo transition
            action_idxs = actions[demo_idxs].long()
            margin = torch.ones(q_values.size()) * self.hyper_params["MARGIN"]
            margin[demo_idxs, action_idxs] = 0.0  # demo actions have 0 margins
            margin = margin.to(device)

            # calculate supervised loss
            demo_q_values = q_values[demo_idxs, action_idxs].squeeze()
            supervised_loss = torch.max(q_values + margin, dim=-1)[0]
            supervised_loss = supervised_loss[demo_idxs] - demo_q_values
            supervised_loss = (
                torch.mean(supervised_loss) * self.hyper_params["LAMBDA2"]
            )
        else:  # no demo sampled
            supervised_loss = torch.zeros(1, device=device)

        # q_value regularization
        q_regular = torch.norm(q_values, 2).mean() * self.hyper_params["W_Q_REG"]

        # total loss
        loss = dq_loss + supervised_loss + q_regular

        # train dqn
        self.dqn_optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.dqn.parameters(), self.hyper_params["GRADIENT_CLIP"])
        self.dqn_optimizer.step()

        # update target networks
        tau = self.hyper_params["TAU"]
        common_utils.soft_update(self.dqn, self.dqn_target, tau)

        # update priorities in PER
        loss_for_prior = dq_loss_element_wise.detach().cpu().numpy().squeeze()
        new_priorities = loss_for_prior + self.hyper_params["PER_EPS"]
        new_priorities += eps_d
        self.memory.update_priorities(indices, new_priorities)

        # increase beta
        fraction = min(float(self.i_episode) / self.args.episode_num, 1.0)
        self.beta = self.beta + fraction * (1.0 - self.beta)

        if self.hyper_params["USE_NOISY_NET"]:
            self.dqn.reset_noise()
            self.dqn_target.reset_noise()

        return (
            loss.item(),
            dq_loss.item(),
            supervised_loss.item(),
            q_values.mean().item(),
            n_demo,
        )

    def write_log(self, i: int, avg_loss: np.ndarray, score: float, avg_time_cost: float):
        """Write log about loss and score.

        Args:
            i (int): episode number to report
            avg_loss (np.ndarray): averaged values in the order returned by
                ``update_model``
            score (float): episode score
            avg_time_cost (float): seconds spent per step
        """
        print(
            "[INFO] episode %d, episode step: %d, total step: %d, total score: %f\n"
            "epsilon: %f, total loss: %f, dq loss: %f, supervised loss: %f\n"
            "avg q values: %f, demo num in minibatch: %d (spent %.6f sec/step)\n"
            % (
                i,
                self.episode_step,
                self.total_step,
                score,
                self.epsilon,
                avg_loss[0],
                avg_loss[1],
                avg_loss[2],
                avg_loss[3],
                avg_loss[4],
                avg_time_cost,
            )
        )

        if self.args.log:
            wandb.log(
                {
                    "score": score,
                    "epsilon": self.epsilon,
                    "total loss": avg_loss[0],
                    "dq loss": avg_loss[1],
                    "supervised loss": avg_loss[2],
                    "avg q values": avg_loss[3],
                    "demo num in minibatch": avg_loss[4],
                    "time per each step": avg_time_cost,
                }
            )

    def pretrain(self):
        """Run ``PRETRAIN_STEP`` updates before any environment interaction."""
        pretrain_loss = list()
        print("[INFO] Pre-Train %d step." % self.hyper_params["PRETRAIN_STEP"])
        for i_step in range(1, self.hyper_params["PRETRAIN_STEP"] + 1):
            t_begin = time.time()
            loss = self.update_model()
            t_end = time.time()
            pretrain_loss.append(loss)  # for logging

            # logging: on the first step and then every 100 steps
            if i_step == 1 or i_step % 100 == 0:
                avg_loss = np.vstack(pretrain_loss).mean(axis=0)
                pretrain_loss.clear()
                self.write_log(0, avg_loss, 0.0, t_end - t_begin)
        print("[INFO] Pre-Train Complete!\n")
class DQNAgent(Agent):
    """DQN interacting with environment.

    Attribute:
        memory (PrioritizedReplayBuffer): replay memory
        memory_n (NStepTransitionBuffer): n-step replay memory (if use_n_step)
        dqn (nn.Module): actor model to select actions
        dqn_target (nn.Module): target actor model to select actions
        dqn_optimizer (Optimizer): optimizer for training actor
        hyper_params (dict): hyper-parameters
        beta (float): beta parameter for prioritized replay buffer
        curr_state (np.ndarray): temporary storage of the current state
        total_step (int): total step number
        episode_step (int): step number of the current episode
        epsilon (float): parameter for epsilon greedy policy
        i_episode (int): current episode number
        use_n_step (bool): whether or not to use n-step returns

    """

    def __init__(
        self,
        env: gym.Env,
        args: argparse.Namespace,
        hyper_params: dict,
        models: tuple,
        optim: torch.optim.Adam,
    ):
        """Initialization.

        Args:
            env (gym.Env): openAI Gym environment
            args (argparse.Namespace): arguments including hyperparameters and training settings
            hyper_params (dict): hyper-parameters
            models (tuple): models including main network and target
            optim (torch.optim.Adam): optimizers for dqn

        """
        Agent.__init__(self, env, args)

        self.use_n_step = hyper_params["N_STEP"] > 1
        self.epsilon = hyper_params["MAX_EPSILON"]
        self.dqn, self.dqn_target = models
        self.hyper_params = hyper_params
        self.curr_state = np.zeros(1)
        self.dqn_optimizer = optim
        self.episode_step = 0
        self.total_step = 0
        self.i_episode = 0

        # load the optimizer and model parameters
        if args.load_from is not None and os.path.exists(args.load_from):
            self.load_params(args.load_from)

        self._initialize()

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things (replay memories; training mode only)."""
        if not self.args.test:
            # replay memory for a single step
            self.beta = self.hyper_params["PER_BETA"]
            self.memory = PrioritizedReplayBuffer(
                self.hyper_params["BUFFER_SIZE"],
                self.hyper_params["BATCH_SIZE"],
                alpha=self.hyper_params["PER_ALPHA"],
            )

            # replay memory for multi-steps
            if self.use_n_step:
                self.memory_n = NStepTransitionBuffer(
                    self.hyper_params["BUFFER_SIZE"],
                    n_step=self.hyper_params["N_STEP"],
                    gamma=self.hyper_params["GAMMA"],
                )

    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input space.

        During training a random action is taken with probability
        ``self.epsilon``; otherwise the argmax of the network output is used.
        """
        self.curr_state = state

        # epsilon greedy policy
        # pylint: disable=comparison-with-callable
        if not self.args.test and self.epsilon > np.random.random():
            selected_action = self.env.action_space.sample()
        else:
            state = self._preprocess_state(state)
            selected_action = self.dqn(state).argmax()
            selected_action = selected_action.detach().cpu().numpy()
        return selected_action

    # pylint: disable=no-self-use
    def _preprocess_state(self, state: np.ndarray) -> torch.Tensor:
        """Preprocess state so that actor selects an action."""
        state = torch.FloatTensor(state).to(device)
        return state

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]:
        """Take an action and return the response of the env.

        In training mode the transition is also pushed to the replay memory.
        """
        next_state, reward, done, info = self.env.step(action)

        if not self.args.test:
            # if the last state is not a terminal state, store done as false.
            # train() increments episode_step only after this method returns,
            # so the step just taken is number ``episode_step + 1``.
            hit_step_limit = self.episode_step + 1 == self.args.max_episode_steps
            done_bool = False if hit_step_limit else done

            transition = (self.curr_state, action, reward, next_state, done_bool)
            self._add_transition_to_memory(transition)

        return next_state, reward, done, info

    def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):
        """Add 1 step and n step transitions to memory."""
        # add n-step transition; its return value replaces the transition
        # that is forwarded to the 1-step memory
        if self.use_n_step:
            transition = self.memory_n.add(transition)

        # add a single step transition
        # if transition is not an empty tuple
        if transition:
            self.memory.add(*transition)

    def _get_dqn_loss(
        self, experiences: Tuple[torch.Tensor, ...], gamma: float
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return element-wise dqn loss and Q-values.

        The loss function is chosen by ``hyper_params["USE_DIST_Q"]``:
        ``"IQN"``, ``"C51"``, or plain DQN for any other value.
        """
        if self.hyper_params["USE_DIST_Q"] == "IQN":
            return dqn_utils.calculate_iqn_loss(
                model=self.dqn,
                target_model=self.dqn_target,
                experiences=experiences,
                gamma=gamma,
                batch_size=self.hyper_params["BATCH_SIZE"],
                n_tau_samples=self.hyper_params["N_TAU_SAMPLES"],
                n_tau_prime_samples=self.hyper_params["N_TAU_PRIME_SAMPLES"],
                kappa=self.hyper_params["KAPPA"],
            )
        elif self.hyper_params["USE_DIST_Q"] == "C51":
            return dqn_utils.calculate_c51_loss(
                model=self.dqn,
                target_model=self.dqn_target,
                experiences=experiences,
                gamma=gamma,
                batch_size=self.hyper_params["BATCH_SIZE"],
                v_min=self.hyper_params["V_MIN"],
                v_max=self.hyper_params["V_MAX"],
                atom_size=self.hyper_params["ATOMS"],
            )
        else:
            return dqn_utils.calculate_dqn_loss(
                model=self.dqn,
                target_model=self.dqn_target,
                experiences=experiences,
                gamma=gamma,
            )

    def update_model(self) -> Tuple[float, float]:
        """Run one optimisation step on a sampled minibatch.

        Returns:
            ``(total loss, mean q-value)`` as Python floats.
        """
        # 1 step loss
        experiences_1 = self.memory.sample(self.beta)
        weights, indices = experiences_1[-2:]
        gamma = self.hyper_params["GAMMA"]
        dq_loss_element_wise, q_values = self._get_dqn_loss(experiences_1, gamma)
        dq_loss = torch.mean(dq_loss_element_wise * weights)

        # n step loss
        if self.use_n_step:
            # the n-step buffer is sampled at the same indices as the 1-step one
            experiences_n = self.memory_n.sample(indices)
            gamma = self.hyper_params["GAMMA"] ** self.hyper_params["N_STEP"]
            dq_loss_n_element_wise, q_values_n = self._get_dqn_loss(
                experiences_n, gamma
            )

            # to update loss and priorities
            q_values = 0.5 * (q_values + q_values_n)
            dq_loss_element_wise += (
                dq_loss_n_element_wise * self.hyper_params["W_N_STEP"]
            )
            dq_loss = torch.mean(dq_loss_element_wise * weights)

        # q_value regularization
        q_regular = torch.norm(q_values, 2).mean() * self.hyper_params["W_Q_REG"]

        # total loss
        loss = dq_loss + q_regular

        self.dqn_optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.dqn.parameters(), self.hyper_params["GRADIENT_CLIP"])
        self.dqn_optimizer.step()

        # update target networks
        tau = self.hyper_params["TAU"]
        common_utils.soft_update(self.dqn, self.dqn_target, tau)

        # update priorities in PER
        loss_for_prior = dq_loss_element_wise.detach().cpu().numpy()
        new_priorities = loss_for_prior + self.hyper_params["PER_EPS"]
        self.memory.update_priorities(indices, new_priorities)

        # increase beta
        fraction = min(float(self.i_episode) / self.args.episode_num, 1.0)
        self.beta = self.beta + fraction * (1.0 - self.beta)

        if self.hyper_params["USE_NOISY_NET"]:
            self.dqn.reset_noise()
            self.dqn_target.reset_noise()

        return loss.item(), q_values.mean().item()

    def load_params(self, path: str):
        """Load model and optimizer parameters from ``path``.

        Prints an error and returns without loading if ``path`` is missing.
        """
        if not os.path.exists(path):
            print("[ERROR] the input path does not exist. ->", path)
            return

        params = torch.load(path)
        self.dqn.load_state_dict(params["dqn_state_dict"])
        self.dqn_target.load_state_dict(params["dqn_target_state_dict"])
        self.dqn_optimizer.load_state_dict(params["dqn_optim_state_dict"])
        print("[INFO] loaded the model and optimizer from", path)

    def save_params(self, n_episode: int):
        """Save model and optimizer parameters."""
        params = {
            "dqn_state_dict": self.dqn.state_dict(),
            "dqn_target_state_dict": self.dqn_target.state_dict(),
            "dqn_optim_state_dict": self.dqn_optimizer.state_dict(),
        }

        Agent.save_params(self, params, n_episode)

    def write_log(self, i: int, loss: np.ndarray, score: float, avg_time_cost: float):
        """Write log about loss and score.

        Args:
            i (int): episode number to report
            loss (np.ndarray): averaged ``(loss, mean q-value)``
            score (float): episode score
            avg_time_cost (float): seconds spent per step
        """
        print(
            "[INFO] episode %d, episode step: %d, total step: %d, total score: %f\n"
            "epsilon: %f, loss: %f, avg q-value: %f (spent %.6f sec/step)\n"
            % (
                i,
                self.episode_step,
                self.total_step,
                score,
                self.epsilon,
                loss[0],
                loss[1],
                avg_time_cost,
            )
        )

        if self.args.log:
            wandb.log(
                {
                    "score": score,
                    "epsilon": self.epsilon,
                    "dqn loss": loss[0],
                    "avg q values": loss[1],
                    "time per each step": avg_time_cost,
                }
            )

    # pylint: disable=no-self-use, unnecessary-pass
    def pretrain(self):
        """Pretraining steps (no-op here; hook for subclasses)."""
        pass

    def train(self):
        """Train the agent for ``args.episode_num`` episodes."""
        # logger
        if self.args.log:
            wandb.init(project=self.args.wandb_project)
            wandb.config.update(self.hyper_params)
            # wandb.watch([self.dqn], log="parameters")

        # pre-training if needed
        self.pretrain()

        max_epsilon, min_epsilon, epsilon_decay = (
            self.hyper_params["MAX_EPSILON"],
            self.hyper_params["MIN_EPSILON"],
            self.hyper_params["EPSILON_DECAY"],
        )

        for self.i_episode in range(1, self.args.episode_num + 1):
            state = self.env.reset()
            self.episode_step = 0
            losses = list()
            done = False
            score = 0

            t_begin = time.time()

            while not done:
                if self.args.render and self.i_episode >= self.args.render_after:
                    self.env.render()

                action = self.select_action(state)
                next_state, reward, done, _ = self.step(action)
                self.total_step += 1
                self.episode_step += 1

                if len(self.memory) >= self.hyper_params["UPDATE_STARTS_FROM"]:
                    if self.total_step % self.hyper_params["TRAIN_FREQ"] == 0:
                        for _ in range(self.hyper_params["MULTIPLE_LEARN"]):
                            loss = self.update_model()
                            losses.append(loss)  # for logging

                    # decrease epsilon
                    self.epsilon = max(
                        self.epsilon - (max_epsilon - min_epsilon) * epsilon_decay,
                        min_epsilon,
                    )

                state = next_state
                score += reward

            t_end = time.time()
            avg_time_cost = (t_end - t_begin) / self.episode_step

            # nothing to report for episodes that ran no update
            if losses:
                avg_loss = np.vstack(losses).mean(axis=0)
                self.write_log(self.i_episode, avg_loss, score, avg_time_cost)

            if self.i_episode % self.args.save_period == 0:
                self.save_params(self.i_episode)
                self.interim_test()

        # termination
        self.env.close()
        self.save_params(self.i_episode)
        self.interim_test()
class Agent(AbstractAgent):
    """DQN interacting with environment.

    Attribute:
        memory (PrioritizedReplayBuffer): replay memory
        dqn (nn.Module): actor model to select actions
        dqn_target (nn.Module): target actor model to select actions
        dqn_optimizer (Optimizer): optimizer for training actor
        hyper_params (dict): hyper-parameters
        beta (float): beta parameter for prioritized replay buffer
        curr_state (np.ndarray): temporary storage of the current state
        total_steps (np.ndarray): total step numbers
        episode_steps (np.ndarray): step number of the current episode
        epsilon (float): parameter for epsilon greedy policy
        i_episode (int): current episode number

    """

    def __init__(
        self,
        env_single: gym.Env,
        env_multi: SubprocVecEnv,
        args: argparse.Namespace,
        hyper_params: dict,
        models: tuple,
        optim: torch.optim.Adam,
    ):
        """Initialization.

        Args:
            env_single (gym.Env): openAI Gym environment
            env_multi (SubprocVecEnv): Gym env with multiprocessing for training
            args (argparse.Namespace): arguments including hyperparameters and training settings
            hyper_params (dict): hyper-parameters
            models (tuple): models including main network and target
            optim (torch.optim.Adam): optimizers for dqn

        """
        AbstractAgent.__init__(self, env_single, args)

        # training runs on the multi-process env; test mode keeps env_single
        if not self.args.test:
            self.env = env_multi
        self.dqn, self.dqn_target = models
        self.dqn_optimizer = optim
        self.hyper_params = hyper_params
        self.curr_state = np.zeros((1,))
        # builtin ``int`` instead of the ``np.int`` alias, which was
        # deprecated in NumPy 1.20 and removed in 1.24
        self.total_steps = np.zeros(hyper_params["N_WORKERS"], dtype=int)
        self.episode_steps = np.zeros(hyper_params["N_WORKERS"], dtype=int)
        self.epsilon = self.hyper_params["MAX_EPSILON"]
        self.i_episode = 0

        # load the optimizer and model parameters
        if args.load_from is not None and os.path.exists(args.load_from):
            self.load_params(args.load_from)

        self._initialize()

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things (replay memory; training mode only)."""
        if not self.args.test:
            # replay memory
            self.beta = self.hyper_params["PER_BETA"]
            self.memory = PrioritizedReplayBuffer(
                self.hyper_params["BUFFER_SIZE"],
                self.hyper_params["BATCH_SIZE"],
                alpha=self.hyper_params["PER_ALPHA"],
            )

    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input space.

        During training a random action is taken with probability
        ``self.epsilon``; otherwise the argmax over the last axis of the
        network output is used.
        """
        self.curr_state = state

        # epsilon greedy policy
        # pylint: disable=comparison-with-callable
        if not self.args.test and self.epsilon > np.random.random():
            selected_action = self.env.sample()
        else:
            state = torch.FloatTensor(state).to(device)
            selected_action = self.dqn(state, self.epsilon).argmax(dim=-1)
            selected_action = selected_action.detach().cpu().numpy()
        return selected_action

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env.

        In training mode one transition per worker is pushed to the replay
        memory, and the returned reward is a list rather than an array.
        """
        self.total_steps += 1
        self.episode_steps += 1

        next_state, reward, done, _ = self.env.step(action)

        if not self.args.test:
            # if the last state is not a terminal state, store done as false
            done_bool = done.copy()
            done_bool[
                np.where(self.episode_steps == self.args.max_episode_steps)
            ] = False

            action = action.tolist()
            reward = reward.tolist()
            done_bool = done_bool.tolist()

            for s, a, r, n_s, d in zip(
                self.curr_state, action, reward, next_state, done_bool
            ):
                self.memory.add(s, a, r, n_s, d)

        return next_state, reward, done

    def update_model(self) -> Tuple[torch.Tensor, ...]:
        """Run one optimisation step on a sampled minibatch.

        Returns:
            The total loss as ``loss.data`` (a tensor).
        """
        experiences = self.memory.sample(self.beta)
        states, actions, rewards, next_states, dones, weights, indexes = experiences

        q_values = self.dqn(states, self.epsilon)
        next_q_values = self.dqn(next_states, self.epsilon)
        next_target_q_values = self.dqn_target(next_states, self.epsilon)

        curr_q_value = q_values.gather(1, actions.long().unsqueeze(1))
        next_q_value = next_target_q_values.gather(  # Double DQN
            1, next_q_values.argmax(1).unsqueeze(1)
        )

        # G_t   = r + gamma * v(s_{t+1})  if state != Terminal
        #       = r                       otherwise
        masks = 1 - dones
        target = rewards + self.hyper_params["GAMMA"] * next_q_value * masks
        target = target.to(device)

        # calculate dq loss
        dq_loss_element_wise = (target - curr_q_value).pow(2)
        dq_loss = torch.mean(dq_loss_element_wise * weights)

        # q_value regularization
        q_regular = torch.norm(q_values, 2).mean() * self.hyper_params["W_Q_REG"]

        # total loss
        loss = dq_loss + q_regular

        self.dqn_optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.dqn.parameters(), self.hyper_params["GRADIENT_CLIP"])
        self.dqn_optimizer.step()

        # update target networks
        tau = self.hyper_params["TAU"]
        common_utils.soft_update(self.dqn, self.dqn_target, tau)

        # update priorities in PER
        loss_for_prior = dq_loss_element_wise.detach().cpu().numpy().squeeze()
        new_priorities = loss_for_prior + self.hyper_params["PER_EPS"]
        self.memory.update_priorities(indexes, new_priorities)

        # increase beta: anneal on training progress, i.e. the finished
        # episode count relative to the total number of episodes
        fraction = min(float(self.i_episode) / self.args.episode_num, 1.0)
        self.beta = self.beta + fraction * (1.0 - self.beta)

        return loss.data

    def load_params(self, path: str):
        """Load model and optimizer parameters from ``path``.

        Prints an error and returns without loading if ``path`` is missing.
        """
        if not os.path.exists(path):
            print("[ERROR] the input path does not exist. ->", path)
            return

        params = torch.load(path)
        self.dqn.load_state_dict(params["dqn_state_dict"])
        self.dqn_target.load_state_dict(params["dqn_target_state_dict"])
        self.dqn_optimizer.load_state_dict(params["dqn_optim_state_dict"])
        print("[INFO] loaded the model and optimizer from", path)

    def save_params(self, n_episode: int):
        """Save model and optimizer parameters."""
        params = {
            "dqn_state_dict": self.dqn.state_dict(),
            "dqn_target_state_dict": self.dqn_target.state_dict(),
            "dqn_optim_state_dict": self.dqn_optimizer.state_dict(),
        }

        AbstractAgent.save_params(self, params, n_episode)

    def write_log(self, i: int, loss: np.ndarray, score: int):
        """Write log about loss and score.

        The episode step shown is that of the first worker; the total step
        is summed over all workers.
        """
        print(
            "[INFO] episode %d, episode step: %d, total step: %d, total score: %d\n"
            "epsilon: %f, loss: %f, at %s\n"
            % (
                i,
                self.episode_steps[0],
                self.total_steps.sum(),
                score,
                self.epsilon,
                loss,
                datetime.datetime.now(),
            )
        )

        if self.args.log:
            wandb.log({"score": score, "dqn loss": loss, "epsilon": self.epsilon})

    # pylint: disable=no-self-use, unnecessary-pass
    def pretrain(self):
        """Pretraining steps (no-op here; hook for subclasses)."""
        pass

    def train(self):
        """Train the agent until more than ``args.episode_num`` episodes finish."""
        # logger
        if self.args.log:
            wandb.init()
            wandb.config.update(self.hyper_params)
            # wandb.watch([self.dqn], log="parameters")

        # pre-training if needed
        self.pretrain()

        state = self.env.reset()
        i_episode_prev = 0
        losses = list()
        i_episode = 0
        score = 0

        while i_episode <= self.args.episode_num:
            if self.args.render and i_episode >= self.args.render_after:
                self.env.render()

            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward[0]  # only the first worker's score is tracked
            i_episode_prev = i_episode
            i_episode += done.sum()
            self.i_episode = i_episode

            # save whenever the episode counter crosses a save_period multiple
            # (several workers may finish an episode in the same step)
            if (i_episode // self.args.save_period) != (
                i_episode_prev // self.args.save_period
            ):
                self.save_params(i_episode)

            if done[0]:
                if losses:
                    avg_loss = np.vstack(losses).mean(axis=0)
                    self.write_log(i_episode, avg_loss, score)
                    losses.clear()
                score = 0

            self.episode_steps[np.where(done)] = 0

            if len(self.memory) >= self.hyper_params["UPDATE_STARTS_FROM"]:
                for _ in range(self.hyper_params["MULTIPLE_LEARN"]):
                    loss = self.update_model()
                    losses.append(loss)  # for logging

                # decrease epsilon
                max_epsilon, min_epsilon, epsilon_decay, n_workers = (
                    self.hyper_params["MAX_EPSILON"],
                    self.hyper_params["MIN_EPSILON"],
                    self.hyper_params["EPSILON_DECAY"],
                    self.hyper_params["N_WORKERS"],
                )
                self.epsilon = max(
                    self.epsilon
                    - (max_epsilon - min_epsilon) * epsilon_decay * n_workers,
                    min_epsilon,
                )

        # termination
        self.env.close()
        self.save_params(i_episode)
class DDPGfDAgent(DDPGAgent):
    """DDPG agent whose replay memory is seeded with demonstrations.

    Attributes:
        memory (PrioritizedReplayBuffer): replay memory
        memory_n (ReplayBuffer): n-step replay memory (only if use_n_step)
        beta (float): beta parameter for prioritized replay buffer
        use_n_step (bool): True when hyper_params["N_STEP"] > 1

    """

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Build the demo-seeded replay memories (skipped in test mode)."""
        hp = self.hyper_params
        self.use_n_step = hp["N_STEP"] > 1

        if self.args.test:
            return

        # load demo replay memory
        with open(self.args.demo_path, "rb") as demo_file:
            demo_1step = pickle.load(demo_file)

        if self.use_n_step:
            demo_1step, demo_nstep = common_utils.get_n_step_info_from_demo(
                demo_1step, hp["N_STEP"], hp["GAMMA"]
            )

            # replay memory for multi-steps
            self.memory_n = ReplayBuffer(
                buffer_size=hp["BUFFER_SIZE"],
                n_step=hp["N_STEP"],
                gamma=hp["GAMMA"],
                demo=demo_nstep,
            )

        # replay memory for a single step
        self.beta = hp["PER_BETA"]
        self.memory = PrioritizedReplayBuffer(
            hp["BUFFER_SIZE"],
            hp["BATCH_SIZE"],
            demo=demo_1step,
            alpha=hp["PER_ALPHA"],
            epsilon_d=hp["PER_EPS_DEMO"],
        )

    def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):
        """Store a transition in the n-step and 1-step memories."""
        # with n-step enabled, whatever the n-step memory returns is what
        # gets forwarded to the 1-step memory
        if self.use_n_step:
            transition = self.memory_n.add(transition)

        # an empty tuple means there is nothing to store yet
        if not transition:
            return
        self.memory.add(transition)

    def _get_critic_loss(
        self, experiences: Tuple[torch.Tensor, ...], gamma: float
    ) -> torch.Tensor:
        """Return the element-wise squared error of the critic."""
        states, actions, rewards, next_states, dones = experiences[:5]

        # G_t   = r + gamma * v(s_{t+1})  if state != Terminal
        #       = r                       otherwise
        not_done = 1 - dones
        target_actions = self.actor_target(next_states)
        target_inputs = torch.cat((next_states, target_actions), dim=-1)
        target_values = self.critic_target(target_inputs)
        returns = rewards + gamma * target_values * not_done
        returns = returns.to(device).detach()

        # critic estimate for the stored (state, action) pairs
        estimates = self.critic(torch.cat((states, actions), dim=-1))
        return (estimates - returns).pow(2)

    def update_model(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run one critic and one actor update on a sampled minibatch.

        Returns:
            ``(actor_loss.item(), critic_loss.item())``.
        """
        hp = self.hyper_params

        batch = self.memory.sample(self.beta)
        states, actions = batch[:2]
        weights, indices, eps_d = batch[-3:]

        gamma = hp["GAMMA"]

        # train critic
        critic_clip = hp["GRADIENT_CLIP_CR"]
        critic_loss_element_wise = self._get_critic_loss(batch, gamma)
        critic_loss = torch.mean(critic_loss_element_wise * weights)

        if self.use_n_step:
            # the n-step memory is sampled at the same indices
            batch_n = self.memory_n.sample(indices)
            gamma = gamma ** hp["N_STEP"]
            n_step_loss = self._get_critic_loss(batch_n, gamma)

            # to update loss and priorities
            n_step_weight = hp["LAMBDA1"]
            critic_loss_element_wise += n_step_loss * n_step_weight
            critic_loss = torch.mean(critic_loss_element_wise * weights)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), critic_clip)
        self.critic_optimizer.step()

        # train actor
        actor_clip = hp["GRADIENT_CLIP_AC"]
        actions = self.actor(states)
        actor_inputs = torch.cat((states, actions), dim=-1)
        actor_loss_element_wise = -self.critic(actor_inputs)
        actor_loss = torch.mean(actor_loss_element_wise * weights)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), actor_clip)
        self.actor_optimizer.step()

        # update target networks
        tau = hp["TAU"]
        common_utils.soft_update(self.actor, self.actor_target, tau)
        common_utils.soft_update(self.critic, self.critic_target, tau)

        # update priorities: critic error + weighted squared actor loss
        # + constant, then the per-sample eps_d on top
        priorities = (
            critic_loss_element_wise
            + hp["LAMBDA3"] * actor_loss_element_wise.pow(2)
            + hp["PER_EPS"]
        )
        priorities = priorities.data.cpu().numpy().squeeze()
        priorities += eps_d
        self.memory.update_priorities(indices, priorities)

        # increase beta
        fraction = min(float(self.i_episode) / self.args.episode_num, 1.0)
        self.beta = self.beta + fraction * (1.0 - self.beta)

        return actor_loss.item(), critic_loss.item()

    def pretrain(self):
        """Run ``PRETRAIN_STEP`` updates before any environment interaction."""
        n_steps = self.hyper_params["PRETRAIN_STEP"]
        recent_losses = []
        print("[INFO] Pre-Train %d step." % n_steps)
        for i_step in range(1, n_steps + 1):
            t_begin = time.time()
            loss = self.update_model()
            elapsed = time.time() - t_begin
            recent_losses.append(loss)  # for logging

            # log on the first step and then every 100 steps
            if i_step != 1 and i_step % 100 != 0:
                continue
            avg_loss = np.vstack(recent_losses).mean(axis=0)
            recent_losses.clear()
            self.write_log(0, avg_loss, 0, elapsed)
        print("[INFO] Pre-Train Complete!\n")
class SACfDAgent(SACAgent):
    """SAC agent whose replay memory is seeded with demonstrations.

    Attributes:
        memory (PrioritizedReplayBuffer): replay memory
        memory_n (ReplayBuffer): n-step replay memory (only if use_n_step)
        beta (float): beta parameter for prioritized replay buffer
        use_n_step (bool): True when hyper_params["N_STEP"] > 1

    """

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Build the demo-seeded replay memories (skipped in test mode)."""
        hp = self.hyper_params
        self.use_n_step = hp["N_STEP"] > 1

        if self.args.test:
            return

        # load demo replay memory
        with open(self.args.demo_path, "rb") as demo_file:
            demo_1step = pickle.load(demo_file)

        if self.use_n_step:
            demo_1step, demo_nstep = common_utils.get_n_step_info_from_demo(
                demo_1step, hp["N_STEP"], hp["GAMMA"]
            )

            # replay memory for multi-steps
            self.memory_n = ReplayBuffer(
                buffer_size=hp["BUFFER_SIZE"],
                n_step=hp["N_STEP"],
                gamma=hp["GAMMA"],
                demo=demo_nstep,
            )

        # replay memory
        self.beta = hp["PER_BETA"]
        self.memory = PrioritizedReplayBuffer(
            hp["BUFFER_SIZE"],
            hp["BATCH_SIZE"],
            demo=demo_1step,
            alpha=hp["PER_ALPHA"],
            epsilon_d=hp["PER_EPS_DEMO"],
        )

    def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):
        """Store a transition in the n-step and 1-step memories."""
        # with n-step enabled, whatever the n-step memory returns is what
        # gets forwarded to the 1-step memory
        if self.use_n_step:
            transition = self.memory_n.add(transition)

        # an empty tuple means there is nothing to store yet
        if not transition:
            return
        self.memory.add(transition)

    # pylint: disable=too-many-statements
    def update_model(self) -> Tuple[torch.Tensor, ...]:
        """Run one update of alpha, the Q/V functions and (periodically) the actor.

        The actor, the target V network, the replay priorities and beta are
        only updated every ``POLICY_UPDATE_FREQ`` calls.

        Returns:
            ``(actor loss, qf_1 loss, qf_2 loss, vf loss, alpha loss)`` as
            Python floats; the actor loss is 0 on calls without a policy update.
        """
        self.update_step += 1
        hp = self.hyper_params

        batch = self.memory.sample(self.beta)
        states, actions, rewards, next_states, dones, weights, indices, eps_d = batch
        new_actions, log_prob, pre_tanh_value, mu, std = self.actor(states)

        # train alpha
        if hp["AUTO_ENTROPY_TUNING"]:
            entropy_gap = (log_prob + self.target_entropy).detach()
            alpha_loss = torch.mean((-self.log_alpha * entropy_gap) * weights)

            self.alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self.alpha_optimizer.step()

            alpha = self.log_alpha.exp()
        else:
            alpha_loss = torch.zeros(1)
            alpha = hp["W_ENTROPY"]

        # Q function loss
        not_done = 1 - dones
        gamma = hp["GAMMA"]
        q_1_pred = self.qf_1(states, actions)
        q_2_pred = self.qf_2(states, actions)
        next_value = self.vf_target(next_states)
        q_target = (rewards + gamma * next_value * not_done).detach()
        qf_1_loss = torch.mean((q_1_pred - q_target).pow(2) * weights)
        qf_2_loss = torch.mean((q_2_pred - q_target).pow(2) * weights)

        if self.use_n_step:
            # the n-step memory is sampled at the same indices
            _, _, rewards_n, next_states_n, dones_n = self.memory_n.sample(indices)
            gamma_n = gamma ** hp["N_STEP"]
            n_step_weight = hp["LAMBDA1"]
            not_done_n = 1 - dones_n

            next_value_n = self.vf_target(next_states_n)
            q_target_n = (rewards_n + gamma_n * next_value_n * not_done_n).detach()
            qf_1_loss_n = torch.mean((q_1_pred - q_target_n).pow(2) * weights)
            qf_2_loss_n = torch.mean((q_2_pred - q_target_n).pow(2) * weights)

            # to update loss and priorities
            qf_1_loss = qf_1_loss + qf_1_loss_n * n_step_weight
            qf_2_loss = qf_2_loss + qf_2_loss_n * n_step_weight

        # V function loss
        v_pred = self.vf(states)
        q_pred = torch.min(
            self.qf_1(states, new_actions), self.qf_2(states, new_actions)
        )
        v_target = (q_pred - alpha * log_prob).detach()
        vf_loss_element_wise = (v_pred - v_target).pow(2)
        vf_loss = torch.mean(vf_loss_element_wise * weights)

        # train Q functions
        self.qf_1_optimizer.zero_grad()
        qf_1_loss.backward()
        self.qf_1_optimizer.step()

        self.qf_2_optimizer.zero_grad()
        qf_2_loss.backward()
        self.qf_2_optimizer.step()

        # train V function
        self.vf_optimizer.zero_grad()
        vf_loss.backward()
        self.vf_optimizer.step()

        if self.update_step % hp["POLICY_UPDATE_FREQ"] == 0:
            # actor loss
            advantage = q_pred - v_pred.detach()
            actor_loss_element_wise = alpha * log_prob - advantage
            actor_loss = torch.mean(actor_loss_element_wise * weights)

            # regularization
            if not self.is_discrete:  # iff the action is continuous
                mean_reg = hp["W_MEAN_REG"] * mu.pow(2).mean()
                std_reg = hp["W_STD_REG"] * std.pow(2).mean()
                pre_activation_reg = hp["W_PRE_ACTIVATION_REG"] * (
                    pre_tanh_value.pow(2).sum(dim=-1).mean()
                )
                actor_reg = mean_reg + std_reg + pre_activation_reg

                # actor loss + regularization
                actor_loss += actor_reg

            # train actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # update target networks
            common_utils.soft_update(self.vf, self.vf_target, hp["TAU"])

            # update priorities: V error + weighted squared actor loss
            # + constant, then the per-sample eps_d on top
            priorities = (
                vf_loss_element_wise
                + hp["LAMBDA3"] * actor_loss_element_wise.pow(2)
                + hp["PER_EPS"]
            )
            priorities = priorities.data.cpu().numpy().squeeze()
            priorities += eps_d
            self.memory.update_priorities(indices, priorities)

            # increase beta
            fraction = min(float(self.i_episode) / self.args.episode_num, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)
        else:
            actor_loss = torch.zeros(1)

        return (
            actor_loss.item(),
            qf_1_loss.item(),
            qf_2_loss.item(),
            vf_loss.item(),
            alpha_loss.item(),
        )

    def pretrain(self):
        """Run ``PRETRAIN_STEP`` updates before any environment interaction."""
        n_steps = self.hyper_params["PRETRAIN_STEP"]
        recent_losses = []
        print("[INFO] Pre-Train %d steps." % n_steps)
        for i_step in range(1, n_steps + 1):
            t_begin = time.time()
            loss = self.update_model()
            elapsed = time.time() - t_begin
            recent_losses.append(loss)  # for logging

            # log on the first step and then every 100 steps
            if i_step != 1 and i_step % 100 != 0:
                continue
            avg_loss = np.vstack(recent_losses).mean(axis=0)
            recent_losses.clear()
            self.write_log(
                0,
                avg_loss,
                0,
                policy_update_freq=self.hyper_params["POLICY_UPDATE_FREQ"],
                avg_time_cost=elapsed,
            )
        print("[INFO] Pre-Train Complete!\n")
class PERDDPGAgent(DDPGAgent):
    """DDPG agent whose training batches come from a prioritized replay buffer.

    Attributes:
        memory (PrioritizedReplayBuffer): replay memory; only created when
            ``self.args.test`` is false
        beta (float): beta parameter for prioritized replay buffer; only
            created when ``self.args.test`` is false

    """

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things.

        Outside of test mode this reads ``PER_BETA`` into ``self.beta`` and
        builds the prioritized replay buffer from ``BUFFER_SIZE``,
        ``BATCH_SIZE`` and ``PER_ALPHA``. In test mode nothing is created.
        """
        if not self.args.test:
            # replay memory
            self.beta = self.hyper_params["PER_BETA"]
            self.memory = PrioritizedReplayBuffer(
                self.hyper_params["BUFFER_SIZE"],
                self.hyper_params["BATCH_SIZE"],
                alpha=self.hyper_params["PER_ALPHA"],
            )

    def update_model(self) -> Tuple[torch.Tensor, ...]:
        """Run one critic/actor update on a batch sampled with priorities.

        The method samples a batch from ``self.memory`` using ``self.beta``,
        updates the critic and then the actor (both with gradient-norm
        clipping), soft-updates the two target networks, writes new
        priorities back to the buffer and finally moves ``self.beta``
        towards 1.0.

        Returns:
            ``(actor_loss, critic_loss)``, each obtained through ``.item()``
            and therefore a plain Python number rather than a tensor.
            NOTE(review): the return annotation still says tensors; it is
            left untouched here - confirm against the base class before
            tightening it.
        """
        experiences = self.memory.sample(self.beta)
        (states, actions, rewards, next_states, dones, weights,
         indexes) = experiences

        # G_t   = r + gamma * v(s_{t+1})  if state != Terminal
        #       = r                       otherwise
        masks = 1 - dones
        # The target is a constant for the critic update, so it is computed
        # without recording an autograd graph through the target networks.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            next_values = self.critic_target(
                torch.cat((next_states, next_actions), dim=-1))
            curr_returns = rewards + self.hyper_params[
                "GAMMA"] * next_values * masks
        curr_returns = curr_returns.to(device)

        # train critic
        gradient_clip_cr = self.hyper_params["GRADIENT_CLIP_CR"]
        values = self.critic(torch.cat((states, actions), dim=-1))
        critic_loss_element_wise = (values - curr_returns).pow(2)
        # `weights` come from the buffer's sample(); presumably the PER
        # importance-sampling weights - confirm against the buffer.
        critic_loss = torch.mean(critic_loss_element_wise * weights)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), gradient_clip_cr)
        self.critic_optimizer.step()

        # train actor
        gradient_clip_ac = self.hyper_params["GRADIENT_CLIP_AC"]
        # From here on `actions` are the current policy's actions, not the
        # ones stored in the batch.
        actions = self.actor(states)
        actor_loss_element_wise = -self.critic(
            torch.cat((states, actions), dim=-1))
        actor_loss = torch.mean(actor_loss_element_wise * weights)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), gradient_clip_ac)
        self.actor_optimizer.step()

        # update target networks
        tau = self.hyper_params["TAU"]
        common_utils.soft_update(self.actor, self.actor_target, tau)
        common_utils.soft_update(self.critic, self.critic_target, tau)

        # update priorities in PER: element-wise squared critic error from
        # before the critic step, offset by PER_EPS
        new_priorities = (critic_loss_element_wise.detach().cpu().numpy() +
                          self.hyper_params["PER_EPS"])
        self.memory.update_priorities(indexes, new_priorities)

        # increase beta: close the gap to 1.0 by the share of episodes done
        fraction = min(float(self.i_episode) / self.args.episode_num, 1.0)
        self.beta = self.beta + fraction * (1.0 - self.beta)

        return actor_loss.item(), critic_loss.item()