def test_sample(self):
    buffer_size = 500
    obs_shape = (84, 84, 3)
    act_dim = 4

    rb = PrioritizedReplayBuffer(buffer_size,
                                 {"obs": {"shape": obs_shape},
                                  "act": {"shape": act_dim},
                                  "rew": {},
                                  "done": {}},
                                 next_of="obs")

    obs = np.zeros(obs_shape)
    act = np.ones(act_dim)
    rew = 1
    done = 0

    rb.add(obs=obs, act=act, rew=rew, next_obs=obs, done=done)

    ps = 1.5
    rb.add(obs=obs, act=act, rew=rew, next_obs=obs, done=done, priorities=ps)

    self.assertAlmostEqual(rb.get_max_priority(), 1.5)

    obs = np.stack((obs, obs))
    act = np.stack((act, act))
    rew = (1, 0)
    done = (0.0, 1.0)

    rb.add(obs=obs, act=act, rew=rew, next_obs=obs, done=done)

    ps = (0.2, 0.4)
    rb.add(obs=obs, act=act, rew=rew, next_obs=obs, done=done, priorities=ps)

    sample = rb.sample(64)

    w = sample["weights"]
    i = sample["indexes"]

    rb.update_priorities(i, w * w)
def test_read_only_priority(self):
    buffer_size = 100
    batch_size = 32

    env_dict = {"done": {}}

    done = np.zeros(2)
    ps = np.ones_like(done)
    ps.setflags(write=False)

    rb = PrioritizedReplayBuffer(buffer_size, env_dict)
    rb.add(done=done, priority=ps)

    sample = rb.sample(batch_size)
    ps2 = sample["weights"]
    ps2.setflags(write=False)

    rb.update_priorities(sample["indexes"], ps2)
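# Read together with the tests above, this is a minimal, self-contained sketch of
# the usual cpprb PER cycle: add transitions, sample with an importance-sampling
# exponent beta, then write new priorities back. The shapes, alpha/beta values and
# the fake TD errors below are illustrative assumptions, not part of the tests.
import numpy as np
from cpprb import PrioritizedReplayBuffer

buffer_size = 256
rb = PrioritizedReplayBuffer(buffer_size,
                             {"obs": {"shape": (4,)},
                              "act": {"shape": 1},
                              "rew": {},
                              "next_obs": {"shape": (4,)},
                              "done": {}},
                             alpha=0.6)

# Fill with dummy transitions; unspecified priorities default to the current max.
for _ in range(buffer_size):
    rb.add(obs=np.random.rand(4), act=0, rew=1.0,
           next_obs=np.random.rand(4), done=0)

sample = rb.sample(32, beta=0.4)          # beta: importance-sampling exponent
weights, indexes = sample["weights"], sample["indexes"]

fake_td_error = np.random.rand(32)        # stand-in for |Q_target - Q|
rb.update_priorities(indexes, fake_td_error + 1e-6)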
class RainbowAgent: """Agent interacting with environment. Attribute: env (gym.Env): openAI Gym environment memory (PrioritizedReplayBuffer): replay memory to store transitions batch_size (int): batch size for sampling target_update (int): period for target model's hard update gamma (float): discount factor dqn (Network): model to train and select actions dqn_target (Network): target model to update optimizer (torch.optim): optimizer for training dqn transition (list): transition information including state, action, reward, next_state, done v_min (float): min value of support v_max (float): max value of support atom_size (int): the unit number of support support (torch.Tensor): support for categorical dqn use_n_step (bool): whether to use n_step memory n_step (int): step number to calculate n-step td error memory_n (ReplayBuffer): n-step replay buffer """ def __init__( self, env: gym.Env, memory_size: int, batch_size: int, target_update: int, gamma: float = 0.99, # PER parameters alpha: float = 0.2, beta: float = 0.6, prior_eps: float = 1e-6, # Categorical DQN parameters v_min: float = 0.0, v_max: float = 200.0, atom_size: int = 51, # N-step Learning n_step: int = 3, # Convergence parameters convergence_window: int = 100, convergence_window_epsilon_p: int = 10, convergence_avg_score: float = 195.0, convergence_avg_epsilon: float = 0.0524, # 3 degs converted to rads convergence_avg_epsilon_p: float = 0.0174, # 1 deg/s converted to rad/s # Tensorboard parameters model_name: str = "snake_joint", ): """Initialization. Args: env (gym.Env): openAI Gym environment memory_size (int): length of memory batch_size (int): batch size for sampling target_update (int): period for target model's hard update lr (float): learning rate gamma (float): discount factor alpha (float): determines how much prioritization is used beta (float): determines how much importance sampling is used prior_eps (float): guarantees every transition can be sampled v_min (float): min value of support v_max (float): max value of support atom_size (int): the unit number of support n_step (int): step number to calculate n-step td error """ obs_dim = env.observation_space.shape[0] action_dim = env.action_space.n self.env = env self.batch_size = batch_size self.target_update = target_update self.gamma = gamma # NoisyNet: All attributes related to epsilon are removed #produces a unique timestamp for each run run_timestamp=str( #returns number of day and number of month str(time.localtime(time.time())[2]) + "_" + str(time.localtime(time.time())[1]) + "_" + #returns hour, minute and second str(time.localtime(time.time())[3]) + "_" + str(time.localtime(time.time())[4]) + "_" + str(time.localtime(time.time())[5]) ) #Will write scalars that can be visualized using tensorboard in the directory "runLogs/timestamp" self.writer = SummaryWriter("runLogs/" + run_timestamp) # device: cpu / gpu self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) print(self.device) # PER # memory for 1-step Learning self.beta = beta self.prior_eps = prior_eps self.memory = PrioritizedReplayBuffer( memory_size, { "obs": {"shape": (obs_dim,)}, "act": {"shape": (1,)}, "rew": {}, "next_obs": {"shape": (obs_dim,)}, "done": {} }, alpha=alpha ) # memory for N-step Learning self.use_n_step = True if n_step > 1 else False if self.use_n_step: self.n_step = n_step self.memory_n = ReplayBuffer( memory_size, { "obs": {"shape": (obs_dim,)}, "act": {"shape": (1,)}, "rew": {}, "next_obs": {"shape": (obs_dim,)}, "done": {} }, Nstep={ "size": n_step, 
"gamma": gamma, "rew": "rew", "next": "next_obs" } ) # Categorical DQN parameters self.v_min = v_min self.v_max = v_max self.atom_size = atom_size self.support = torch.linspace( self.v_min, self.v_max, self.atom_size ).to(self.device) # networks: dqn, dqn_target self.dqn = Network( obs_dim, action_dim, self.atom_size, self.support ).to(self.device) self.dqn_target = Network( obs_dim, action_dim, self.atom_size, self.support ).to(self.device) self.dqn_target.load_state_dict(self.dqn.state_dict()) self.dqn_target.eval() # optimizer self.optimizer = optim.Adam(self.dqn.parameters(),0.0001) # transition to store in memory self.transition = list() # mode: train / test self.is_test = False # Custom tensorboard object # self.tensorboard = RainbowTensorBoard( # log_dir="single_joint_logs/{}-{}".format( # model_name, # datetime.now().strftime("%m-%d-%Y-%H_%M_%S") # ) # ) # Convergence criterion self.convergence_window = convergence_window self.convergence_window_epsilon_p = convergence_window_epsilon_p self.convergence_avg_score = convergence_avg_score self.convergence_avg_epsilon = convergence_avg_epsilon self.convergence_avg_epsilon_p = convergence_avg_epsilon_p def select_action(self, state: np.ndarray) -> np.ndarray: """Select an action from the input state.""" # NoisyNet: no epsilon greedy action selection selected_action = self.dqn( torch.FloatTensor(state).to(self.device) ).argmax() selected_action = selected_action.detach().cpu().numpy() if not self.is_test: self.transition = [state, selected_action] return selected_action def step(self, action: np.ndarray, score:int) -> Tuple[np.ndarray, np.float64, bool]: """Take an action and return the response of the env.""" next_state, reward, done, _ = self.env.step(action,score) if not self.is_test: self.transition += [reward, next_state, done] # N-step transition if self.use_n_step: idx = self.memory_n.add( **dict( zip(["obs", "act", "rew", "next_obs", "done"], self.transition) ) ) one_step_transition = [ v[idx] for _,v in self.memory_n.get_all_transitions().items()] if idx else None # 1-step transition else: one_step_transition = self.transition # add a single step transition if one_step_transition: self.memory.add( **dict( zip(["obs", "act", "rew", "next_obs", "done"], one_step_transition) ) ) return next_state, reward, done def update_model(self,frame_idx:int) -> torch.Tensor: """Update the model by gradient descent. shape of elementwise_loss = [128,51] shape of loss = ([]) shape of weights ([128,1)] """ # PER needs beta to calculate weights samples = self.memory.sample(self.batch_size, beta=self.beta) weights = torch.FloatTensor( samples["weights"].reshape(-1, 1) ).to(self.device) indices = samples["indexes"] #rospy.loginfo(samples.keys()) #rospy.loginfo(weights.shape) #rospy.loginfo(indices.shape()) #torch.save(self.dqn.state_dict(),str("checkpoint_"+str(time.time()))) # 1-step Learning loss elementwise_loss = self._compute_dqn_loss(samples, self.gamma) # PER: importance sampling before average loss = torch.mean(elementwise_loss * weights) self.writer.add_scalar('update_model/Lossv0', loss.detach().item(),frame_idx ) # N-step Learning loss # we are gonna combine 1-step loss and n-step loss so as to # prevent high-variance. The original rainbow employs n-step loss only. 
if self.use_n_step: gamma = self.gamma ** self.n_step samples = {k: [v[i] for i in indices] for k,v in self.memory_n.get_all_transitions().items()} elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma) elementwise_loss += elementwise_loss_n_loss #rospy.loginfo(elementwise_loss_n_loss.shape) #rospy.loginfo(elementwise_loss.shape) # PER: importance sampling before average loss = torch.mean(elementwise_loss * weights) rospy.loginfo( f"{elementwise_loss}" ) self.optimizer.zero_grad() self.writer.add_scalar('update_model/Lossv1', loss.detach().item(),frame_idx ) #From pytorch doc: backward() Computes the gradient of current tensor w.r.t. graph leaves. #self.writer.add_image("loss gradient before", loss, frame_idx) loss.backward() #self.writer.add_image("loss gradient after", loss, frame_idx) self.writer.add_scalar('update_model/Lossv2', loss.detach().item(),frame_idx ) clip_grad_norm_(self.dqn.parameters(), 10.0) self.optimizer.step() # PER: update priorities loss_for_prior = elementwise_loss.detach().cpu().numpy() new_priorities = loss_for_prior + self.prior_eps self.memory.update_priorities(indices, new_priorities) # NoisyNet: reset noise self.dqn.reset_noise() self.dqn_target.reset_noise() #rospy.loginfo("second") #rospy.loginfo(loss.shape) #rospy.loginfo("loss dimension = " + loss.ndim() ) #rospy.loginfo("loss = " + str(loss.detach().item()) + "type = " + str(type(loss.detach().item()) ) ) self.writer.add_scalar('update_model/Loss', loss.detach().item(),frame_idx ) return loss.detach().item() def train(self, num_frames: int): """Train the agent.""" self.is_test = False state = self.env.reset() update_cnt = 0 losses = [] scores = [] score = 0 for frame_idx in tqdm(range(1, num_frames + 1)): action = self.select_action(state) next_state, reward, done = self.step(action,score) state = next_state score += reward # NoisyNet: removed decrease of epsilon # PER: increase beta fraction = min(frame_idx / num_frames, 1.0) self.beta = self.beta + fraction * (1.0 - self.beta) # if episode ends if done: #rospy.loginfo("logging for done") self.writer.add_scalar('train/score', score, frame_idx) self.writer.add_scalar('train/final_epsilon', state[6], frame_idx) self.writer.add_scalar('train/epsilon_p', state[7], frame_idx) state = self.env.reset() scores.append(score) score = 0 # if training is ready if self.memory.get_stored_size() >= self.batch_size: #frame_id given as argument for logging by self.writer. 
#rospy.loginfo("frame_idx= " + str(frame_idx) + "type = " + str(type(frame_idx))) loss = self.update_model(frame_idx) losses.append(loss) update_cnt += 1 # if hard update is needed if update_cnt % self.target_update == 0: self._target_hard_update(loss) self.env.close() def test(self) -> List[np.ndarray]: """Test the agent.""" self.is_test = True state = self.env.reset() done = False score = 0 frames = [] while not done: frames.append(self.env.render(mode="rgb_array")) action = self.select_action(state) next_state, reward, done = self.step(action) state = next_state score += reward print("score: ", score) self.env.close() return frames def _compute_dqn_loss(self, samples: Dict[str, np.ndarray], gamma: float) -> torch.Tensor: """Return categorical dqn loss.""" device = self.device # for shortening the following lines state = torch.FloatTensor(samples["obs"]).to(device) next_state = torch.FloatTensor(samples["next_obs"]).to(device) action = torch.LongTensor(samples["act"]).to(device) reward = torch.FloatTensor(np.array(samples["rew"]).reshape(-1, 1)).to(device) done = torch.FloatTensor(np.array(samples["done"]).reshape(-1, 1)).to(device) # Categorical DQN algorithm delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1) with torch.no_grad(): # Double DQN next_action = self.dqn(next_state).argmax(1) next_dist = self.dqn_target.dist(next_state) next_dist = next_dist[range(self.batch_size), next_action] t_z = reward + (1 - done) * gamma * self.support t_z = t_z.clamp(min=self.v_min, max=self.v_max) b = (t_z - self.v_min) / delta_z l = b.floor().long() u = b.ceil().long() offset = ( torch.linspace( 0, (self.batch_size - 1) * self.atom_size, self.batch_size ).long() .unsqueeze(1) .expand(self.batch_size, self.atom_size) .to(self.device) ) proj_dist = torch.zeros(next_dist.size(), device=self.device) proj_dist.view(-1).index_add_( 0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1) ) proj_dist.view(-1).index_add_( 0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1) ) print(f"Next Action : {next_action}\n Next Dist : {next_dist}\n") dist = self.dqn.dist(state) log_p = torch.log(dist[range(self.batch_size), action]) elementwise_loss = -(proj_dist * log_p).sum(1) print(f"Proj Dist : {proj_dist}\n Dist : {dist}\n Log_p : {log_p}\n") if torch.isnan(elementwise_loss[0][0]): exit() return elementwise_loss def _target_hard_update(self,loss): """Hard update: target <- local.""" self.dqn_target.load_state_dict(self.dqn.state_dict()) #torch.save(self.dqn.state_dict(),str("checkpoint_"+str(time.time()))) torch.save({ 'model_state_dict': self.dqn.state_dict(), 'optimizer_state_dict': self.optimizer.state_dict(), 'loss': loss, }, str("checkpoints/checkpoint_"+str(time.time())))
tf.constant(sample["rew"].ravel()), tf.constant(sample["done"].ravel()), discount, tf.constant(env.action_space.n)) absTD = tf.math.abs(target_Q - Q) loss = tf.reduce_mean(loss_func(absTD) * weights) grad = tape.gradient(loss, model.trainable_weights) optimizer.apply_gradients(zip(grad, model.trainable_weights)) tf.summary.scalar("Loss vs training step", data=loss, step=n_step) if prioritized: Q = Q_func(model, tf.constant(sample["obs"]), tf.constant(sample["act"].ravel()), tf.constant(env.action_space.n)) absTD = tf.math.abs(target_Q - Q) rb.update_priorities(sample["indexes"], absTD) if done: observation = env.reset() rb.on_episode_end() n_episode += 1 if n_step % target_update_freq == 0: target_model.set_weights(model.get_weights()) if n_step % eval_freq == eval_freq - 1: eval_rew = evaluate(model, eval_env) tf.summary.scalar("episode reward vs training step", data=eval_rew, step=n_step)
class Agent:
    def __init__(self, lr, state_shape, num_actions, batch_size,
                 max_mem_size=100000):
        self.lr = lr
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size

        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)
        self.importance_exp = Lerper(start=0.4, end=1.0, num_steps=100000)
        self.priority_exp = 0.6

        self.memory = PrioritizedReplayBuffer(max_mem_size,
                                              {"obs": {"shape": state_shape},
                                               "act": {"shape": 1},
                                               "rew": {},
                                               "next_obs": {"shape": state_shape},
                                               "done": {"shape": 1}},
                                              alpha=self.priority_exp)

        self.net = Network(lr, state_shape, num_actions)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon.value():
            state = torch.tensor(observation).float().detach()
            state = state.to(self.net.device)
            state = state.unsqueeze(0)

            q_values = self.net(state)
            action = torch.argmax(q_values).item()
            return action
        else:
            return np.random.choice(self.action_space)

    def store_memory(self, state, action, reward, next_state, done):
        self.memory.add(obs=state, act=action, rew=reward,
                        next_obs=next_state, done=done)

    def learn(self):
        if self.memory.get_stored_size() < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size, self.importance_exp.value())

        states = torch.tensor(batch["obs"]).to(self.net.device)
        actions = torch.tensor(batch["act"], dtype=torch.int64).to(self.net.device).T[0]
        rewards = torch.tensor(batch["rew"]).to(self.net.device).T[0]
        states_ = torch.tensor(batch["next_obs"]).to(self.net.device)
        dones = torch.tensor(batch["done"], dtype=torch.bool).to(self.net.device).T[0]
        weights = torch.tensor(batch["weights"]).to(self.net.device)

        batch_index = np.arange(self.batch_size, dtype=np.int64)

        q_values = self.net(states)[batch_index, actions]
        q_values_ = self.net(states_)

        action_qs_ = torch.max(q_values_, dim=1)[0]
        action_qs_[dones] = 0.0
        q_target = rewards + self.gamma * action_qs_

        td = q_target - q_values

        self.net.optimizer.zero_grad()
        loss = ((td ** 2.0) * weights).mean()
        loss.backward()
        self.net.optimizer.step()

        new_priorities = (td.abs()).detach().cpu()
        self.memory.update_priorities(batch["indexes"], new_priorities)

        self.epsilon.step()
        self.importance_exp.step()
                break

        if frame < params.init_replay:
            continue

        if frame % args.envs == 0:
            if args.priority:
                batch = buffer.sample(params.batch_size, BETA)
                BETA = min(BETA + frame * step, TGT_BETA)
            else:
                batch = buffer.sample(params.batch_size)

            optimizer.zero_grad()
            if args.priority:
                loss_v, batch_prios, batch_indexes = calc_loss_dqn(
                    batch, net, tgt_net, params.gamma, device, True)
            else:
                loss_v = calc_loss_dqn(
                    batch, net, tgt_net, params.gamma ** args.steps, device=device)
            loss_v.backward()
            optimizer.step()

            if args.priority:
                buffer.update_priorities(batch_indexes, batch_prios)
            # del batch, loss

            if mean and selector.epsilon <= params.eps_final:
                scheduler.step(round(mean) * 2 / 2)

            if frame % params.sync_nets == 0:
                tgt_net.sync()
class ReplayMemory():
    def __init__(self, args, capacity, env):
        # Initial importance sampling weight β, annealed to 1 over course of training
        self.priority_weight = args.priority_weight
        self.n = args.multi_step
        self.device = args.device

        if args.mmap:
            os.makedirs('memories/', exist_ok=True)
            mmap_prefix = 'memories/mm'
        else:
            mmap_prefix = None

        self.buffer = PrioritizedReplayBuffer(
            capacity,
            {
                "obs": {"shape": env.observation_space.shape,
                        "dtype": env.observation_space.dtype},
                "next_obs": {"shape": env.observation_space.shape,
                             "dtype": env.observation_space.dtype},
                "act": {"shape": 1, "dtype": env.action_space.dtype},
                "rew": {"dtype": np.float32},
                "done": {"dtype": np.uint8},
            },
            Nstep={
                "size": self.n,
                "gamma": args.discount,
                "rew": "rew",
                "next": "next_obs",
            },
            mmap_prefix=mmap_prefix,
            alpha=args.priority_exponent,
            # next_of="obs",
            # stack_compress="obs",
        )

    def append(self, state, next_state, action, reward, done):
        self.buffer.add(**{
            "obs": state,
            "next_obs": next_state,
            "act": action,
            "rew": reward,
            "done": done,
        })

    def sample(self, size):
        s = self.buffer.sample(size, self.priority_weight)
        s['indexes'] = s['indexes'].astype(np.int32)
        return torchify((s['indexes'], torch.int32),
                        (s['obs'], torch.float32),
                        (np.squeeze(s['act'], 1), torch.long),
                        (np.squeeze(s['rew'], 1), torch.float32),
                        (s['next_obs'], torch.float32),
                        (s['done'], torch.bool),
                        (s['weights'], torch.float32),
                        device=self.device)

    def update_priorities(self, indexes, new_priorities):
        indexes = indexes.cpu().numpy()
        self.buffer.update_priorities(indexes, new_priorities)
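# A small self-contained check of the Nstep behaviour ReplayMemory relies on:
# with Nstep configured, cpprb stores the discounted n-step return under "rew"
# and the observation n steps ahead under "next_obs". All values are made up.
import numpy as np
from cpprb import ReplayBuffer

gamma, n = 0.99, 3
rb = ReplayBuffer(32,
                  {"obs": {"shape": (1,)},
                   "act": {"shape": 1},
                   "rew": {},
                   "next_obs": {"shape": (1,)},
                   "done": {}},
                  Nstep={"size": n, "gamma": gamma,
                         "rew": "rew", "next": "next_obs"})

for t in range(5):
    rb.add(obs=[t], act=0, rew=1.0, next_obs=[t + 1], done=0)
rb.on_episode_end()

tr = rb.get_all_transitions()
print(tr["rew"][0])       # ~ 1 + gamma + gamma**2 for the first stored transition
print(tr["next_obs"][0])  # observation from n steps later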
class RainbowAgent: """ Rainbow Agent interacting with environment. Attribute: env (gym.Env): openAI Gym environment (connected to Gazebo node) memory (PrioritizedReplayBuffer): replay memory to store transitions batch_size (int): batch size for sampling target_update (int): period for target model's hard update gamma (float): discount factor dqn (Network): model to train and select actions dqn_target (Network): target model to update optimizer (torch.optim): optimizer for training dqn transition (list): transition information including state, action, reward, next_state, done v_min (float): min value of support v_max (float): max value of support atom_size (int): the unit number of support support (torch.Tensor): support for categorical dqn use_n_step (bool): whether to use n_step memory n_step (int): step number to calculate n-step td error memory_n (ReplayBuffer): n-step replay buffer """ def __init__( self, env: gym.Env, memory_size: int, batch_size: int, target_update: int, gamma: float = 0.99, # PER parameters alpha: float = 0.2, beta: float = 0.6, prior_eps: float = 1e-6, # Categorical DQN parameters v_min: float = 0.0, v_max: float = 200.0, atom_size: int = 51, # N-step Learning n_step: int = 3, # Convergence parameters convergence_window: int = 100, convergence_window_epsilon_p: int = 10, convergence_avg_score: float = 195.0, convergence_avg_epsilon: float = 0.0524, # 3 degs converted to rads convergence_avg_epsilon_p: float = 0.0174, # 1 deg/s converted to rad/s # Tensorboard parameters model_name: str = "snake_joint", ): """ Initialization. Args: env_client (GymEnvClient): ROS client to an openAI Gym environment server memory_size (int): length of memory batch_size (int): batch size for sampling target_update (int): period for target model's hard update lr (float): learning rate gamma (float): discount factor alpha (float): determines how much prioritization is used beta (float): determines how much importance sampling is used prior_eps (float): guarantees every transition can be sampled v_min (float): min value of support v_max (float): max value of support atom_size (int): the unit number of support n_step (int): step number to calculate n-step td error """ obs_dim = env.observation_space.shape[0] action_dim = env.action_space.n self.env = env self.batch_size = batch_size self.target_update = target_update self.gamma = gamma # Selecting computing device physical_devices = tf.config.list_physical_devices('GPU') n_gpu = len(physical_devices) rospy.loginfo("Number of GPU detected : " + str(n_gpu)) if n_gpu > 0: rospy.loginfo("Switching to single GPU mode : /device:GPU:0") self.used_device = "/device:GPU:0" tf.config.experimental.set_memory_growth(physical_devices[0], True) else: rospy.loginfo("No GPU detected. 
Switching to single CPU mode : /device:CPU:0") self.used_device = "/device:CPU:0" # PER # memory for 1-step learning self.beta = beta self.prior_eps = prior_eps self.memory = PrioritizedReplayBuffer( memory_size, { "obs": {"shape": (obs_dim,)}, "act": {"shape": (1,)}, "rew": {}, "next_obs": {"shape": (obs_dim,)}, "done": {} }, alpha=alpha ) # memory for N-step learning self.use_n_step = True if n_step > 1 else False if self.use_n_step: self.n_step = n_step self.memory_n = ReplayBuffer( memory_size, { "obs": {"shape": (obs_dim,)}, "act": {"shape": (1,)}, "rew": {}, "next_obs": {"shape": (obs_dim,)}, "done": {} }, Nstep={ "size": n_step, "gamma": gamma, "rew": "rew", "next": "next_obs" } ) # Categorical DQN parameters self.v_min = v_min self.v_max = v_max self.atom_size = atom_size self.support = tf.linspace(self.v_min, self.v_max, self.atom_size, name="support") # networks: dqn, dqn_target self.dqn = Network( obs_dim, action_dim, self.atom_size, self.support, name="dqn" ) self.dqn_target = Network( obs_dim, action_dim, self.atom_size, self.support, name="dqn_target" ) # optimizer self.optimizer = Adam( learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, name='AdamOptimizer' ) # transition to store in memory self.transition = list() # mode: train / test self.is_test = False # Custom tensorboard object self.tensorboard = RainbowTensorBoard( log_dir="single_joint_logs/{}-{}".format( model_name, datetime.now().strftime("%m-%d-%Y-%H_%M_%S") ) ) # Convergence criterion self.convergence_window = convergence_window self.convergence_window_epsilon_p = convergence_window_epsilon_p self.convergence_avg_score = convergence_avg_score self.convergence_avg_epsilon = convergence_avg_epsilon self.convergence_avg_epsilon_p = convergence_avg_epsilon_p #TODO # model checkpoint object self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.dqn_target) self.checkpoint_manager = tf.train.CheckpointManager( self.checkpoint, directory="single_joint_ckpts", max_to_keep=5 ) def select_action(self, state: np.ndarray) -> np.ndarray: """Select an action from the input state.""" # NoisyNet: no epsilon greedy action selection selected_action = tf.math.argmax(self.dqn( tf.constant(state.reshape(1, state.shape[0]), dtype=tf.float32) ), axis=-1, name="argmax_selected_action") # Convert to numpy ndarray datatype selected_action = selected_action.numpy() if not self.is_test: self.transition = [state, selected_action] return selected_action def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]: """ Take an action and return the response of the env. 
""" next_state, reward, done, _ = self.env.step(action,score) if not self.is_test: self.transition += [reward, next_state, done] # N-step transition if self.use_n_step: idx = self.memory_n.add( **dict( zip(["obs", "act", "rew", "next_obs", "done"], self.transition) ) ) one_step_transition = [ v[idx] for _,v in self.memory_n.get_all_transitions().items()] if idx else None # 1-step transition else: one_step_transition = self.transition # add a single step transition if one_step_transition: self.memory.add( **dict( zip(["obs", "act", "rew", "next_obs", "done"], one_step_transition) ) ) return next_state, reward, done def update_model(self) -> tf.Tensor: """ Update the model by gradient descent """ # PER needs beta to calculate weights samples = self.memory.sample(self.batch_size, beta=self.beta) weights = tf.constant( samples["weights"].reshape(-1, 1), dtype=tf.float32, name="update_model_weights" ) indices = samples["indexes"] # 1-step Learning loss elementwise_loss = self._compute_dqn_loss(samples, self.gamma) with tf.GradientTape() as tape: # PER: importance of sampling before average loss = tf.math.reduce_mean(elementwise_loss * weights) # N-step Learning loss # We are going to combine 1-ste[ loss and n-step loss so as to # prevent high-variance. if self.use_n_step: gamma = self.gamma ** self.n_step samples = {k: [v[i] for i in indices] for k,v in self.memory_n.get_all_transitions().items()} elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma) elementwise_loss += elementwise_loss_n_loss # PER: importance of sampling before average loss = tf.math.reduce_mean(elementwise_loss * weights) dqn_variables = self.dqn.trainable_variables gradients = tape.gradient(loss, dqn_variables) gradients, _ = tf.clip_by_global_norm(gradients, 10.0) self.optimizer.apply_gradients(zip(gradients, dqn_variables)) # PER: update priorities loss_for_prior = elementwise_loss.numpy() new_priorities = loss_for_prior + self.prior_eps self.memory.update_priorities(indices, new_priorities) # NoisyNet: reset noise self.dqn.reset_noise() self.dqn_target.reset_noise() return loss.numpy().ravel() def train(self, num_frames: int): """Train the agent.""" self.is_test = False state = self.env.reset() update_cnt = 0 scores = deque(maxlen=self.convergence_window) joint_epsilon = deque(maxlen=self.convergence_window) joint_epsilon_p = deque(maxlen=self.convergence_window_epsilon_p) score = 0 # cumulated reward episode_length = 0 episode_cnt = 0 for frame_idx in tqdm(range(1, num_frames + 1), file=tqdm_out): action = self.select_action(state) next_state, reward, done = self.step(action) state = next_state score += reward episode_length += 1 # PER: increase beta fraction = min(frame_idx / num_frames, 1.0) self.beta = self.beta + fraction * (1.0 - self.beta) print("epsilon_p is {}".format(state[7])) print("epsilon is {}".format(state[6])) if done: print("done") # to be used for convergence criterion scores.append(score) joint_epsilon.append(state[6]) joint_epsilon_p.append(state[7]) # state = self.env.reset() self.tensorboard.update_stats( score={ "data": score, "desc": "Score (or cumulated rewards) for an episode - episode index on x-axis." 
}, episode_length={ "data": episode_length, "desc": "Episode length (in frames)" }, final_epsilon={ "data": state[6], "desc": "Value of epsilon = abs(theta_ld - theta_l) at the last frame of an episode" }, final_epsilon_p={ "data": state[7], "desc": "Value of d(epsilon)/dt at the last frame of an episode" } ) score = 0 episode_length = 0 episode_cnt += 1 # check convergence criterion converged = bool( len(scores) == self.convergence_window and # be sure the score buffer is full len(joint_epsilon) == self.convergence_window and # same for epsilon buffer len(joint_epsilon_p) == self.convergence_window and # same for epsilon_p buffer mean(scores) > self.convergence_avg_score and mean(joint_epsilon) < self.convergence_avg_epsilon and mean(joint_epsilon_p) < self.convergence_avg_epsilon_p ) if converged: rospy.loginfo("Ran {} episodes. Solved after {} trials".format(episode_cnt, frame_idx)) return # if training is ready if self.memory.get_stored_size() >= self.batch_size: loss = self.update_model() # plotting loss every frame self.tensorboard.update_stats( loss={ "data": loss[0], "desc": "Loss value." } ) update_cnt += 1 # if hard update is needed if update_cnt % self.target_update == 0: self._target_hard_update() # checkpointing of target model (only if the loss decrease) self.checkpoint_manager.save() self.env.close() def test(self) -> List[np.ndarray]: """Test the agent.""" self.is_test = True state = self.env.reset() done = False score = 0 frames = [] while not done: frames.append(self.env.render(mode="rgb_array")) action = self.select_action(state) next_state, reward, done = self.step(action) state = next_state score += reward rospy.loginfo("score: ", score) self.env.close() return frames def _compute_dqn_loss(self, samples: Dict[str, np.ndarray], gamma: float) -> tf.Tensor: with tf.device(self.used_device): state = tf.constant(samples["obs"], dtype=tf.float32) next_state = tf.constant(samples["next_obs"], dtype=tf.float32) action = tf.constant(samples["act"], dtype=tf.float32) reward = tf.reshape(tf.constant(samples["rew"], dtype=tf.float32), [-1, 1]) done = tf.reshape(tf.constant(samples["done"], dtype=tf.float32), [-1, 1]) # Categorical DQN algorithm delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1) # Double DQN next_action = tf.math.argmax(self.dqn(next_state), axis=1) next_dist = self.dqn_target.dist(next_state) next_dist = tf.gather_nd( next_dist, [[i, next_action.numpy()[0]] for i in range(self.batch_size)] ) t_z = reward + (1 - done) * gamma * self.support t_z = tf.clip_by_value(t_z, clip_value_min=self.v_min, clip_value_max=self.v_max) b = tf.dtypes.cast((t_z - self.v_min) / delta_z, tf.float64) l = tf.dtypes.cast(tf.math.floor(b), tf.float64) u = tf.dtypes.cast(tf.math.ceil(b), tf.float64) offset = ( tf.broadcast_to( tf.expand_dims( tf.dtypes.cast( tf.linspace(0, (self.batch_size - 1) * self.atom_size, self.batch_size), tf.float64 ), axis=1 ), [self.batch_size, self.atom_size] ) ) proj_dist = tf.zeros(tf.shape(next_dist), tf.float64) # casting next_dist = tf.dtypes.cast(next_dist, tf.float64) proj_dist = tf.tensor_scatter_nd_add( tf.reshape(proj_dist, [-1]), # input tensor tf.reshape(tf.dtypes.cast(l + offset, tf.int64), [-1, 1]), # indices tf.reshape((next_dist * (u - b)), [-1]) # updates ) proj_dist = tf.tensor_scatter_nd_add( proj_dist, tf.reshape(tf.dtypes.cast(u + offset, tf.int64), [-1, 1]), # indices tf.reshape((next_dist * (b - l)), [-1]) # updates ) proj_dist = tf.reshape(proj_dist, [self.batch_size, self.atom_size]) dist = self.dqn.dist(state) #log_p = 
#           tf.math.log(dist[range(self.batch_size), action])
            log_p = tf.dtypes.cast(
                tf.math.log(
                    tf.gather_nd(
                        dist,
                        [[i, tf.dtypes.cast(tf.reshape(action, [-1]), tf.int32).numpy()[i]]
                         for i in range(self.batch_size)]
                    )
                ),
                tf.float64
            )
            elementwise_loss = tf.math.reduce_sum(-(proj_dist * log_p), axis=1)

        return tf.dtypes.cast(elementwise_loss, tf.float32)

    def _target_hard_update(self):
        """Hard update: target <- local."""
        tf.saved_model.save(self.dqn, "single_joint_dqn")
        self.dqn_target = tf.saved_model.load("single_joint_dqn")
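# If Network is a tf.keras.Model (an assumption -- the class is not shown here),
# the SavedModel round-trip in _target_hard_update above can be replaced by an
# in-memory weight copy. A self-contained demonstration with throwaway models:
import numpy as np
import tensorflow as tf

def make_net():
    return tf.keras.Sequential([tf.keras.layers.Dense(8, activation="relu",
                                                      input_shape=(4,)),
                                tf.keras.layers.Dense(2)])

online, target = make_net(), make_net()
target.set_weights(online.get_weights())      # hard update: target <- online

x = np.random.rand(1, 4).astype(np.float32)
print(np.allclose(online(x), target(x)))      # True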
class DQNAgent:
    def __init__(self):
        # other hyperparameters
        self.save_graph = True
        self.isTraining = True
        self.keepTraining = False
        self.play = False
        self.render = False
        self.save_model = True
        self.load_model = False
        self.random = False
        self.dueling = True

        # epsilon greedy exploration
        self.initial_epsilon = 1.0
        self.epsilon = self.initial_epsilon
        self.min_epsilon = 0.01
        self.linear_annealed = (self.initial_epsilon - self.min_epsilon) / 2000
        self.decay_rate = 0.995

        # check the hyperparameters
        if self.random:
            self.play = False
            self.isTraining = False
        if self.play:
            self.render = True
            self.save_model = False
            self.load_model = True
            self.isTraining = False
            self.keepTraining = False
        if self.keepTraining:
            self.epsilon = self.min_epsilon
            self.load_model = True

        # fixed q value - two networks
        self.learning_rate = 0.0001
        self.fixed_q_value_steps = 100
        self.target_network_counter = 0

        # n-step learning
        self.n_step = 3
        self.n_step_buffer = deque(maxlen=self.n_step)

        # experience replay using a SumTree: combine the agent with PER
        self.batch_size = 64
        self.gamma = 0.9
        self.replay_start_size = 320
        self.PER_e = 0.01  # epsilon -> p_i = |delta_i| + epsilon, so zero-error transitions still get a chance to be sampled
        self.PER_a = 0.6   # P(i) = p(i) ** a / total_priority ** a
        self.PER_b = 0.4
        self.PER_b_increment = 0.005
        self.absolute_error_upper = 1.  # clipped error
        self.experience_number = 0

        env_dict = {
            "obs": {"shape": (state_size,)},
            "act": {},
            "rew": {},
            "next_obs": {"shape": (state_size,)},
            "done": {}
        }
        self.experience_replay = PrioritizedReplayBuffer(memory_size,
                                                         env_dict=env_dict,
                                                         alpha=self.PER_a,
                                                         eps=self.PER_e)
        # initially, p1=1 and total_priority=1, so P(1)=1, w1=batchsize**beta

        if self.load_model:
            self.model = keras.models.load_model('cartpole_nstep.h5')
            self.target_model = keras.models.load_model('cartpole_nstep.h5')
        else:
            self.model = self.create_model()
            self.target_model = self.create_model()

    # n-step learning: get the truncated n-step return
    def get_n_step_info(self, n_step_buffer, gamma):
        """Return the n-step reward, next state, and done flag."""
        # info of the last transition
        reward, next_state, done = n_step_buffer[-1][-3:]

        for transition in reversed(list(n_step_buffer)[:-1]):
            r, n_s, d = transition[-3:]
            reward = r + gamma * reward * (1 - d)
            next_state, done = (n_s, d) if d else (next_state, done)

        return reward, next_state, done

    def store(self, experience):
        self.n_step_buffer.append(experience)
        if len(self.n_step_buffer) == self.n_step:
            reward, next_state, done = self.get_n_step_info(
                self.n_step_buffer, self.gamma)
            state, action = self.n_step_buffer[0][:2]
            self.experience_replay.add(obs=state, act=action, rew=reward,
                                       next_obs=next_state, done=done)

    def create_model(self):
        inputs = tf.keras.Input(shape=(state_size,))
        fc1 = tf.keras.layers.Dense(128, activation='relu')(inputs)
        fc2 = tf.keras.layers.Dense(128, activation='relu')(fc1)
        advantage_output = tf.keras.layers.Dense(action_size, activation='linear')(fc2)

        value_out = tf.keras.layers.Dense(1, activation='linear')(fc2)
        norm_advantage_output = tf.keras.layers.Lambda(
            lambda x: x - tf.reduce_mean(x))(advantage_output)
        outputs = tf.keras.layers.Add()([value_out, norm_advantage_output])

        model = tf.keras.Model(inputs, outputs)
        model.compile(optimizer=tf.keras.optimizers.Adam(self.learning_rate),
                      loss=tf.keras.losses.MeanSquaredError(),
                      metrics=['accuracy'])
        model.summary()
        return model

    def train(self):
        if self.experience_replay.get_stored_size() > self.batch_size:
            samples = self.experience_replay.sample(self.batch_size)
            td_errors, loss = self._train_body(samples)
            self.experience_replay.update_priorities(samples["indexes"],
                                                     td_errors.numpy() + 1e-6)

    @tf.function
    def _train_body(self, samples):
        with tf.GradientTape() as tape:
            td_errors = self._compute_td_error_body(samples["obs"],
                                                    samples["act"],
                                                    samples["rew"],
                                                    samples["next_obs"],
                                                    samples["done"])
            loss = tf.reduce_mean(tf.square(td_errors))  # a Huber loss seems to make no difference here

        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.model.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables))
        return td_errors, loss

    @tf.function
    def _compute_td_error_body(self, states, actions, rewards, next_states, dones):
        rewards = tf.cast(tf.squeeze(rewards), dtype=tf.float32)
        dones = tf.cast(tf.squeeze(dones), dtype=tf.bool)
        actions = tf.cast(actions, dtype=tf.int32)  # (batch_size, 1)
        batch_size_range = tf.expand_dims(tf.range(self.batch_size), axis=1)  # (batch_size, 1)

        # get the current q value
        current_q_indexes = tf.concat(values=(batch_size_range, actions), axis=1)  # (batch_size, 2)
        current_q = tf.gather_nd(self.model(states), current_q_indexes)  # (batch_size, )

        # get the target q value using double DQN
        max_next_q_indexes = tf.argmax(self.model(next_states),
                                       axis=1, output_type=tf.int32)  # (batch_size, )
        indexes = tf.concat(values=(batch_size_range,
                                    tf.expand_dims(max_next_q_indexes, axis=1)),
                            axis=1)  # (batch_size, 2)
        target_q = tf.gather_nd(self.target_model(next_states), indexes)  # (batch_size, )
        target_q = tf.where(dones, rewards, rewards + self.gamma * target_q)  # (batch_size, )

        # tf.stop_gradient() keeps backpropagation from changing the target network
        # weights, though it seems to make no difference here
        td_errors = tf.abs(current_q - tf.stop_gradient(target_q))
        return td_errors

    def select_action(self, state):
        self.target_network_counter += 1
        if self.target_network_counter % self.fixed_q_value_steps == 0:
            self.target_model.set_weights(self.model.get_weights())
        self.epsilon = max(self.epsilon - self.linear_annealed, self.min_epsilon)
        if np.random.sample() <= self.epsilon:
            return np.random.randint(action_size)
        return self._get_action_body(state).numpy()

    @tf.function
    def _get_action_body(self, state):
        state = tf.expand_dims(state, axis=0)
        qvalues = self.model(state)[0]
        return tf.argmax(qvalues)
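# DQNAgent.get_n_step_info above folds the n-step buffer from the back, cutting the
# return off at the first done flag. A standalone check of that recursion with
# made-up transitions (the helper name n_step_return is mine):
from collections import deque

def n_step_return(transitions, gamma):
    """transitions: iterable of (state, action, reward, next_state, done)."""
    reward, next_state, done = transitions[-1][-3:]
    for _, _, r, n_s, d in reversed(list(transitions)[:-1]):
        reward = r + gamma * reward * (1 - d)
        next_state, done = (n_s, d) if d else (next_state, done)
    return reward, next_state, done

buf = deque(maxlen=3)
buf.extend([("s0", 0, 1.0, "s1", 0),
            ("s1", 1, 2.0, "s2", 0),
            ("s2", 0, 4.0, "s3", 0)])
print(n_step_return(buf, gamma=0.9))  # (1.0 + 0.9*2.0 + 0.81*4.0, 's3', 0) = (6.04, 's3', 0)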
tf.constant(sample["rew"].ravel()), tf.constant(sample["done"].ravel()), discount, tf.constant(env.action_space.n)) absTD = tf.math.abs(target_Q - Q) loss = tf.reduce_mean(loss_func(absTD)) grad = tape.gradient(loss,model.trainable_weights) optimizer.apply_gradients(zip(grad,model.trainable_weights)) tf.summary.scalar("Loss vs training step", data=loss, step=n_step) Q = Q_func(model, tf.constant(sample["obs"]), tf.constant(sample["act"].ravel()), tf.constant(env.action_space.n)) absTD = tf.math.abs(target_Q - Q) rb.update_priorities(sample["indexes"],tf.math.maximum(absTD,tf.constant(1.0))) if done: env.reset() rb.on_episode_end() n_episode += 1 if n_step % target_update_freq == 0: target_model.set_weights(model.get_weights()) if n_step % eval_freq == eval_freq-1: eval_rew = evaluate(model,eval_env) tf.summary.scalar("episode reward vs training step",data=eval_rew,step=n_step)
class ReplayBuffer():
    def __init__(self, args):
        # self.memory = deque(maxlen=args.buffer_size)
        self.memory = PrioritizedReplayBuffer(args.buffer_size,
                                              {"obs": {"shape": (64, 64, 6)},
                                               "act": {},
                                               "rew": {},
                                               "next_obs": {"shape": (64, 64, 6)},
                                               "terminal": {}})
        # self.priority = deque(maxlen=args.buffer_size)

        self.length = 0
        self.args = args

    def load_queues(self, queues, q_network, target_network, lock, args):
        for q in queues:
            for i in range(int(q.qsize())):
                # Read from the queue -- the critical section begins
                lock.acquire()
                data = queue_to_data(q.get())
                lock.release()

                # Convert to numpy for storage
                state = data[0].numpy()
                action = data[1].numpy()
                reward = data[2].numpy()
                next_state = data[3].numpy()
                terminal = data[4].numpy()

                # data_np = (state, action, reward, next_state, terminal)

                # Push to the buffer
                # self.memory.append(data_np)
                self.memory.add(obs=state,
                                act=action,
                                rew=reward,
                                next_obs=next_state,
                                terminal=terminal)

                self.length = min(self.args.buffer_size, self.length + 1)

    def prepare_batch(self, target_network, q_network):
        batch_size = min(self.length, self.args.batch_size)

        sample = self.memory.sample(batch_size)

        s = t.tensor(sample['obs'])
        a = t.tensor(sample['act'])
        r = t.tensor(sample['rew'])
        ns = t.tensor(sample['next_obs'])
        term = t.tensor(sample['terminal'])

        states = s.permute(0, 3, 1, 2).to(Device.get_device())
        actions = a.type(t.int64).to(Device.get_device())
        rewards = r.to(Device.get_device())
        next_states = ns.permute(0, 3, 1, 2).to(Device.get_device())
        terminals = term.to(Device.get_device())

        indexes = sample["indexes"]

        with t.no_grad():
            # Bootstrap with the per-sample maximum target Q-value
            # (a plain .max() would reduce over the whole batch)
            target = rewards + terminals * self.args.gamma * \
                target_network(next_states).max(1, keepdim=True)[0]
            predicted = q_network(states).gather(1, actions)

        new_priorities = f.smooth_l1_loss(predicted, target,
                                          reduction='none').cpu().numpy()
        new_priorities[new_priorities < 1] = 1
        self.memory.update_priorities(indexes, new_priorities)

        return states, actions, rewards, next_states, terminals

    def __len__(self):
        return self.length
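# Standalone sketch of the priority rule used in prepare_batch above: element-wise
# smooth L1 between predicted and target Q-values, floored at 1 before being written
# back to the buffer. The numbers are arbitrary.
import torch
import torch.nn.functional as F

predicted = torch.tensor([[0.2], [3.0], [5.5]])
target = torch.tensor([[0.1], [0.5], [5.4]])

priorities = F.smooth_l1_loss(predicted, target, reduction='none').numpy()
priorities[priorities < 1] = 1   # small errors all get the same base priority
print(priorities.ravel())        # e.g. [1., 2., 1.]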