import random

import numpy as np


def test_per(capacity):
    # test the implementation of the prioritized replay buffer
    p_buffer = PrioritizedReplayBuffer(capacity)

    # populate the buffer to half capacity
    for _ in range(capacity // 2):
        p_buffer.add(Experience())

    # update priorities for batches of experience
    n_batches = 10
    batch_size = 100
    for _ in range(n_batches):
        # sample batch_size distinct leaf (tree) indices
        idx = random.sample(range(capacity - 1, 2 * capacity - 1), batch_size)
        td_errors = np.random.uniform(0, 10, batch_size)
        p_buffer.batch_update(idx, td_errors)
        # the cached maximum priority must match the largest leaf value
        assert p_buffer.tree.max_priority == np.max(p_buffer.tree.tree[-capacity:])

    # test sampling
    for _ in range(10):
        p_buffer.sample(batch_size)
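
# The index range used above comes from the flat-array layout of a binary
# sum tree: with `capacity` leaves there are 2 * capacity - 1 nodes in total,
# internal nodes occupy [0, capacity - 1), and the leaves that hold the
# priorities occupy [capacity - 1, 2 * capacity - 1). A minimal sketch of
# that layout, assuming the buffer's tree follows this common scheme
# (SumTreeSketch is a hypothetical illustration, not the class under test):
class SumTreeSketch:
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)  # internal nodes, then leaves

    def update(self, leaf_idx, priority):
        # write the leaf, then propagate the change up to the root
        change = priority - self.tree[leaf_idx]
        self.tree[leaf_idx] = priority
        while leaf_idx != 0:
            leaf_idx = (leaf_idx - 1) // 2  # parent index
            self.tree[leaf_idx] += change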
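
# The training step below scales the squared TD error by importance
# sampling weights. For reference, PER assigns each transition a priority
#   p_i = (|delta_i| + eps) ** alpha,   P(i) = p_i / sum_k p_k,
# and corrects the resulting sampling bias with
#   w_i = (N * P(i)) ** (-b) / max_j w_j,
# where b is annealed toward 1 over training (here via increment_b()).
# A hypothetical standalone sketch of that computation; eps and alpha are
# assumed hyperparameters, and in the real buffer P(i) is normalized over
# all stored priorities, not just the sampled batch:
def per_priorities_and_weights(td_errors, n_total, b, alpha=0.6, eps=1e-2):
    priorities = (np.abs(td_errors) + eps) ** alpha
    probs = priorities / priorities.sum()
    weights = (n_total * probs) ** (-b)
    return priorities, weights / weights.max()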
    Loss = weights * MSE
    '''
    # compute the MSE adjusted by the importance sampling weights
    # and backprop
    weights = torch.tensor(weights, dtype=torch.float32)
    loss = torch.mean(weights * torch.pow(td_loss, 2))
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
    opt.step()
    opt.zero_grad()

    # update the priorities of the sampled experiences
    # with their new absolute TD errors
    exp_replay.batch_update(b_idx, np.abs(td_loss.detach().cpu().numpy()))

    # gradually anneal the importance sampling hyperparameter b toward 1
    exp_replay.increment_b()

    if step % loss_freq == 0:
        # log the MSE without importance sampling weights
        loss = torch.mean(torch.pow(td_loss, 2))
        td_loss_history.append(loss.cpu().item())

    if step % refresh_target_network_freq == 0:
        # sync the target network with the online agent
        target_network.load_state_dict(agent.state_dict())

    if step % eval_freq == 0:
        mean_rw_history.append(
            evaluate(make_env(clip_rewards=True, seed=step),