def _rainbow(env, writer=DummyWriter()):
    # lr, atoms, sigma, v_min, v_max, alpha, beta, n_steps, discount_factor,
    # minibatch_size, replay_start_size, update_frequency, replay_buffer_size,
    # model_constructor, and device are free variables resolved from the
    # enclosing preset builder.
    model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        writer=writer,
    )
    # replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    replay_buffer = NStepReplayBuffer(n_steps, discount_factor, replay_buffer)
    return Rainbow(
        q,
        replay_buffer,
        # epsilon-greedy exploration is disabled; the noisy layers (sigma) provide exploration
        exploration=0.,
        discount_factor=discount_factor ** n_steps,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        writer=writer,
    )
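# A minimal standalone sketch (not part of the preset above) of why Rainbow
# receives discount_factor ** n_steps: the NStepReplayBuffer already folds the
# per-step discount into each stored n-step reward, so only the bootstrap value
# still needs discounting, by gamma ** n. The numbers below are illustrative only.
example_gamma = 0.99
example_n_steps = 3
window_rewards = [1.0, 2.0, 3.0]  # rewards observed across one n-step window
n_step_reward = sum(example_gamma ** k * r for k, r in enumerate(window_rewards))
bootstrap_discount = example_gamma ** example_n_steps  # applied to the value of the state n steps ahead
print(n_step_reward, bootstrap_discount)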
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']

    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )

    q_dist = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer,
    )

    replay_buffer = NStepReplayBuffer(
        self.hyperparameters['n_steps'],
        self.hyperparameters['discount_factor'],
        PrioritizedReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            alpha=self.hyperparameters['alpha'],
            beta=self.hyperparameters['beta'],
            device=self.device
        )
    )

    def agent_constructor(writer):
        # Every sub-agent wraps the same q_dist and replay_buffer, so the agents
        # act independently but share parameters and experience.
        return DeepmindAtariBody(
            Rainbow(
                q_dist,
                replay_buffer,
                exploration=LinearScheduler(
                    self.hyperparameters['initial_exploration'],
                    self.hyperparameters['final_exploration'],
                    0,
                    train_steps - self.hyperparameters['replay_start_size'],
                    name="exploration",
                    writer=writer
                ),
                discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters['n_steps'],
                minibatch_size=self.hyperparameters['minibatch_size'],
                replay_start_size=self.hyperparameters['replay_start_size'],
                update_frequency=self.hyperparameters['update_frequency'],
                writer=writer,
            ),
            lazy_frames=True,
            episodic_lives=True
        )

    # env, writers, and device are expected to be defined elsewhere in the
    # enclosing preset; they are not created in this snippet.
    return MultiagentEncoder(IndependentMultiagent({
        agent: agent_constructor(writers[agent])
        for agent in env.agents
    }), env.agents, device)
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )

    q_dist = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer,
    )

    replay_buffer = NStepReplayBuffer(
        self.hyperparameters['n_steps'],
        self.hyperparameters['discount_factor'],
        PrioritizedReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            alpha=self.hyperparameters['alpha'],
            beta=self.hyperparameters['beta'],
            device=self.device
        )
    )

    return Rainbow(
        q_dist,
        replay_buffer,
        exploration=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            0,
            train_steps - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        ),
        discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters['n_steps'],
        minibatch_size=self.hyperparameters['minibatch_size'],
        replay_start_size=self.hyperparameters['replay_start_size'],
        update_frequency=self.hyperparameters['update_frequency'],
        writer=writer,
    )
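# A rough, standalone illustration of the exploration schedule configured above:
# epsilon decays linearly from initial_exploration to final_exploration over the
# steps that follow replay_start_size. This is not the library's LinearScheduler
# implementation; the constants below are made up for the example.
def epsilon_at(step, initial=1.0, final=0.02, decay_steps=250_000):
    fraction = min(max(step, 0) / decay_steps, 1.0)
    return initial + fraction * (final - initial)

print(epsilon_at(0), epsilon_at(125_000), epsilon_at(1_000_000))  # 1.0, 0.51, 0.02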
def _rainbow(env, writer=DummyWriter()):
    # Convert the frame budget into agent timesteps (each agent step covers 4
    # environment frames under DeepMind-style frame skipping), then into
    # optimizer updates for the learning-rate schedule.
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency

    model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        writer=writer,
    )
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    replay_buffer = NStepReplayBuffer(n_steps, discount_factor, replay_buffer)
    agent = Rainbow(
        q,
        replay_buffer,
        exploration=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            last_timestep,
            name='exploration',
            writer=writer
        ),
        discount_factor=discount_factor ** n_steps,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        writer=writer,
    )
    return DeepmindAtariBody(agent, lazy_frames=True, episodic_lives=True)
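# Worked example of the schedule horizon computed above, using Atari-scale
# values that are assumed here for illustration (they are not taken from this
# preset): a 40M-frame budget, action repeat of 4, an 80k-step replay warmup,
# and one update every 4 steps.
last_frame = 40_000_000
action_repeat = 4
replay_start_size = 80_000
update_frequency = 4
last_timestep = last_frame / action_repeat                             # 10,000,000 agent steps
last_update = (last_timestep - replay_start_size) / update_frequency   # 2,480,000 optimizer updates
print(last_timestep, last_update)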
# Imports assume the autonomous-learning-library module layout this test was
# written against (State exposing .raw and .mask, buffers in all.memory).
import random
import unittest
import numpy as np
import torch
import torch_testing as tt
from all.environments import State
from all.memory import ExperienceReplayBuffer, NStepReplayBuffer


class TestNStepReplayBuffer(unittest.TestCase):
    def setUp(self):
        np.random.seed(1)
        random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = NStepReplayBuffer(4, 0.5, ExperienceReplayBuffer(100))

    def test_run(self):
        states = State(torch.arange(0, 20))
        actions = torch.arange(0, 20)
        rewards = torch.arange(0, 20).float()

        # The buffer holds transitions back until a full n-step window exists.
        for i in range(3):
            self.replay_buffer.store(states[i], actions[i], rewards[i], states[i + 1])
            self.assertEqual(len(self.replay_buffer), 0)

        for i in range(3, 6):
            self.replay_buffer.store(states[i], actions[i], rewards[i], states[i + 1])
            self.assertEqual(len(self.replay_buffer), i - 2)

        sample = self.replay_buffer.buffer.buffer[0]
        self.assert_states_equal(sample[0], states[0])
        tt.assert_equal(sample[1], actions[0])
        tt.assert_equal(sample[2], torch.tensor(0 + 1 * 0.5 + 2 * 0.25 + 3 * 0.125))
        tt.assert_equal(
            self.replay_buffer.buffer.buffer[1][2],
            torch.tensor(1 + 2 * 0.5 + 3 * 0.25 + 4 * 0.125),
        )

    def test_done(self):
        state = State(torch.tensor([1]))
        action = torch.tensor(0)
        done_state = State(torch.tensor([1]), mask=torch.tensor([0]))

        # A terminal next state flushes the pending window immediately.
        self.replay_buffer.store(state, action, 1, done_state)
        self.assertEqual(len(self.replay_buffer), 1)
        sample = self.replay_buffer.buffer.buffer[0]
        self.assert_states_equal(state, sample[0])
        self.assertEqual(sample[2], 1)

        self.replay_buffer.store(state, action, 1, state)
        self.replay_buffer.store(state, action, 1, state)
        self.assertEqual(len(self.replay_buffer), 1)

        self.replay_buffer.store(state, action, 1, done_state)
        self.assertEqual(len(self.replay_buffer), 4)
        sample = self.replay_buffer.buffer.buffer[1]
        self.assert_states_equal(sample[0], state)
        self.assertEqual(sample[2], 1.75)
        self.assert_states_equal(sample[3], done_state)

        self.replay_buffer.store(state, action, 1, done_state)
        self.assertEqual(len(self.replay_buffer), 5)
        sample = self.replay_buffer.buffer.buffer[0]
        self.assert_states_equal(state, sample[0])
        self.assertEqual(sample[2], 1)

    def assert_states_equal(self, actual, expected):
        tt.assert_almost_equal(actual.raw, expected.raw)
        tt.assert_equal(actual.mask, expected.mask)
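# Quick standalone check of the n-step returns asserted in test_run: with a
# discount of 0.5 and n_steps=4, the first stored transition's reward should be
# 0 + 1*0.5 + 2*0.25 + 3*0.125 and the second 1 + 2*0.5 + 3*0.25 + 4*0.125.
# This helper is illustrative only and not part of the test suite.
def n_step_return(rewards, discount):
    return sum(discount ** k * r for k, r in enumerate(rewards))

print(n_step_return([0, 1, 2, 3], 0.5))  # 1.375
print(n_step_return([1, 2, 3, 4], 0.5))  # 3.25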