def _dqn(env, writer=DummyWriter()):
    _model = nature_dqn(env).to(device)
    _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
    # Q-network whose target copy is refreshed every target_update_frequency updates
    q = QNetwork(
        _model,
        _optimizer,
        env.action_space.n,
        target=FixedTarget(target_update_frequency),
        loss=smooth_l1_loss,
        writer=writer
    )
    # epsilon-greedy policy with linearly annealed exploration
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'])
    q = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return C51(
        q,
        replay_buffer,
        exploration=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            0,
            self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"],
            name="epsilon",
            writer=writer,
        ),
        discount_factor=self.hyperparameters["discount_factor"],
        minibatch_size=self.hyperparameters["minibatch_size"],
        replay_start_size=self.hyperparameters["replay_start_size"],
        update_frequency=self.hyperparameters["update_frequency"],
        writer=writer
    )
def _ddqn(env, writer=DummyWriter()):
    action_repeat = 1
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = SharedAutonomyPolicy(q, env.action_space.n, epsilon=0, pilot_tol=pilot_tol)
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            replay_buffer_size,
            alpha=alpha,
            beta=beta,
            device=device
        )
    else:
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return co_DDQN(
        q,
        policy,
        replay_buffer,
        loss=weighted_smooth_l1_loss,
        discount_factor=discount_factor,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency
    )
def test_target(self):
    self.policy = DeterministicPolicy(self.model, self.optimizer, self.space, target=FixedTarget(3))
    state = State(torch.ones(1, STATE_DIM))

    # run update step, make sure target network doesn't change
    self.policy(state).sum().backward()
    self.policy.step()
    tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

    # again...
    self.policy(state).sum().backward()
    self.policy.step()
    tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

    # third time, target should be updated
    self.policy(state).sum().backward()
    self.policy.step()
    tt.assert_allclose(
        self.policy.target(state),
        torch.tensor([[-0.574482, -0.574482, -0.574482]]),
        atol=1e-4,
    )
def test_target(self):
    self.policy = DeterministicPolicy(
        self.model,
        self.optimizer,
        self.space,
        target=FixedTarget(3)
    )

    # choose initial action
    state = State(torch.ones(1, STATE_DIM))
    action = self.policy(state)
    tt.assert_equal(action, torch.zeros(1, ACTION_DIM))

    # run update step, make sure target network doesn't change
    action.sum().backward(retain_graph=True)
    self.policy.step()
    tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

    # again...
    action.sum().backward(retain_graph=True)
    self.policy.step()
    tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

    # third time, target should be updated
    action.sum().backward(retain_graph=True)
    self.policy.step()
    tt.assert_allclose(
        self.policy.eval(state),
        torch.tensor([[-0.595883, -0.595883, -0.595883]]),
        atol=1e-4,
    )
def _ddqn(env, writer=DummyWriter()):
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    return DDQN(
        q,
        policy,
        replay_buffer,
        discount_factor=discount_factor,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'])
    q = QNetwork(
        self.model,
        optimizer,
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DQN(
        q,
        policy,
        replay_buffer,
        discount_factor=self.hyperparameters['discount_factor'],
        minibatch_size=self.hyperparameters['minibatch_size'],
        replay_start_size=self.hyperparameters['replay_start_size'],
        update_frequency=self.hyperparameters['update_frequency'],
    )
def _dqn(env, writer=DummyWriter()):
    model = fc_relu_q(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        target=FixedTarget(target_update_frequency),
        loss=mse_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DQN(
        q,
        policy,
        replay_buffer,
        discount_factor=discount_factor,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q_dist = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer,
    )
    replay_buffer = NStepReplayBuffer(
        self.hyperparameters['n_steps'],
        self.hyperparameters['discount_factor'],
        PrioritizedReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            alpha=self.hyperparameters['alpha'],
            beta=self.hyperparameters['beta'],
            device=self.device
        )
    )

    def agent_constructor(writer):
        return DeepmindAtariBody(
            Rainbow(
                q_dist,
                replay_buffer,
                exploration=LinearScheduler(
                    self.hyperparameters['initial_exploration'],
                    self.hyperparameters['final_exploration'],
                    0,
                    train_steps - self.hyperparameters['replay_start_size'],
                    name="exploration",
                    writer=writer
                ),
                discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
                minibatch_size=self.hyperparameters['minibatch_size'],
                replay_start_size=self.hyperparameters['replay_start_size'],
                update_frequency=self.hyperparameters['update_frequency'],
                writer=writer,
            ),
            lazy_frames=True,
            episodic_lives=True
        )

    return MultiagentEncoder(IndependentMultiagent({
        agent: agent_constructor(writers[agent])
        for agent in env.agents
    }), env.agents, device)
def _dqn(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    model = nature_dqn(env).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = QNetwork(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_step - replay_start_size,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        replay_buffer_size,
        device=device
    )
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            loss=smooth_l1_loss,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
        lazy_frames=True
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QNetwork(
        self.model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=self.hyperparameters['discount_factor'],
            loss=smooth_l1_loss,
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
        ),
        lazy_frames=True
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DeepmindAtariBody(
        C51(
            q,
            replay_buffer,
            exploration=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"],
                name="epsilon",
                writer=writer,
            ),
            discount_factor=self.hyperparameters["discount_factor"],
            minibatch_size=self.hyperparameters["minibatch_size"],
            replay_start_size=self.hyperparameters["replay_start_size"],
            update_frequency=self.hyperparameters["update_frequency"],
            writer=writer
        ),
        lazy_frames=True,
        episodic_lives=True
    )
def _c51(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    model = nature_c51(env, atoms=atoms).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        scheduler=CosineAnnealingLR(optimizer, last_update),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        replay_buffer_size,
        device=device
    )
    return DeepmindAtariBody(
        C51(
            q,
            replay_buffer,
            exploration=LinearScheduler(
                initial_exploration,
                final_exploration,
                0,
                last_timestep,
                name="epsilon",
                writer=writer,
            ),
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
            writer=writer
        ),
        lazy_frames=True
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q_dist = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer,
    )
    replay_buffer = NStepReplayBuffer(
        self.hyperparameters['n_steps'],
        self.hyperparameters['discount_factor'],
        PrioritizedReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            alpha=self.hyperparameters['alpha'],
            beta=self.hyperparameters['beta'],
            device=self.device
        )
    )
    return Rainbow(
        q_dist,
        replay_buffer,
        exploration=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            0,
            train_steps - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        ),
        discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
        minibatch_size=self.hyperparameters['minibatch_size'],
        replay_start_size=self.hyperparameters['replay_start_size'],
        update_frequency=self.hyperparameters['update_frequency'],
        writer=writer,
    )
def __init__(self, policy, logger, out_dim, device="cpu"):
    self.hyperparameters = hyperparameters = default_hyperparameters
    self.policy = policy
    self.model = policy.model
    self.device = device
    self.logger = logger
    self.discount_factor = hyperparameters['discount_factor']
    self.out_dim = out_dim
    writer = DummyWriter()
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'])
    self.q = q = QNetwork(
        self.model,
        optimizer,
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
def _rainbow(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        writer=writer,
    )
    # prioritized replay wrapped in an n-step transition buffer
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    replay_buffer = NStepReplayBuffer(n_steps, discount_factor, replay_buffer)
    agent = Rainbow(
        q,
        replay_buffer,
        exploration=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            last_timestep,
            name='exploration',
            writer=writer
        ),
        discount_factor=discount_factor ** n_steps,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        writer=writer,
    )
    return DeepmindAtariBody(agent, lazy_frames=True, episodic_lives=True)
def test_target_net(self):
    torch.manual_seed(2)
    model = nn.Sequential(nn.Linear(1, 1))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    q = QNetwork(model, optimizer, 1, loss=smooth_l1_loss, target=FixedTarget(3))
    inputs = State(torch.tensor([1.]))
    errors = torch.tensor([-1.])

    # before any updates, the policy and target networks agree
    policy_value = q(inputs).item()
    target_value = q.target(inputs).item()
    np.testing.assert_equal(policy_value, -0.008584141731262207)
    np.testing.assert_equal(target_value, -0.008584141731262207)

    # first update: the policy network moves but the target stays fixed
    q.reinforce(errors)
    policy_value = q(inputs).item()
    target_value = q.target(inputs).item()
    np.testing.assert_equal(policy_value, -0.20858412981033325)
    np.testing.assert_equal(target_value, -0.008584141731262207)

    # second update: the target is still fixed
    q.reinforce(errors)
    policy_value = q(inputs).item()
    target_value = q.target(inputs).item()
    np.testing.assert_equal(policy_value, -0.4085841178894043)
    np.testing.assert_equal(target_value, -0.008584141731262207)

    # third update: the target is synchronized with the policy network
    q.reinforce(errors)
    policy_value = q(inputs).item()
    target_value = q.target(inputs).item()
    np.testing.assert_equal(policy_value, -0.6085841655731201)
    np.testing.assert_equal(target_value, -0.6085841655731201)

    # fourth update: the target holds the value from the last synchronization
    q.reinforce(errors)
    policy_value = q(inputs).item()
    target_value = q.target(inputs).item()
    np.testing.assert_equal(policy_value, -0.8085841536521912)
    np.testing.assert_equal(target_value, -0.6085841655731201)
def _c51(env, writer=DummyWriter()):
    model = nature_c51(env, atoms=51).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        replay_buffer_size,
        device=device
    )
    return DeepmindAtariBody(
        C51(
            q,
            replay_buffer,
            exploration=LinearScheduler(
                initial_exploration,
                final_exploration,
                replay_start_size,
                final_exploration_frame,
                name="epsilon",
                writer=writer,
            ),
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
            writer=writer
        )
    )
def _dqn(env, writers=None):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    n_agents = len(env.agents)
    n_actions = env.action_spaces['first_0'].n
    model = model_constructor(env).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    # a single Q-function and replay buffer are shared across all agents
    q = Approximation(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writers['first_0']
    )
    replay_buffer = ExperienceReplayBuffer(
        replay_buffer_size,
        store_device=device,
        device=device
    )

    def agent_constructor(writer):
        policy = GreedyPolicy(
            q,
            n_actions,
            epsilon=LinearScheduler(
                initial_exploration,
                final_exploration,
                replay_start_size,
                final_exploration_step - replay_start_size,
                name="epsilon",
                writer=writer
            )
        )
        return DeepmindAtariBody(
            DQN(
                q,
                policy,
                replay_buffer,
                discount_factor=discount_factor,
                loss=smooth_l1_loss,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
            ),
            lazy_frames=True
        )

    return MultiagentEncoder(IndependentMultiagent({
        agent: agent_constructor(writers[agent])
        for agent in env.agents
    }), env.agents, device)
def _sac(env, writer=DummyWriter()):
    final_anneal_step = (last_frame - replay_start_size) // update_frequency
    v_model = v_model_constructor(env).to(device)
    q_1_model = q1_model_constructor(env).to(device)
    q_2_model = q2_model_constructor(env).to(device)
    # quick and dirty implementation of parallel branch un/freeze
    policy_model = policy_model_constructor(env=env, train_parallel=train_parallel).to(device)
    if pretrained_models is not None:
        q_1_model = pretrained_models.q_1.model.to(device)
        q_2_model = pretrained_models.q_2.model.to(device)
        v_model = pretrained_models.v.model.to(device)
        policy_model = pretrained_models.policy.model.to(device)
    q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q)
    q_1 = QContinuousCtrlRep(
        q_1_model,
        q_1_optimizer,
        scheduler=CosineAnnealingLR(q_1_optimizer, final_anneal_step),
        target=FixedTarget(1000),
        writer=writer,
        name='q_1'
    )
    q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q)
    q_2 = QContinuousCtrlRep(
        q_2_model,
        q_2_optimizer,
        scheduler=CosineAnnealingLR(q_2_optimizer, final_anneal_step),
        target=FixedTarget(1000),
        writer=writer,
        name='q_2'
    )
    v_optimizer = Adam(v_model.parameters(), lr=lr_v)
    v = VNetworkCtrlRep(
        v_model,
        v_optimizer,
        scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
        target=PolyakTarget(polyak_rate),
        writer=writer,
        name='v',
    )
    policy_optimizer = Adam(filter(lambda p: p.requires_grad, policy_model.parameters()), lr=lr_pi)
    policy = SoftDeterministicPolicyCtrlRep(
        policy_model,
        policy_optimizer,
        env.action_space,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        target=FixedTarget(1000),
        writer=writer
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return TimeFeature(
        SACCtrlRep(
            policy=policy,
            q_1=q_1,
            q_2=q_2,
            v=v,
            replay_buffer=replay_buffer,
            temperature_initial=temperature_initial,
            entropy_target=(-env.action_space.shape[0] * entropy_target_scaling),
            lr_temperature=lr_temperature,
            replay_start_size=replay_start_size,
            discount_factor=discount_factor,
            update_frequency=update_frequency,
            minibatch_size=minibatch_size,
            writer=writer
        )
    )
def setUp(self):
    self.model = Identity('cpu', target=FixedTarget(10))
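# A minimal sketch of the fixed-target pattern that every snippet above relies
# on: the approximator keeps a frozen copy of the online network and overwrites
# it with the online weights every `update_frequency` updates. This is an
# illustrative assumption, not the library's FixedTarget source; the class and
# method names below (SimpleFixedTarget, update) are invented for the example.
import copy

import torch


class SimpleFixedTarget:
    def __init__(self, model, update_frequency):
        self._model = model
        self._target = copy.deepcopy(model)
        self._update_frequency = update_frequency
        self._updates = 0

    def __call__(self, *inputs):
        # evaluate with the frozen copy, without tracking gradients
        with torch.no_grad():
            return self._target(*inputs)

    def update(self):
        # called once per optimization step; periodically sync the frozen copy
        self._updates += 1
        if self._updates % self._update_frequency == 0:
            self._target.load_state_dict(self._model.state_dict())


# usage sketch: the target lags the online network until the third update,
# mirroring the behaviour asserted in the FixedTarget(3) tests above
net = torch.nn.Linear(1, 1)
target = SimpleFixedTarget(net, update_frequency=3)
for _ in range(3):
    net.weight.data += 1.0  # stand-in for an optimizer step
    target.update()         # the copy is refreshed only on the third call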