def _ddqn(env, writer=DummyWriter()):
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    return DDQN(q, policy, replay_buffer,
                discount_factor=discount_factor,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                minibatch_size=minibatch_size)
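# DDQN differs from DQN only in how the bootstrap target is formed: the
# online network selects the greedy next action, while the fixed target
# network evaluates it. A minimal sketch of that computation (illustrative,
# assuming batched tensors; not the library's internal code):
def _double_q_targets(q_values_next, target_values_next, rewards, discount_factor):
    # online net picks the action, target net supplies its value
    best_actions = q_values_next.argmax(dim=1, keepdim=True)
    next_values = target_values_next.gather(1, best_actions).squeeze(1)
    return rewards + discount_factor * next_values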
def _vqn(envs, writer=DummyWriter()):
    env = envs[0]
    model = fc_relu_q(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(model, optimizer, writer=writer)
    policy = GreedyPolicy(q, env.action_space.n, epsilon=epsilon)
    return VQN(q, policy, discount_factor=discount_factor)
def _model_predictive_dqn(env, writer=None):
    # models
    feature_model = shared_feature_layers().to(device)
    value_model = value_head().to(device)
    reward_model = reward_head(env).to(device)
    generator_model = Generator(env).to(device)
    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    reward_optimizer = Adam(reward_model.parameters(), lr=lr, eps=eps)
    generator_optimizer = Adam(generator_model.parameters(), lr=lr, eps=eps)
    # approximators
    f = FeatureNetwork(feature_model, feature_optimizer, writer=writer)
    v = VNetwork(value_model, value_optimizer, writer=writer)
    r = QNetwork(reward_model, reward_optimizer, name='reward', writer=writer)
    g = Approximation(generator_model, generator_optimizer, name='generator', writer=writer)
    # replay buffer
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    # create agent
    agent = ModelPredictiveDQN(
        f, v, r, g,
        replay_buffer,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size
    )
    # apply agent wrappers for better atari performance
    return DeepmindAtariBody(agent, lazy_frames=True)
def _rainbow(env, writer=DummyWriter()):
    model = build_model(env, sigma_init).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        target_update_frequency=target_update_frequency,
        loss=mse_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        initial_epsilon=1,
        final_epsilon=0,
        annealing_start=replay_start_size,
        annealing_time=1
    )
    # replay_buffer = ExperienceReplayBuffer(replay_buffer_size)
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        final_beta_frame=final_beta_frame,
        device=device
    )
    return DQN(q, policy, replay_buffer,
               discount_factor=discount_factor,
               replay_start_size=replay_start_size,
               update_frequency=update_frequency,
               minibatch_size=minibatch_size)
def _vqn(envs, writer=DummyWriter()):
    env = envs[0]
    model = nature_ddqn(env).to(device)
    optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    return DeepmindAtariBody(
        VQN(q, policy, gamma=discount_factor),
    )
def parallel_test_agent(self):
    q = QNetwork(copy.deepcopy(self.model))
    policy = ParallelGreedyPolicy(
        q,
        self.n_actions,
        epsilon=self.hyperparameters["test_exploration"]
    )
    return VSarsaTestAgent(policy)
def _dqn(env, writer=DummyWriter()):
    model = fc_relu_q(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        target=FixedTarget(target_update_frequency),
        loss=mse_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DQN(q, policy, replay_buffer,
               discount_factor=discount_factor,
               replay_start_size=replay_start_size,
               update_frequency=update_frequency,
               minibatch_size=minibatch_size)
def parallel_test_agent(self):
    q = QNetwork(copy.deepcopy(self.model))
    policy = ParallelGreedyPolicy(
        q,
        self.n_actions,
        epsilon=self.hyperparameters['test_exploration']
    )
    return DeepmindAtariBody(VQNTestAgent(policy))
def _ddqn(env, writer=DummyWriter()):
    action_repeat = 1
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = SharedAutonomyPolicy(q, env.action_space.n, epsilon=0, pilot_tol=pilot_tol)
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            replay_buffer_size,
            alpha=alpha,
            beta=beta,
            device=device
        )
    else:
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return co_DDQN(q, policy, replay_buffer,
                   loss=weighted_smooth_l1_loss,
                   discount_factor=discount_factor,
                   minibatch_size=minibatch_size,
                   replay_start_size=replay_start_size,
                   update_frequency=update_frequency)
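# The prioritized branch above pairs PrioritizedReplayBuffer with
# weighted_smooth_l1_loss. Its definition is not shown here; the following
# is a hedged sketch of the usual importance-sampling weighted form such a
# loss takes (the helper name is hypothetical, not the library's):
import torch.nn.functional as F

def _weighted_smooth_l1_sketch(values, targets, weights):
    # per-sample smooth L1, scaled by the buffer's importance-sampling
    # weights so that high-priority samples do not bias the gradient
    losses = F.smooth_l1_loss(values, targets, reduction='none')
    return (weights * losses).mean()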
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = train_steps / self.hyperparameters['n_envs']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QNetwork(
        self.model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        writer=writer
    )
    policy = ParallelGreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            0,
            self.hyperparameters["final_exploration_step"] / self.hyperparameters["n_envs"],
            name="exploration",
            writer=writer
        )
    )
    return VQN(q, policy, discount_factor=self.hyperparameters['discount_factor'])
def _vqn(envs, writer=DummyWriter()):
    action_repeat = 4
    final_exploration_timestep = final_exploration_frame / action_repeat
    env = envs[0]
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(model, optimizer, writer=writer)
    policy = ParallelGreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            final_exploration_timestep,
            name="epsilon",
            writer=writer
        )
    )
    return DeepmindAtariBody(
        VQN(q, policy, discount_factor=discount_factor),
    )
def _dqn(env, writer=DummyWriter()):
    _model = nature_dqn(env).to(device)
    _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        _model,
        _optimizer,
        env.action_space.n,
        target=FixedTarget(target_update_frequency),
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'])
    q = QNetwork(
        self.model,
        optimizer,
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DQN(
        q,
        policy,
        replay_buffer,
        discount_factor=self.hyperparameters['discount_factor'],
        minibatch_size=self.hyperparameters['minibatch_size'],
        replay_start_size=self.hyperparameters['replay_start_size'],
        update_frequency=self.hyperparameters['update_frequency'],
    )
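# Why the preset passes final_exploration_step - replay_start_size as the
# annealing length: exploration is held at its initial value until
# replay_start_size steps have elapsed, then decays linearly so that the
# final value is reached at final_exploration_step overall. A hypothetical
# pure-Python sketch of that schedule (not LinearScheduler itself):
def _epsilon_at(t, initial, final, start, duration):
    if t <= start:
        return initial
    fraction = min((t - start) / duration, 1.0)
    return initial + fraction * (final - initial)

# e.g. with initial=1.0, final=0.02, start=5_000, duration=245_000:
# _epsilon_at(5_000, ...) -> 1.0 and _epsilon_at(250_000, ...) -> 0.02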
def _dqn(env, writer=DummyWriter()):
    _model = model
    _optimizer = optimizer
    if _model is None:
        _model = conv_net(env, frames=agent_history_length).to(device)
    if _optimizer is None:
        _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        _model,
        _optimizer,
        env.action_space.n,
        target_update_frequency=target_update_frequency,
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        annealing_start=replay_start_size,
        annealing_time=final_exploration_frame - replay_start_size,
        initial_epsilon=initial_exploration,
        final_epsilon=final_exploration
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
        env,
        action_repeat=action_repeat,
        frame_stack=agent_history_length,
        noop_max=noop_max
    )
def _vsarsa(envs, writer=DummyWriter()):
    env = envs[0]
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(model, optimizer, writer=writer)
    policy = ParallelGreedyPolicy(q, env.action_space.n, epsilon=epsilon)
    return VSarsa(q, policy, discount_factor=discount_factor)
def _vsarsa(envs, writer=DummyWriter()):
    env = envs[0]
    model = fc_relu_q(env).to(device)
    optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps)
    q = QNetwork(model, optimizer, env.action_space.n, writer=writer)
    policy = GreedyPolicy(q, env.action_space.n, epsilon=epsilon)
    return VSarsa(q, policy, gamma=gamma)
def test_agent(self):
    q = QNetwork(copy.deepcopy(self.model))
    return DeepmindAtariBody(
        DDQNTestAgent(
            q,
            self.n_actions,
            exploration=self.hyperparameters['test_exploration']
        )
    )
def _dqn(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    model = nature_dqn(env).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = QNetwork(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_step - replay_start_size,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        replay_buffer_size,
        device=device
    )
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            loss=smooth_l1_loss,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
        lazy_frames=True
    )
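# A worked example of the schedule-length arithmetic above, with
# illustrative values (the actual hyperparameters are defined elsewhere):
#
#   action_repeat = 4, last_frame = 40_000_000,
#   replay_start_size = 80_000, update_frequency = 4
#
#   last_timestep = 40_000_000 / 4 = 10_000_000 environment steps
#   last_update = (10_000_000 - 80_000) / 4 = 2_480_000 optimizer updates
#
# CosineAnnealingLR(optimizer, last_update) then decays the learning rate
# to its minimum over exactly that many optimizer steps, matching the
# number of reinforce() calls the agent will perform during training.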
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QNetwork(
        self.model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=self.hyperparameters['discount_factor'],
            loss=smooth_l1_loss,
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
        ),
        lazy_frames=True
    )
def __init__(self, policy, logger, out_dim, device="cpu"):
    self.hyperparameters = hyperparameters = default_hyperparameters
    self.policy = policy
    self.model = policy.model
    self.device = device
    self.logger = logger
    self.discount_factor = hyperparameters['discount_factor']
    self.out_dim = out_dim
    writer = DummyWriter()
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'])
    self.q = q = QNetwork(
        self.model,
        optimizer,
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
def _dqn(env, writer=DummyWriter()):
    model = build_model(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        target_update_frequency=target_update_frequency,
        loss=mse_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        initial_epsilon=initial_exploration,
        final_epsilon=final_exploration,
        annealing_time=final_exploration_frame
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DQN(q, policy, replay_buffer,
               discount_factor=discount_factor,
               replay_start_size=replay_start_size,
               update_frequency=update_frequency,
               minibatch_size=minibatch_size)
def agent(self, writer=DummyWriter(), train_steps=float("inf")):
    # optimizers
    feature_optimizer = Adam(
        self.feature_model.parameters(),
        lr=self.hyperparameters["lr"],
        eps=self.hyperparameters["eps"]
    )
    value_optimizer = Adam(
        self.value_model.parameters(),
        lr=self.hyperparameters["lr"],
        eps=self.hyperparameters["eps"]
    )
    reward_optimizer = Adam(
        self.reward_model.parameters(),
        lr=self.hyperparameters["lr"],
        eps=self.hyperparameters["eps"]
    )
    generator_optimizer = Adam(
        self.generator_model.parameters(),
        lr=self.hyperparameters["lr"],
        eps=self.hyperparameters["eps"]
    )
    # approximators
    f = FeatureNetwork(self.feature_model, feature_optimizer, writer=writer)
    v = VNetwork(self.value_model, value_optimizer, writer=writer)
    r = QNetwork(self.reward_model, reward_optimizer, name="reward", writer=writer)
    g = Approximation(self.generator_model, generator_optimizer, name="generator", writer=writer)
    # replay buffer
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters["replay_buffer_size"],
        device=self.device
    )
    # create agent
    agent = ModelBasedDQN(
        f, v, r, g,
        replay_buffer,
        minibatch_size=self.hyperparameters["minibatch_size"],
        replay_start_size=self.hyperparameters["replay_start_size"]
    )
    # apply atari wrappers for better performance
    return DeepmindAtariBody(agent, lazy_frames=True)
def test_agent(self):
    q = QNetwork(copy.deepcopy(self.model))
    return VSarsaTestAgent(
        q,
        self.n_actions,
        exploration=self.hyperparameters['test_exploration']
    )
def test_agent(self):
    q = QNetwork(copy.deepcopy(self.model))
    policy = GreedyPolicy(q, self.n_actions, epsilon=self.hyperparameters['test_exploration'])
    return DQNTestAgent(policy)
def _sarsa(env, writer=DummyWriter()):
    model = fc_net(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(model, optimizer, env.action_space.n, writer=writer)
    policy = GreedyPolicy(q, env.action_space.n, annealing_time=1, final_epsilon=epsilon)
    return Sarsa(q, policy)
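# Sarsa is on-policy: its bootstrap target evaluates the action the policy
# actually took at the next state, rather than the greedy maximum used by
# Q-learning. A hedged sketch of that target (illustrative helper, assuming
# batched tensors; not the library's internal code):
def _sarsa_targets(q_values_next, next_actions, rewards, discount_factor):
    # gather the value of the action that was actually selected at s'
    chosen = q_values_next.gather(1, next_actions.unsqueeze(1)).squeeze(1)
    return rewards + discount_factor * chosen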
class TestQNetwork(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTIONS))

        def optimizer(params):
            return torch.optim.SGD(params, lr=0.1)

        self.q = QNetwork(self.model, optimizer)

    def test_eval_list(self):
        states = State(torch.randn(5, STATE_DIM), mask=torch.tensor([1, 1, 0, 1, 0]))
        result = self.q.eval(states)
        tt.assert_almost_equal(
            result,
            torch.tensor([
                [-0.238509, -0.726287, -0.034026],
                [-0.35688755, -0.6612102, 0.34849477],
                [0., 0., 0.],
                [0.1944, -0.5536, -0.2345],
                [0., 0., 0.]
            ]),
            decimal=2
        )

    def test_eval_actions(self):
        states = State(torch.randn(3, STATE_DIM))
        actions = [1, 2, 0]
        result = self.q.eval(states, actions)
        self.assertEqual(result.shape, torch.Size([3]))
        tt.assert_almost_equal(
            result,
            torch.tensor([-0.7262873, 0.3484948, -0.0296164])
        )

    def test_target_net(self):
        torch.manual_seed(2)
        model = nn.Sequential(nn.Linear(1, 1))
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        q = QNetwork(model, optimizer, target=FixedTarget(3))
        inputs = State(torch.tensor([1.]))

        def loss(policy_value):
            target = policy_value - 1
            return smooth_l1_loss(policy_value, target.detach())

        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.008584141731262207)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.20858412981033325)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.4085841178894043)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.6085841655731201)
        np.testing.assert_equal(target_value, -0.6085841655731201)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.8085841536521912)
        np.testing.assert_equal(target_value, -0.6085841655731201)
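# test_target_net pins down the FixedTarget contract: the target copy is
# refreshed from the online parameters only on every Nth reinforce() call
# (here N=3), and q.target(...) returns stale values in between. A minimal,
# hypothetical sketch of that behavior (not the library's implementation):
import copy

class _FixedTargetSketch:
    def __init__(self, model, update_frequency):
        self._model = model
        self._target = copy.deepcopy(model)
        self._update_frequency = update_frequency
        self._updates = 0

    def __call__(self, *inputs):
        # evaluate with the stale parameter copy
        return self._target(*inputs)

    def on_reinforce(self):
        # called once per optimizer step; copy weights on every Nth step
        self._updates += 1
        if self._updates % self._update_frequency == 0:
            self._target.load_state_dict(self._model.state_dict())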
def test_agent(self):
    f = FeatureNetwork(self.feature_model, None)
    v = VNetwork(self.value_model, None)
    r = QNetwork(self.reward_model, None)
    g = Approximation(self.generator_model, None)
    return DeepmindAtariBody(
        ModelBasedTestAgent(f, v, r, g, self.hyperparameters["discount_factor"])
    )