Example #1
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTIONS))

        def optimizer(params):
            return torch.optim.SGD(params, lr=0.1)

        self.q = QNetwork(self.model, optimizer)
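Example #1 only constructs the approximation; a minimal usage sketch follows, mirroring the calls exercised in Examples #27 and #28 (q(states), q.reinforce(loss)). The import paths for QNetwork and State, the STATE_DIM/ACTIONS values, and the loss are assumptions here, not part of the original snippet, and may differ between versions of the autonomous-learning-library.

# Hedged usage sketch; import paths and constants are assumptions.
import torch
from torch import nn
from torch.nn.functional import smooth_l1_loss
from all.approximation import QNetwork   # assumed module path
from all.core import State               # assumed module path (older releases expose State elsewhere)

STATE_DIM, ACTIONS = 2, 3                # placeholder dimensions

model = nn.Sequential(nn.Linear(STATE_DIM, ACTIONS))
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
q = QNetwork(model, optimizer)

states = State(torch.randn(5, STATE_DIM))        # batch of 5 observations
values = q(states)                               # Q-values with the autograd graph attached
targets = (values - 1).detach()                  # placeholder regression targets
q.reinforce(smooth_l1_loss(values, targets))     # backward pass and optimizer step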
Example #2
 def _ddqn(env, writer=DummyWriter()):
     model = model_constructor(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(model,
                  optimizer,
                  target=FixedTarget(target_update_frequency),
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                             alpha=alpha,
                                             beta=beta,
                                             device=device)
     return DDQN(q,
                 policy,
                 replay_buffer,
                 discount_factor=discount_factor,
                 replay_start_size=replay_start_size,
                 update_frequency=update_frequency,
                 minibatch_size=minibatch_size)
Example #3
 def _vqn(envs, writer=DummyWriter()):
     env = envs[0]
     model = fc_relu_q(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr, eps=eps)
     q = QNetwork(model, optimizer, writer=writer)
     policy = GreedyPolicy(q, env.action_space.n, epsilon=epsilon)
     return VQN(q, policy, discount_factor=discount_factor)
Example #4
 def _model_predictive_dqn(env, writer=None):
     # models
     feature_model = shared_feature_layers().to(device)
     value_model = value_head().to(device)
     reward_model = reward_head(env).to(device)
     generator_model = Generator(env).to(device)
     # optimizers
     feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
     value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
     reward_optimizer = Adam(reward_model.parameters(), lr=lr, eps=eps)
     generator_optimizer = Adam(generator_model.parameters(), lr=lr, eps=eps)
     # approximators
     f = FeatureNetwork(feature_model, feature_optimizer, writer=writer)
     v = VNetwork(value_model, value_optimizer, writer=writer)
     r = QNetwork(reward_model, reward_optimizer, name='reward', writer=writer)
     g = Approximation(generator_model, generator_optimizer, name='generator', writer=writer)
     # replay buffer
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
     # create agent
     agent = ModelPredictiveDQN(f, v, r, g, replay_buffer,
         minibatch_size=minibatch_size,
         replay_start_size=replay_start_size
     )
     # apply agent wrappers for better atari performance
     return DeepmindAtariBody(agent, lazy_frames=True)
Example #5
 def _rainbow(env, writer=DummyWriter()):
     model = build_model(env, sigma_init).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(
         model,
         optimizer,
         env.action_space.n,
         target_update_frequency=target_update_frequency,
         loss=mse_loss,
         writer=writer
     )
     policy = GreedyPolicy(
         q,
         env.action_space.n,
         initial_epsilon=1,
         final_epsilon=0,
         annealing_start=replay_start_size,
         annealing_time=1
     )
     # replay_buffer = ExperienceReplayBuffer(replay_buffer_size)
     replay_buffer = PrioritizedReplayBuffer(
         replay_buffer_size,
         alpha=alpha,
         beta=beta,
         final_beta_frame=final_beta_frame,
         device=device
     )
     return DQN(q, policy, replay_buffer,
                discount_factor=discount_factor,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                minibatch_size=minibatch_size)
Example #6
 def _vqn(envs, writer=DummyWriter()):
     env = envs[0]
     model = nature_ddqn(env).to(device)
     optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps)
     q = QNetwork(
         model,
         optimizer,
         env.action_space.n,
         loss=smooth_l1_loss,
         writer=writer
     )
     policy = GreedyPolicy(
         q,
         env.action_space.n,
         epsilon=LinearScheduler(
             initial_exploration,
             final_exploration,
             0,
             final_exploration_frame,
             name="epsilon",
             writer=writer
         )
     )
     return DeepmindAtariBody(
         VQN(q, policy, gamma=discount_factor),
     )
Example #7
 def parallel_test_agent(self):
     q = QNetwork(copy.deepcopy(self.model))
     policy = ParallelGreedyPolicy(
         q,
         self.n_actions,
         epsilon=self.hyperparameters["test_exploration"])
     return VSarsaTestAgent(policy)
Example #8
 def _dqn(env, writer=DummyWriter()):
     model = fc_relu_q(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(model,
                  optimizer,
                  env.action_space.n,
                  target=FixedTarget(target_update_frequency),
                  loss=mse_loss,
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                            device=device)
     return DQN(q,
                policy,
                replay_buffer,
                discount_factor=discount_factor,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                minibatch_size=minibatch_size)
Example #9
 def parallel_test_agent(self):
     q = QNetwork(copy.deepcopy(self.model))
     policy = ParallelGreedyPolicy(
         q,
         self.n_actions,
         epsilon=self.hyperparameters['test_exploration'])
     return DeepmindAtariBody(VQNTestAgent(policy))
Example #10
    def _ddqn(env, writer=DummyWriter()):
        action_repeat = 1
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        model = model_constructor(env).to(device)
        optimizer = Adam(model.parameters(), lr=lr, eps=eps)
        q = QNetwork(model,
                     optimizer,
                     scheduler=CosineAnnealingLR(optimizer, last_update),
                     target=FixedTarget(target_update_frequency),
                     writer=writer)
        policy = SharedAutonomyPolicy(q,
                                      env.action_space.n,
                                      epsilon=0,
                                      pilot_tol=pilot_tol)

        if prioritized_replay:
            replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                                    alpha=alpha,
                                                    beta=beta,
                                                    device=device)
        else:
            replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                                   device=device)

        return co_DDQN(q,
                       policy,
                       replay_buffer,
                       loss=weighted_smooth_l1_loss,
                       discount_factor=discount_factor,
                       minibatch_size=minibatch_size,
                       replay_start_size=replay_start_size,
                       update_frequency=update_frequency)
Example #11
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = train_steps / self.hyperparameters['n_envs']

        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q = QNetwork(
            self.model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            writer=writer
        )

        policy = ParallelGreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                self.hyperparameters["final_exploration_step"] / self.hyperparameters["n_envs"],
                name="exploration",
                writer=writer
            )
        )

        return VQN(q, policy, discount_factor=self.hyperparameters['discount_factor'])
Example #12
    def _vqn(envs, writer=DummyWriter()):
        action_repeat = 4
        final_exploration_timestep = final_exploration_frame / action_repeat

        env = envs[0]
        model = model_constructor(env).to(device)
        optimizer = Adam(model.parameters(), lr=lr, eps=eps)
        q = QNetwork(
            model,
            optimizer,
            writer=writer
        )
        policy = ParallelGreedyPolicy(
            q,
            env.action_space.n,
            epsilon=LinearScheduler(
                initial_exploration,
                final_exploration,
                0,
                final_exploration_timestep,
                name="epsilon",
                writer=writer
            )
        )
        return DeepmindAtariBody(
            VQN(q, policy, discount_factor=discount_factor),
        )
Example #13
 def _dqn(env, writer=DummyWriter()):
     _model = nature_dqn(env).to(device)
     _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
     q = QNetwork(_model,
                  _optimizer,
                  env.action_space.n,
                  target=FixedTarget(target_update_frequency),
                  loss=smooth_l1_loss,
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                            device=device)
     return DeepmindAtariBody(
         DQN(
             q,
             policy,
             replay_buffer,
             discount_factor=discount_factor,
             minibatch_size=minibatch_size,
             replay_start_size=replay_start_size,
             update_frequency=update_frequency,
         ), )
Example #14
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'])

        q = QNetwork(self.model,
                     optimizer,
                     target=FixedTarget(
                         self.hyperparameters['target_update_frequency']),
                     writer=writer)

        policy = GreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                self.hyperparameters['replay_start_size'],
                self.hyperparameters['final_exploration_step'] -
                self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer))

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'], device=self.device)

        return DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=self.hyperparameters['discount_factor'],
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
        )
Example #15
 def _dqn(env, writer=DummyWriter()):
     _model = model
     _optimizer = optimizer
     if _model is None:
         _model = conv_net(env, frames=agent_history_length).to(device)
     if _optimizer is None:
         _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
     q = QNetwork(_model,
                  _optimizer,
                  env.action_space.n,
                  target_update_frequency=target_update_frequency,
                  loss=smooth_l1_loss,
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           annealing_start=replay_start_size,
                           annealing_time=final_exploration_frame -
                           replay_start_size,
                           initial_epsilon=initial_exploration,
                           final_epsilon=final_exploration)
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                            device=device)
     return DeepmindAtariBody(DQN(
         q,
         policy,
         replay_buffer,
         discount_factor=discount_factor,
         minibatch_size=minibatch_size,
         replay_start_size=replay_start_size,
         update_frequency=update_frequency,
     ),
                              env,
                              action_repeat=action_repeat,
                              frame_stack=agent_history_length,
                              noop_max=noop_max)
Example #16
 def _vsarsa(envs, writer=DummyWriter()):
     env = envs[0]
     model = model_constructor(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr, eps=eps)
     q = QNetwork(model, optimizer, writer=writer)
     policy = ParallelGreedyPolicy(q, env.action_space.n, epsilon=epsilon)
     return VSarsa(q, policy, discount_factor=discount_factor)
Example #17
 def _vsarsa(envs, writer=DummyWriter()):
     env = envs[0]
     model = fc_relu_q(env).to(device)
     optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps)
     q = QNetwork(model, optimizer, env.action_space.n, writer=writer)
     policy = GreedyPolicy(q, env.action_space.n, epsilon=epsilon)
     return VSarsa(q, policy, gamma=gamma)
Example #18
 def test_agent(self):
     q = QNetwork(copy.deepcopy(self.model))
     return DeepmindAtariBody(
         DDQNTestAgent(
             q,
             self.n_actions,
             exploration=self.hyperparameters['test_exploration']))
Example #19
    def _dqn(env, writer=DummyWriter()):
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        model = nature_dqn(env).to(device)

        optimizer = Adam(
            model.parameters(),
            lr=lr,
            eps=eps
        )

        q = QNetwork(
            model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, last_update),
            target=FixedTarget(target_update_frequency),
            writer=writer
        )

        policy = GreedyPolicy(
            q,
            env.action_space.n,
            epsilon=LinearScheduler(
                initial_exploration,
                final_exploration,
                replay_start_size,
                final_exploration_step - replay_start_size,
                name="epsilon",
                writer=writer
            )
        )

        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size,
            device=device
        )

        return DeepmindAtariBody(
            DQN(
                q,
                policy,
                replay_buffer,
                discount_factor=discount_factor,
                loss=smooth_l1_loss,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
            ),
            lazy_frames=True
        )
Example #20
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']

        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q = QNetwork(
            self.model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            target=FixedTarget(self.hyperparameters['target_update_frequency']),
            writer=writer
        )

        policy = GreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                self.hyperparameters['replay_start_size'],
                self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer
            )
        )

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            device=self.device
        )

        return DeepmindAtariBody(
            DQN(
                q,
                policy,
                replay_buffer,
                discount_factor=self.hyperparameters['discount_factor'],
                loss=smooth_l1_loss,
                minibatch_size=self.hyperparameters['minibatch_size'],
                replay_start_size=self.hyperparameters['replay_start_size'],
                update_frequency=self.hyperparameters['update_frequency'],
            ),
            lazy_frames=True
        )
Example #21
    def __init__(self, policy, logger, out_dim, device="cpu"):
        self.hyperparameters = hyperparameters = default_hyperparameters
        self.policy = policy
        self.model = policy.model
        self.device = device
        self.logger = logger
        self.discount_factor = hyperparameters['discount_factor']
        self.out_dim = out_dim
        writer = DummyWriter()
        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'])

        self.q = q = QNetwork(
            self.model,
            optimizer,
            target=FixedTarget(
                self.hyperparameters['target_update_frequency']),
            writer=writer)
Example #22
 def _dqn(env, writer=DummyWriter()):
     model = build_model(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(model,
                  optimizer,
                  env.action_space.n,
                  target_update_frequency=target_update_frequency,
                  loss=mse_loss,
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           initial_epsilon=initial_exploration,
                           final_epsilon=final_exploration,
                           annealing_time=final_exploration_frame)
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                            device=device)
     return DQN(q,
                policy,
                replay_buffer,
                discount_factor=discount_factor,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                minibatch_size=minibatch_size)
Example #23
    def agent(self, writer=DummyWriter(), train_steps=float("inf")):
        # optimizers
        feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
        value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
        reward_optimizer = Adam(self.reward_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
        generator_optimizer = Adam(self.generator_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])

        # approximators
        f = FeatureNetwork(self.feature_model, feature_optimizer, writer=writer)
        v = VNetwork(self.value_model, value_optimizer, writer=writer)
        r = QNetwork(self.reward_model, reward_optimizer, name="reward", writer=writer)
        g = Approximation(self.generator_model, generator_optimizer, name="generator", writer=writer)

        # replay buffer
        replay_buffer = ExperienceReplayBuffer(self.hyperparameters["replay_buffer_size"], device=self.device)

        # create agent
        agent = ModelBasedDQN(f, v, r, g, replay_buffer,
            minibatch_size=self.hyperparameters["minibatch_size"],
            replay_start_size=self.hyperparameters["replay_start_size"]
        )

        # apply atari wrappers for better performance
        return DeepmindAtariBody(agent, lazy_frames=True)
Example #24
 def test_agent(self):
     q = QNetwork(copy.deepcopy(self.model))
     return VSarsaTestAgent(q, self.n_actions, exploration=self.hyperparameters['test_exploration'])
Example #25
 def test_agent(self):
     q = QNetwork(copy.deepcopy(self.model))
     policy = GreedyPolicy(q,
                           self.n_actions,
                           epsilon=self.hyperparameters['test_exploration'])
     return DQNTestAgent(policy)
Example #26
 def _sarsa(env, writer=DummyWriter()):
     model = fc_net(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(model, optimizer, env.action_space.n, writer=writer)
     policy = GreedyPolicy(q, env.action_space.n, annealing_time=1, final_epsilon=epsilon)
     return Sarsa(q, policy)
Example #27
    def test_target_net(self):
        torch.manual_seed(2)
        model = nn.Sequential(nn.Linear(1, 1))
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        q = QNetwork(model, optimizer, target=FixedTarget(3))
        inputs = State(torch.tensor([1.]))

        def loss(policy_value):
            target = policy_value - 1
            return smooth_l1_loss(policy_value, target.detach())

        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.008584141731262207)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.20858412981033325)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.4085841178894043)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.6085841655731201)
        np.testing.assert_equal(target_value, -0.6085841655731201)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.8085841536521912)
        np.testing.assert_equal(target_value, -0.6085841655731201)
Example #28
class TestQNetwork(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTIONS))

        def optimizer(params):
            return torch.optim.SGD(params, lr=0.1)

        self.q = QNetwork(self.model, optimizer)

    def test_eval_list(self):
        states = State(torch.randn(5, STATE_DIM),
                       mask=torch.tensor([1, 1, 0, 1, 0]))
        result = self.q.eval(states)
        tt.assert_almost_equal(result,
                               torch.tensor(
                                   [[-0.238509, -0.726287, -0.034026],
                                    [-0.35688755, -0.6612102, 0.34849477],
                                    [0., 0., 0.], [0.1944, -0.5536, -0.2345],
                                    [0., 0., 0.]]),
                               decimal=2)

    def test_eval_actions(self):
        states = State(torch.randn(3, STATE_DIM))
        actions = [1, 2, 0]
        result = self.q.eval(states, actions)
        self.assertEqual(result.shape, torch.Size([3]))
        tt.assert_almost_equal(
            result, torch.tensor([-0.7262873, 0.3484948, -0.0296164]))

    def test_target_net(self):
        torch.manual_seed(2)
        model = nn.Sequential(nn.Linear(1, 1))
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        q = QNetwork(model, optimizer, target=FixedTarget(3))
        inputs = State(torch.tensor([1.]))

        def loss(policy_value):
            target = policy_value - 1
            return smooth_l1_loss(policy_value, target.detach())

        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.008584141731262207)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.20858412981033325)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.4085841178894043)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.6085841655731201)
        np.testing.assert_equal(target_value, -0.6085841655731201)

        q.reinforce(loss(policy_value))
        policy_value = q(inputs)
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value.item(), -0.8085841536521912)
        np.testing.assert_equal(target_value, -0.6085841655731201)
Example #29
 def test_agent(self):
     f = FeatureNetwork(self.feature_model, None)
     v = VNetwork(self.value_model, None)
     r = QNetwork(self.reward_model, None)
     g = Approximation(self.generator_model, None)
     return DeepmindAtariBody(ModelBasedTestAgent(f, v, r, g, self.hyperparameters["discount_factor"]))