    def test_target(self):
        self.policy = DeterministicPolicy(self.model,
                                          self.optimizer,
                                          self.space,
                                          target=FixedTarget(3))
        state = State(torch.ones(1, STATE_DIM))

        # run update step, make sure target network doesn't change
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # again...
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # third time, target should be updated
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_allclose(
            self.policy.target(state),
            torch.tensor([[-0.574482, -0.574482, -0.574482]]),
            atol=1e-4,
        )
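The test above pins down the FixedTarget(3) behaviour: the target copy keeps returning zeros (consistent with the zero-initialised nn.Linear0 model) for the first two optimisation steps and only syncs with the online network on the third. A minimal sketch of that synchronisation rule, assuming a simple copy-every-N-steps scheme (generic illustration, not the library's FixedTarget class):

import copy

import torch


class FixedTargetSketch:
    """Copy the online network into the target every `update_frequency` steps (sketch only)."""

    def __init__(self, online_model: torch.nn.Module, update_frequency: int):
        self.online = online_model
        self.target = copy.deepcopy(online_model)
        self.update_frequency = update_frequency
        self._steps = 0

    def step(self) -> None:
        # Called once per optimisation step; refresh the target on every Nth call.
        self._steps += 1
        if self._steps % self.update_frequency == 0:
            self.target.load_state_dict(self.online.state_dict())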
    def test_target(self):
        self.policy = DeterministicPolicy(
            self.model,
            self.optimizer,
            self.space,
            target=FixedTarget(3)
        )

        # choose initial action
        state = State(torch.ones(1, STATE_DIM))
        action = self.policy(state)
        tt.assert_equal(action, torch.zeros(1, ACTION_DIM))

        # run update step, make sure target network doesn't change
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # again...
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # third time, target should be updated
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_allclose(
            self.policy.eval(state),
            torch.tensor([[-0.595883, -0.595883, -0.595883]]),
            atol=1e-4,
        )
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear0(STATE_DIM, ACTION_DIM))
        self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
        self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
        self.policy = DeterministicPolicy(self.model, self.optimizer,
                                          self.space, 0.5)

    def test_clipping(self):
        space = Box(np.array([-0.1, -0.1, -0.1]), np.array([0.1, 0.1, 0.1]))
        self.policy = DeterministicPolicy(self.model, self.optimizer, space,
                                          0.5)
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state).detach().numpy()
        np.testing.assert_array_almost_equal(action,
                                             np.array([[-0.1, -0.1, 0.1]]))
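test_clipping above only checks observable behaviour: whatever the underlying linear model produces, the returned action lands inside the Box bounds. A minimal sketch of that clipping, assuming Box is the Gym Box space and that the policy simply clamps to space.low/space.high (illustration only, not the library's internal code):

import torch
from gym.spaces import Box


def clip_to_box(raw_action: torch.Tensor, space: Box) -> torch.Tensor:
    # Clamp each action dimension into [low, high] elementwise.
    low = torch.as_tensor(space.low, dtype=raw_action.dtype)
    high = torch.as_tensor(space.high, dtype=raw_action.dtype)
    return torch.max(torch.min(raw_action, high), low)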
Example #5
    def test_agent(self):
        policy = DeterministicPolicy(
            copy.deepcopy(self.policy_model),
            None,
            self.action_space,
        )
        return TimeFeature(DDPGTestAgent(policy))
Example #6
    def _ddpg(env, writer=DummyWriter()):
        value_model = fc_value(env).to(device)
        value_optimizer = Adam(value_model.parameters(), lr=lr_q)
        q = QContinuous(value_model,
                        value_optimizer,
                        target=PolyakTarget(polyak_rate),
                        writer=writer)

        policy_model = fc_policy(env).to(device)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
        policy = DeterministicPolicy(policy_model,
                                     policy_optimizer,
                                     env.action_space,
                                     noise,
                                     target=PolyakTarget(polyak_rate),
                                     writer=writer)

        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        return DDPG(q,
                    policy,
                    replay_buffer,
                    replay_start_size=replay_start_size,
                    discount_factor=discount_factor,
                    update_frequency=update_frequency,
                    minibatch_size=minibatch_size)
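Both the Q-function and the policy in this preset track a PolyakTarget(polyak_rate) copy. A generic sketch of a Polyak (soft) target update, assuming the usual convention of nudging each target parameter a fraction `rate` toward its online counterpart (not the library's PolyakTarget implementation):

import torch


def polyak_update(target_model: torch.nn.Module, online_model: torch.nn.Module, rate: float) -> None:
    # Soft update: target <- (1 - rate) * target + rate * online, parameter by parameter.
    with torch.no_grad():
        for target_param, online_param in zip(target_model.parameters(), online_model.parameters()):
            target_param.mul_(1.0 - rate).add_(online_param, alpha=rate)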
Example #7
    def _online_cacla(env, writer=DummyWriter()):
        value_model = models.critic(env, hidden1=hidden1,
                                    hidden2=hidden2).to(device)
        policy_model = models.actor(env, hidden1=hidden1,
                                    hidden2=hidden2).to(device)
        # feature_model = models.features(env.state_space.shape[0]).to(device)

        value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
        # feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)
        # feature_optimizer = SGD(feature_model.parameters(), lr=lr_pi, momentum=0.9)

        policy = DeterministicPolicy(
            policy_model,
            policy_optimizer,
            env.action_space,
            quiet=not log,
            clip_grad=1.0,
            writer=writer,
            normalise_inputs=True,
            box=env.state_space,
        )

        v = VNetwork(
            value_model,
            value_optimizer,
            quiet=not log,
            writer=writer,
            normalise_inputs=True,
            box=env.state_space,
        )
        features = None  # FeatureNetwork(feature_model, feature_optimizer, writer=writer, normalize_input=False)
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        # TODO - reintroduce TimeFeature wrapper
        return OnlineCACLA(features,
                           v,
                           policy,
                           replay_buffer,
                           env.action_space,
                           log=log,
                           writer=writer,
                           discount_factor=discount_factor)
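The OnlineCACLA preset above passes normalise_inputs=True together with box=env.state_space, which suggests observations are rescaled using the state-space bounds before reaching the networks. One plausible form of that rescaling, mapping each dimension from [low, high] into [-1, 1] (an assumption for illustration, not the library's exact scheme):

import numpy as np
import torch


def normalise_to_bounds(state: torch.Tensor, low: np.ndarray, high: np.ndarray) -> torch.Tensor:
    # Rescale each state dimension from [low, high] into [-1, 1]; assumes finite bounds.
    low_t = torch.as_tensor(low, dtype=state.dtype)
    high_t = torch.as_tensor(high, dtype=state.dtype)
    return 2.0 * (state - low_t) / (high_t - low_t) - 1.0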
Example #8
    def _fac(env, writer=DummyWriter()):
        value_model = models.critic(env, hidden1=hidden1,
                                    hidden2=hidden2).to(device)
        policy_model = models.actor(env, hidden1=hidden1,
                                    hidden2=hidden2).to(device)

        value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)

        policy = DeterministicPolicy(
            policy_model,
            policy_optimizer,
            env.action_space,
            quiet=not log,
            clip_grad=1.0,
            writer=writer,
            normalise_inputs=True,
            box=env.state_space,
        )

        v = VNetwork(
            value_model,
            value_optimizer,
            quiet=not log,
            writer=writer,
            normalise_inputs=True,
            box=env.state_space,
        )
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        # TODO - reintroduce TimeFeature wrapper
        return ForwardAC(v,
                         policy,
                         replay_buffer,
                         env.action_space,
                         log=log,
                         trace_decay=trace_decay,
                         writer=writer,
                         discount_factor=discount_factor)
Example #9
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters["replay_start_size"]
                     ) / self.hyperparameters["update_frequency"]

        q_optimizer = Adam(self.q_model.parameters(),
                           lr=self.hyperparameters["lr_q"])

        q = QContinuous(self.q_model,
                        q_optimizer,
                        target=PolyakTarget(
                            self.hyperparameters["polyak_rate"]),
                        scheduler=CosineAnnealingLR(q_optimizer, n_updates),
                        writer=writer)

        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr_pi"])
        policy = DeterministicPolicy(
            self.policy_model,
            policy_optimizer,
            self.action_space,
            target=PolyakTarget(self.hyperparameters["polyak_rate"]),
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
            writer=writer)

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters["replay_buffer_size"], device=self.device)

        return TimeFeature(
            DDPG(
                q,
                policy,
                replay_buffer,
                self.action_space,
                noise=self.hyperparameters["noise"],
                replay_start_size=self.hyperparameters["replay_start_size"],
                discount_factor=self.hyperparameters["discount_factor"],
                update_frequency=self.hyperparameters["update_frequency"],
                minibatch_size=self.hyperparameters["minibatch_size"],
            ))
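The finished agent is wrapped in TimeFeature, which exposes the elapsed episode time to the networks; a common way to do this is to append a scaled step counter to each observation. A rough sketch of that idea (the scaling constant and exact mechanics here are assumptions, not the library's TimeFeature body):

import torch


def append_time_feature(observation: torch.Tensor, timestep: int, scale: float = 0.001) -> torch.Tensor:
    # Concatenate a scaled step counter onto a batch of observations so that a
    # time-limited task looks (approximately) Markov to the agent.
    time_column = torch.full((observation.shape[0], 1), timestep * scale, dtype=observation.dtype)
    return torch.cat([observation, time_column], dim=1)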
Example #10
    def _ddpg(env, writer=DummyWriter()):
        final_anneal_step = (last_frame -
                             replay_start_size) // update_frequency

        q_model = fc_q(env).to(device)
        q_optimizer = Adam(q_model.parameters(), lr=lr_q)
        q = QContinuous(q_model,
                        q_optimizer,
                        target=PolyakTarget(polyak_rate),
                        scheduler=CosineAnnealingLR(q_optimizer,
                                                    final_anneal_step),
                        writer=writer)

        policy_model = fc_deterministic_policy(env).to(device)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
        policy = DeterministicPolicy(policy_model,
                                     policy_optimizer,
                                     env.action_space,
                                     target=PolyakTarget(polyak_rate),
                                     scheduler=CosineAnnealingLR(
                                         policy_optimizer, final_anneal_step),
                                     writer=writer)

        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        return TimeFeature(
            DDPG(
                q,
                policy,
                replay_buffer,
                env.action_space,
                noise=noise,
                replay_start_size=replay_start_size,
                discount_factor=discount_factor,
                update_frequency=update_frequency,
                minibatch_size=minibatch_size,
            ))
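Here the learning rates are cosine-annealed over final_anneal_step = (last_frame - replay_start_size) // update_frequency, i.e. over the number of gradient updates left once the replay warm-up period is excluded. A quick numeric check with made-up values (the numbers below are hypothetical, not taken from this preset):

last_frame = 1_000_000        # hypothetical training length in frames
replay_start_size = 5_000     # hypothetical warm-up before updates begin
update_frequency = 1          # hypothetical updates-per-frame setting

final_anneal_step = (last_frame - replay_start_size) // update_frequency
print(final_anneal_step)      # 995000 scheduler steps for CosineAnnealingLR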
class TestDeterministic(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear0(STATE_DIM, ACTION_DIM))
        self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
        self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
        self.policy = DeterministicPolicy(self.model,
                                          self.optimizer,
                                          self.space,
                                          checkpointer=DummyCheckpointer())

    def test_output_shape(self):
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.shape, (1, ACTION_DIM))
        state = State(torch.randn(5, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.shape, (5, ACTION_DIM))

    def test_step_one(self):
        state = State(torch.randn(1, STATE_DIM))
        self.policy(state)
        self.policy.step()

    def test_converge(self):
        state = State(torch.randn(1, STATE_DIM))
        target = torch.tensor([0.25, 0.5, -0.5])

        for _ in range(0, 200):
            action = self.policy(state)
            loss = ((target - action)**2).mean()
            loss.backward()
            self.policy.step()

        self.assertLess(loss, 0.001)

    def test_target(self):
        self.policy = DeterministicPolicy(self.model,
                                          self.optimizer,
                                          self.space,
                                          target=FixedTarget(3))
        state = State(torch.ones(1, STATE_DIM))

        # run update step, make sure target network doesn't change
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # again...
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # third time, target should be updated
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_allclose(
            self.policy.target(state),
            torch.tensor([[-0.574482, -0.574482, -0.574482]]),
            atol=1e-4,
        )
class TestDeterministic(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear0(STATE_DIM, ACTION_DIM))
        self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
        self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
        self.policy = DeterministicPolicy(self.model, self.optimizer,
                                          self.space, 0.5)

    def test_output_shape(self):
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.shape, (1, ACTION_DIM))
        state = State(torch.randn(5, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.shape, (5, ACTION_DIM))

    def test_clipping(self):
        space = Box(np.array([-0.1, -0.1, -0.1]), np.array([0.1, 0.1, 0.1]))
        self.policy = DeterministicPolicy(self.model, self.optimizer, space,
                                          0.5)
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state).detach().numpy()
        np.testing.assert_array_almost_equal(action,
                                             np.array([[-0.1, -0.1, 0.1]]))

    def test_step_one(self):
        state = State(torch.randn(1, STATE_DIM))
        self.policy(state)
        self.policy.step()

    def test_converge(self):
        state = State(torch.randn(1, STATE_DIM))
        target = torch.tensor([1., 2., -1.])

        for _ in range(0, 100):
            action = self.policy.greedy(state)
            loss = torch.abs(target - action).mean()
            loss.backward()
            self.policy.step()

        self.assertTrue(loss < 0.1)

    def test_target(self):
        self.policy = DeterministicPolicy(self.model,
                                          self.optimizer,
                                          self.space,
                                          0.5,
                                          target=FixedTarget(3))

        # choose initial action
        state = State(torch.ones(1, STATE_DIM))
        action = self.policy.greedy(state)
        tt.assert_equal(action, torch.zeros(1, ACTION_DIM))

        # run update step, make sure target network doesn't change
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # again...
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # third time, target should be updated
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_allclose(self.policy.eval(state),
                           torch.tensor([[-0.686739, -0.686739, -0.686739]]))
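In this older variant of the test class, the fourth positional argument (0.5) matches the noise argument seen in the DDPG preset above, and policy.greedy(state) returns the noise-free action that the assertions check. A generic sketch of additive Gaussian exploration around a greedy deterministic action, with the clipping bounds passed in explicitly (illustrative only, not the library's internals):

import torch


def noisy_action(greedy_action: torch.Tensor, noise_std: float, low: float, high: float) -> torch.Tensor:
    # Perturb the greedy action with zero-mean Gaussian noise, then clip to the action bounds.
    noise = torch.randn_like(greedy_action) * noise_std
    return torch.clamp(greedy_action + noise, min=low, max=high)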