# ---- 예제 #1 (Example #1) ----
class TestGaussian(unittest.TestCase):
    """Tests for GaussianPolicy constructed with an integer action dimension."""

    def setUp(self):
        # Fixed seed keeps the stochastic policy deterministic across runs.
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTION_DIM * 2))
        rmsprop = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
        self.policy = GaussianPolicy(self.model, rmsprop, ACTION_DIM)

    def test_output_shape(self):
        # Single-state batch.
        single = State(torch.randn(1, STATE_DIM))
        self.assertEqual(self.policy(single).shape, (1, ACTION_DIM))
        # Multi-state batch.
        batch = State(torch.randn(5, STATE_DIM))
        self.assertEqual(self.policy(batch).shape, (5, ACTION_DIM))

    def test_reinforce_one(self):
        # A single forward pass followed by reinforce() should not raise.
        observation = State(torch.randn(1, STATE_DIM))
        self.policy(observation)
        self.policy.reinforce(torch.tensor([1]).float())

    def test_converge(self):
        observation = State(torch.randn(1, STATE_DIM))
        goal = torch.tensor([1., 2., -1.])

        # Repeatedly nudge the policy toward the fixed target action.
        for _ in range(1000):
            sampled = self.policy(observation)
            loss = torch.abs(goal - sampled).mean()
            self.policy.reinforce(-loss)

        # After training, the final loss should be small.
        self.assertTrue(loss < 1)
    def setUp(self):
        # NOTE(review): this re-definition shadows the setUp above; it appears
        # to be a fragment pasted from a separate example that exercises a
        # Box action space instead of an integer action dimension.
        torch.manual_seed(2)  # deterministic sampling across runs
        self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTION_DIM * 2))
        rmsprop = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
        self.policy = GaussianPolicy(self.model, rmsprop, self.space)
# ---- 예제 #3 (Example #3) ----
    def _ppo(envs, writer=None):
        """Build a PPO agent (wrapped with TimeFeature) for the given envs.

        Args:
            envs: list of environments; only envs[0] is used to size the models.
            writer: summary writer for logging; a fresh DummyWriter is created
                when omitted.

        Returns:
            TimeFeature-wrapped PPO agent.
        """
        # Default created inside the body rather than in the signature so each
        # call gets its own writer — a `writer=DummyWriter()` default is
        # evaluated once at definition time and shared across all calls.
        if writer is None:
            writer = DummyWriter()

        # Total number of optimizer updates over training; the schedulers
        # anneal learning rates and the clip epsilon over this horizon.
        final_anneal_step = last_frame * epochs * minibatches / (n_steps *
                                                                 n_envs)
        env = envs[0]

        feature_model, value_model, policy_model = fc_actor_critic(env)
        feature_model.to(device)
        value_model.to(device)
        policy_model.to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(
            feature_model,
            feature_optimizer,
            clip_grad=clip_grad,
            scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
            writer=writer,
        )
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        )
        policy = GaussianPolicy(
            policy_model,
            policy_optimizer,
            env.action_space,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        )

        return TimeFeature(
            PPO(
                features,
                v,
                policy,
                # Clip epsilon is linearly annealed from clip_initial to
                # clip_final over the full training horizon.
                epsilon=LinearScheduler(clip_initial,
                                        clip_final,
                                        0,
                                        final_anneal_step,
                                        name='clip',
                                        writer=writer),
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer,
            ))
# ---- 예제 #4 (Example #4) ----
class TestGaussian(unittest.TestCase):
    """Tests for GaussianPolicy constructed with a Box action space."""

    def setUp(self):
        # Seed so that sampling is reproducible across test runs.
        torch.manual_seed(2)
        self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTION_DIM * 2))
        rmsprop = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
        self.policy = GaussianPolicy(self.model, rmsprop, self.space)

    def test_output_shape(self):
        # Sampled actions must match (batch, ACTION_DIM) for any batch size.
        for batch_size in (1, 5):
            observation = State(torch.randn(batch_size, STATE_DIM))
            sampled = self.policy(observation).sample()
            self.assertEqual(sampled.shape, (batch_size, ACTION_DIM))

    def test_reinforce_one(self):
        observation = State(torch.randn(1, STATE_DIM))
        dist = self.policy(observation)
        sampled = dist.sample()
        before = dist.log_prob(sampled)
        # Reinforce the negative log-likelihood of the sampled action...
        self.policy.reinforce(-before.mean())

        after = self.policy(observation).log_prob(sampled)
        # ...which should make that same action more probable.
        self.assertGreater(after.item(), before.item())

    def test_converge(self):
        observation = State(torch.randn(1, STATE_DIM))
        goal = torch.tensor([1., 2., -1.])

        # Policy-gradient style updates toward a fixed target action.
        for _ in range(1000):
            dist = self.policy(observation)
            sampled = dist.sample()
            squared_error = ((goal - sampled)**2).mean()
            self.policy.reinforce((squared_error * dist.log_prob(sampled)).mean())

        # The squared error should have shrunk after training.
        self.assertTrue(squared_error < 1)
# ---- 예제 #5 (Example #5) ----
    def agent(self, writer=None, train_steps=float('inf')):
        """Build a PPO agent (wrapped with TimeFeature) from this preset.

        Args:
            writer: summary writer for logging; a fresh DummyWriter is created
                when omitted.
            train_steps: total number of training frames, used to compute the
                annealing horizon for the schedulers.

        Returns:
            TimeFeature-wrapped PPO agent.
        """
        # Default created per call instead of `writer=DummyWriter()` in the
        # signature: a default argument is evaluated once at definition time
        # and would be shared by every invocation.
        if writer is None:
            writer = DummyWriter()

        hp = self.hyperparameters
        # Total number of optimizer updates across training; schedulers
        # anneal their values over this horizon.
        n_updates = (train_steps * hp['epochs'] * hp['minibatches']
                     / (hp['n_steps'] * hp['n_envs']))

        value_optimizer = Adam(self.value_model.parameters(),
                               lr=hp['lr'],
                               eps=hp['eps'])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=hp['lr'],
                                eps=hp['eps'])

        # No separate feature network in this preset: states pass through.
        features = Identity(self.device)

        v = VNetwork(
            self.value_model,
            value_optimizer,
            loss_scaling=hp['value_loss_scaling'],
            clip_grad=hp['clip_grad'],
            writer=writer,
            scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        )

        policy = GaussianPolicy(
            self.policy_model,
            policy_optimizer,
            self.action_space,
            clip_grad=hp['clip_grad'],
            writer=writer,
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        )

        return TimeFeature(
            PPO(
                features,
                v,
                policy,
                # Clip epsilon anneals linearly from clip_initial to
                # clip_final over the full training horizon.
                epsilon=LinearScheduler(hp['clip_initial'],
                                        hp['clip_final'],
                                        0,
                                        n_updates,
                                        name='clip',
                                        writer=writer),
                epochs=hp['epochs'],
                minibatches=hp['minibatches'],
                n_envs=hp['n_envs'],
                n_steps=hp['n_steps'],
                discount_factor=hp['discount_factor'],
                lam=hp['lam'],
                entropy_loss_scaling=hp['entropy_loss_scaling'],
                writer=writer,
            ))
# ---- 예제 #6 (Example #6) ----
class TestGaussian(unittest.TestCase):
    """Tests for GaussianPolicy with a Box space and a dummy checkpointer."""

    def setUp(self):
        torch.manual_seed(2)  # reproducible sampling
        self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTION_DIM * 2))
        rmsprop = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
        self.policy = GaussianPolicy(
            self.model, rmsprop, self.space, checkpointer=DummyCheckpointer()
        )

    def test_output_shape(self):
        # Sampled actions must match (batch, ACTION_DIM) for any batch size.
        for batch_size in (1, 5):
            observation = State(torch.randn(batch_size, STATE_DIM))
            sampled = self.policy(observation).sample()
            self.assertEqual(sampled.shape, (batch_size, ACTION_DIM))

    def test_reinforce_one(self):
        observation = State(torch.randn(1, STATE_DIM))
        dist = self.policy(observation)
        sampled = dist.sample()
        before = dist.log_prob(sampled)
        self.policy.reinforce(-before.mean())

        after = self.policy(observation).log_prob(sampled)
        # Reinforcing the negative log-likelihood should make the same
        # action more probable under the updated policy.
        self.assertGreater(after.item(), before.item())

    def test_converge(self):
        observation = State(torch.randn(1, STATE_DIM))
        goal = torch.tensor([1., 2., -1.])

        # Policy-gradient style updates toward a fixed target action.
        for _ in range(1000):
            dist = self.policy(observation)
            sampled = dist.sample()
            squared_error = ((goal - sampled) ** 2).mean()
            self.policy.reinforce((squared_error * dist.log_prob(sampled)).mean())

        self.assertTrue(squared_error < 1)

    def test_eval(self):
        observation = State(torch.randn(1, STATE_DIM))
        dist = self.policy.no_grad(observation)
        # Expected values below are pinned by the manual seed in setUp.
        tt.assert_almost_equal(dist.mean, torch.tensor([[-0.237, 0.497, -0.058]]), decimal=3)
        tt.assert_almost_equal(dist.entropy(), torch.tensor([4.254]), decimal=3)
        best = self.policy.eval(observation).sample()
        tt.assert_almost_equal(best, torch.tensor([[-0.888, -0.887, 0.404]]), decimal=3)
# ---- 예제 #7 (Example #7) ----
 def setUp(self):
     # Seed the RNG so policy sampling is deterministic in every test.
     torch.manual_seed(2)
     self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTION_DIM * 2))
     rmsprop = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
     self.policy = GaussianPolicy(self.model, rmsprop, ACTION_DIM)
# ---- 예제 #8 (Example #8) ----
 def test_agent(self):
     # Build a test-time agent around a deep copy of the trained policy
     # model so the evaluation agent cannot mutate the original weights.
     test_policy = GaussianPolicy(copy.deepcopy(self.policy_model),
                                  space=self.action_space)
     return TimeFeature(PPOTestAgent(Identity(self.device), test_policy))