class TestGaussian(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTION_DIM * 2))
        optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
        self.policy = GaussianPolicy(self.model, optimizer, ACTION_DIM)

    def test_output_shape(self):
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.shape, (1, ACTION_DIM))
        state = State(torch.randn(5, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.shape, (5, ACTION_DIM))

    def test_reinforce_one(self):
        state = State(torch.randn(1, STATE_DIM))
        self.policy(state)
        self.policy.reinforce(torch.tensor([1]).float())

    def test_converge(self):
        state = State(torch.randn(1, STATE_DIM))
        target = torch.tensor([1., 2., -1.])
        for _ in range(0, 1000):
            action = self.policy(state)
            loss = torch.abs(target - action).mean()
            self.policy.reinforce(-loss)
        self.assertTrue(loss < 1)
def _ppo(envs, writer=DummyWriter()):
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs)
    env = envs[0]

    feature_model, value_model, policy_model = fc_actor_critic(env)
    feature_model.to(device)
    value_model.to(device)
    policy_model.to(device)

    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer,
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = GaussianPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )

    return TimeFeature(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                clip_initial,
                clip_final,
                0,
                final_anneal_step,
                name='clip',
                writer=writer,
            ),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        )
    )
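# Illustrative only: the free names above (last_frame, epochs, minibatches,
# n_steps, n_envs, lr, eps, clip_grad, ...) are hyperparameters supplied by the
# enclosing preset and are not defined in this snippet. With hypothetical
# defaults such as last_frame=2_000_000, epochs=20, minibatches=4, n_steps=128,
# and n_envs=32, the scheduler horizon works out to
#     2_000_000 * 20 * 4 / (128 * 32) = 39_062.5
# In other words, final_anneal_step counts the total number of minibatch
# updates over training, so CosineAnnealingLR and the LinearScheduler for the
# clip parameter each advance once per update.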
class TestGaussian(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
        self.model = nn.Sequential(nn.Linear(STATE_DIM, ACTION_DIM * 2))
        optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
        self.policy = GaussianPolicy(self.model, optimizer, self.space)

    def test_output_shape(self):
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state).sample()
        self.assertEqual(action.shape, (1, ACTION_DIM))
        state = State(torch.randn(5, STATE_DIM))
        action = self.policy(state).sample()
        self.assertEqual(action.shape, (5, ACTION_DIM))

    def test_reinforce_one(self):
        state = State(torch.randn(1, STATE_DIM))
        dist = self.policy(state)
        action = dist.sample()
        log_prob1 = dist.log_prob(action)
        loss = -log_prob1.mean()
        self.policy.reinforce(loss)

        dist = self.policy(state)
        log_prob2 = dist.log_prob(action)
        self.assertGreater(log_prob2.item(), log_prob1.item())

    def test_converge(self):
        state = State(torch.randn(1, STATE_DIM))
        target = torch.tensor([1., 2., -1.])
        for _ in range(0, 1000):
            dist = self.policy(state)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            error = ((target - action) ** 2).mean()
            loss = (error * log_prob).mean()
            self.policy.reinforce(loss)
        self.assertTrue(error < 1)
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (
        train_steps
        * self.hyperparameters['epochs']
        * self.hyperparameters['minibatches']
        / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])
    )

    value_optimizer = Adam(
        self.value_model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps'],
    )
    policy_optimizer = Adam(
        self.policy_model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps'],
    )

    features = Identity(self.device)
    v = VNetwork(
        self.value_model,
        value_optimizer,
        loss_scaling=self.hyperparameters['value_loss_scaling'],
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
    )
    policy = GaussianPolicy(
        self.policy_model,
        policy_optimizer,
        self.action_space,
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
    )

    return TimeFeature(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                self.hyperparameters['clip_initial'],
                self.hyperparameters['clip_final'],
                0,
                n_updates,
                name='clip',
                writer=writer,
            ),
            epochs=self.hyperparameters['epochs'],
            minibatches=self.hyperparameters['minibatches'],
            n_envs=self.hyperparameters['n_envs'],
            n_steps=self.hyperparameters['n_steps'],
            discount_factor=self.hyperparameters['discount_factor'],
            lam=self.hyperparameters['lam'],
            entropy_loss_scaling=self.hyperparameters['entropy_loss_scaling'],
            writer=writer,
        )
    )
class TestGaussian(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
        self.model = nn.Sequential(
            nn.Linear(STATE_DIM, ACTION_DIM * 2)
        )
        optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
        self.policy = GaussianPolicy(self.model, optimizer, self.space, checkpointer=DummyCheckpointer())

    def test_output_shape(self):
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state).sample()
        self.assertEqual(action.shape, (1, ACTION_DIM))
        state = State(torch.randn(5, STATE_DIM))
        action = self.policy(state).sample()
        self.assertEqual(action.shape, (5, ACTION_DIM))

    def test_reinforce_one(self):
        state = State(torch.randn(1, STATE_DIM))
        dist = self.policy(state)
        action = dist.sample()
        log_prob1 = dist.log_prob(action)
        loss = -log_prob1.mean()
        self.policy.reinforce(loss)

        dist = self.policy(state)
        log_prob2 = dist.log_prob(action)
        self.assertGreater(log_prob2.item(), log_prob1.item())

    def test_converge(self):
        state = State(torch.randn(1, STATE_DIM))
        target = torch.tensor([1., 2., -1.])
        for _ in range(0, 1000):
            dist = self.policy(state)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            error = ((target - action) ** 2).mean()
            loss = (error * log_prob).mean()
            self.policy.reinforce(loss)
        self.assertTrue(error < 1)

    def test_eval(self):
        state = State(torch.randn(1, STATE_DIM))
        dist = self.policy.no_grad(state)
        tt.assert_almost_equal(dist.mean, torch.tensor([[-0.237, 0.497, -0.058]]), decimal=3)
        tt.assert_almost_equal(dist.entropy(), torch.tensor([4.254]), decimal=3)
        best = self.policy.eval(state).sample()
        tt.assert_almost_equal(best, torch.tensor([[-0.888, -0.887, 0.404]]), decimal=3)
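# A minimal sketch of the module-level imports and constants the TestGaussian
# variants above rely on but do not define. The STATE_DIM value and the import
# sources marked "assumed" are illustrative assumptions, not part of the
# snippets themselves. State, GaussianPolicy, and DummyCheckpointer come from
# the library under test; their module paths differ between the library
# versions shown here, so they are omitted from this sketch.
import unittest
import numpy as np
import torch
import torch.nn as nn
import torch_testing as tt  # assumed: provides the tt.assert_almost_equal helper used in test_eval
from gym.spaces import Box  # assumed source of the Box action space

STATE_DIM = 2   # hypothetical placeholder; the real test module sets its own value
ACTION_DIM = 3  # matches the 3-element Box bounds and target tensors above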
def test_agent(self):
    policy = GaussianPolicy(copy.deepcopy(self.policy_model), space=self.action_space)
    return TimeFeature(PPOTestAgent(Identity(self.device), policy))
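# A library-free sketch of why the policy models above output ACTION_DIM * 2
# values: a Gaussian policy conventionally splits the network output into a
# mean and a log standard deviation for a diagonal Normal distribution. This
# illustrates the general pattern only; it is not the library's exact
# GaussianPolicy implementation.
import torch
import torch.nn as nn
from torch.distributions import Independent, Normal

STATE_DIM, ACTION_DIM = 2, 3  # hypothetical sizes matching the tests above
model = nn.Linear(STATE_DIM, ACTION_DIM * 2)

state = torch.randn(1, STATE_DIM)
outputs = model(state)
mean, log_std = outputs[:, :ACTION_DIM], outputs[:, ACTION_DIM:]
dist = Independent(Normal(mean, log_std.exp()), 1)

action = dist.sample()            # shape (1, ACTION_DIM), as test_output_shape expects
log_prob = dist.log_prob(action)  # one value per batch element, used in the REINFORCE-style losses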