def test_target(self):
    self.policy = DeterministicPolicy(self.model, self.optimizer, self.space, target=FixedTarget(3))
    state = State(torch.ones(1, STATE_DIM))

    # run update step, make sure target network doesn't change
    self.policy(state).sum().backward()
    self.policy.step()
    tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

    # again...
    self.policy(state).sum().backward()
    self.policy.step()
    tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

    # third time, target should be updated
    self.policy(state).sum().backward()
    self.policy.step()
    tt.assert_allclose(
        self.policy.target(state),
        torch.tensor([[-0.574482, -0.574482, -0.574482]]),
        atol=1e-4,
    )
def test_target(self):
    self.policy = DeterministicPolicy(
        self.model, self.optimizer, self.space, target=FixedTarget(3)
    )

    # choose initial action
    state = State(torch.ones(1, STATE_DIM))
    action = self.policy(state)
    tt.assert_equal(action, torch.zeros(1, ACTION_DIM))

    # run update step, make sure target network doesn't change
    action.sum().backward(retain_graph=True)
    self.policy.step()
    tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

    # again...
    action.sum().backward(retain_graph=True)
    self.policy.step()
    tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

    # third time, target should be updated
    action.sum().backward(retain_graph=True)
    self.policy.step()
    tt.assert_allclose(
        self.policy.eval(state),
        torch.tensor([[-0.595883, -0.595883, -0.595883]]),
        atol=1e-4,
    )
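Both `test_target` variants above exercise the same contract: with `target=FixedTarget(3)`, the target network keeps returning the initial (zero-initialised, via `nn.Linear0`) outputs for the first two calls to `step()` and only copies the online weights on the third. Below is a minimal stand-alone sketch of that update rule in plain PyTorch; `FixedTargetSketch` is an illustrative name, not the library's implementation.

import copy
import torch
from torch import nn

class FixedTargetSketch:
    """Illustrative fixed-target rule: copy the online weights every `period` steps."""
    def __init__(self, model, period):
        self.model = model
        self.target_model = copy.deepcopy(model)
        self.period = period
        self._updates = 0

    def step(self):
        self._updates += 1
        if self._updates % self.period == 0:
            self.target_model.load_state_dict(self.model.state_dict())

    def target(self, x):
        with torch.no_grad():
            return self.target_model(x)

# usage: the target output stays frozen for the first two steps, then tracks the online model
online = nn.Linear(2, 3)
tgt = FixedTargetSketch(online, period=3)
x = torch.ones(1, 2)
for i in range(3):
    online.weight.data += 0.1  # stand-in for an optimizer update
    tgt.step()
    print(i + 1, torch.allclose(tgt.target(x), online(x)))
# prints: 1 False, 2 False, 3 True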
def setUp(self):
    torch.manual_seed(2)
    self.model = nn.Sequential(nn.Linear0(STATE_DIM, ACTION_DIM))
    self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
    self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
    self.policy = DeterministicPolicy(self.model, self.optimizer, self.space, 0.5)
def test_clipping(self):
    space = Box(np.array([-0.1, -0.1, -0.1]), np.array([0.1, 0.1, 0.1]))
    self.policy = DeterministicPolicy(self.model, self.optimizer, space, 0.5)
    state = State(torch.randn(1, STATE_DIM))
    action = self.policy(state).detach().numpy()
    np.testing.assert_array_almost_equal(action, np.array([[-0.1, -0.1, 0.1]]))
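The clipping test relies on the policy clamping its raw output to the `Box` bounds element-wise. A tiny illustration of that clamping in plain PyTorch, with made-up raw action values; `clip_to_box` is a hypothetical helper, not part of the library.

import torch

def clip_to_box(raw_action, low, high):
    # clamp each action dimension to the Box bounds, element-wise
    return torch.min(torch.max(raw_action, low), high)

low = torch.tensor([-0.1, -0.1, -0.1])
high = torch.tensor([0.1, 0.1, 0.1])
raw = torch.tensor([[-0.5, -0.2, 0.3]])  # illustrative unclipped policy output
print(clip_to_box(raw, low, high))  # tensor([[-0.1000, -0.1000,  0.1000]])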
def test_agent(self):
    policy = DeterministicPolicy(
        copy.deepcopy(self.policy_model),
        None,
        self.action_space,
    )
    return TimeFeature(DDPGTestAgent(policy))
def _ddpg(env, writer=DummyWriter()):
    # critic: continuous Q-function with a Polyak-averaged target network
    value_model = fc_value(env).to(device)
    value_optimizer = Adam(value_model.parameters(), lr=lr_q)
    q = QContinuous(value_model, value_optimizer, target=PolyakTarget(polyak_rate), writer=writer)

    # actor: deterministic policy with exploration noise and its own Polyak target
    policy_model = fc_policy(env).to(device)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
    policy = DeterministicPolicy(policy_model, policy_optimizer, env.action_space, noise, target=PolyakTarget(polyak_rate), writer=writer)

    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DDPG(
        q,
        policy,
        replay_buffer,
        replay_start_size=replay_start_size,
        discount_factor=discount_factor,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size,
    )
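`PolyakTarget(polyak_rate)` gives both the critic and the actor a slowly-tracking target network. The following is a sketch of a Polyak (soft) update in plain PyTorch, assuming the common convention in which a small rate mixes a fraction of the online weights into the target after every optimizer step; it illustrates the idea rather than the library's exact code.

import copy
import torch
from torch import nn

def polyak_update(online, target, rate):
    # soft update: nudge each target parameter a small step toward the online parameter
    with torch.no_grad():
        for p_online, p_target in zip(online.parameters(), target.parameters()):
            p_target.mul_(1.0 - rate).add_(rate * p_online)

online = nn.Linear(4, 2)
target = copy.deepcopy(online)
optimizer = torch.optim.Adam(online.parameters(), lr=1e-3)

# after every optimizer step, the target drifts slowly toward the online network
loss = online(torch.randn(8, 4)).pow(2).mean()
loss.backward()
optimizer.step()
polyak_update(online, target, rate=0.005)  # illustrative rate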
def _online_cacla(env, writer=DummyWriter()):
    value_model = models.critic(env, hidden1=hidden1, hidden2=hidden2).to(device)
    policy_model = models.actor(env, hidden1=hidden1, hidden2=hidden2).to(device)
    # feature_model = models.features(env.state_space.shape[0]).to(device)

    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
    # feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)
    # feature_optimizer = SGD(feature_model.parameters(), lr=lr_pi, momentum=0.9)

    policy = DeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        quiet=not log,
        clip_grad=1.0,
        writer=writer,
        normalise_inputs=True,
        box=env.state_space,
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        quiet=not log,
        writer=writer,
        normalise_inputs=True,
        box=env.state_space,
    )
    features = None  # FeatureNetwork(feature_model, feature_optimizer, writer=writer, normalize_input=False)

    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    # TODO - reintroduce TimeFeature wrapper
    return OnlineCACLA(features, v, policy, replay_buffer, env.action_space, log=log, writer=writer, discount_factor=discount_factor)
def _fac(env, writer=DummyWriter()):
    value_model = models.critic(env, hidden1=hidden1, hidden2=hidden2).to(device)
    policy_model = models.actor(env, hidden1=hidden1, hidden2=hidden2).to(device)

    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)

    policy = DeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        quiet=not log,
        clip_grad=1.0,
        writer=writer,
        normalise_inputs=True,
        box=env.state_space,
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        quiet=not log,
        writer=writer,
        normalise_inputs=True,
        box=env.state_space,
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    # TODO - reintroduce TimeFeature wrapper
    return ForwardAC(v, policy, replay_buffer, env.action_space, log=log, trace_decay=trace_decay, writer=writer, discount_factor=discount_factor)
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    # number of optimizer updates available for learning-rate annealing
    n_updates = (train_steps - self.hyperparameters["replay_start_size"]) / self.hyperparameters["update_frequency"]

    q_optimizer = Adam(self.q_model.parameters(), lr=self.hyperparameters["lr_q"])
    q = QContinuous(
        self.q_model,
        q_optimizer,
        target=PolyakTarget(self.hyperparameters["polyak_rate"]),
        scheduler=CosineAnnealingLR(q_optimizer, n_updates),
        writer=writer,
    )

    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"])
    policy = DeterministicPolicy(
        self.policy_model,
        policy_optimizer,
        self.action_space,
        target=PolyakTarget(self.hyperparameters["polyak_rate"]),
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        writer=writer,
    )

    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters["replay_buffer_size"],
        device=self.device,
    )

    return TimeFeature(
        DDPG(
            q,
            policy,
            replay_buffer,
            self.action_space,
            noise=self.hyperparameters["noise"],
            replay_start_size=self.hyperparameters["replay_start_size"],
            discount_factor=self.hyperparameters["discount_factor"],
            update_frequency=self.hyperparameters["update_frequency"],
            minibatch_size=self.hyperparameters["minibatch_size"],
        )
    )
def _ddpg(env, writer=DummyWriter()):
    # anneal learning rates over the updates remaining after the replay warm-up
    final_anneal_step = (last_frame - replay_start_size) // update_frequency

    q_model = fc_q(env).to(device)
    q_optimizer = Adam(q_model.parameters(), lr=lr_q)
    q = QContinuous(
        q_model,
        q_optimizer,
        target=PolyakTarget(polyak_rate),
        scheduler=CosineAnnealingLR(q_optimizer, final_anneal_step),
        writer=writer,
    )

    policy_model = fc_deterministic_policy(env).to(device)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
    policy = DeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        target=PolyakTarget(polyak_rate),
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        writer=writer,
    )

    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    return TimeFeature(
        DDPG(
            q,
            policy,
            replay_buffer,
            env.action_space,
            noise=noise,
            replay_start_size=replay_start_size,
            discount_factor=discount_factor,
            update_frequency=update_frequency,
            minibatch_size=minibatch_size,
        )
    )
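Both scheduler set-ups above anneal the learning rate over the number of gradient updates that will actually occur: no updates happen before `replay_start_size` frames have been collected, and afterwards one update is made every `update_frequency` frames. A quick check with illustrative values (not the preset's actual defaults):

# illustrative values only, not the preset's defaults
last_frame = 1_000_000
replay_start_size = 5_000
update_frequency = 1
final_anneal_step = (last_frame - replay_start_size) // update_frequency
print(final_anneal_step)  # 995000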
class TestDeterministic(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear0(STATE_DIM, ACTION_DIM))
        self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
        self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
        self.policy = DeterministicPolicy(self.model, self.optimizer, self.space, checkpointer=DummyCheckpointer())

    def test_output_shape(self):
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.shape, (1, ACTION_DIM))
        state = State(torch.randn(5, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.shape, (5, ACTION_DIM))

    def test_step_one(self):
        state = State(torch.randn(1, STATE_DIM))
        self.policy(state)
        self.policy.step()

    def test_converge(self):
        state = State(torch.randn(1, STATE_DIM))
        target = torch.tensor([0.25, 0.5, -0.5])
        for _ in range(0, 200):
            action = self.policy(state)
            loss = ((target - action) ** 2).mean()
            loss.backward()
            self.policy.step()
        self.assertLess(loss, 0.001)

    def test_target(self):
        self.policy = DeterministicPolicy(self.model, self.optimizer, self.space, target=FixedTarget(3))
        state = State(torch.ones(1, STATE_DIM))

        # run update step, make sure target network doesn't change
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # again...
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # third time, target should be updated
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_allclose(
            self.policy.target(state),
            torch.tensor([[-0.574482, -0.574482, -0.574482]]),
            atol=1e-4,
        )
class TestDeterministic(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.model = nn.Sequential(nn.Linear0(STATE_DIM, ACTION_DIM))
        self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01)
        self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]))
        self.policy = DeterministicPolicy(self.model, self.optimizer, self.space, 0.5)

    def test_output_shape(self):
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.shape, (1, ACTION_DIM))
        state = State(torch.randn(5, STATE_DIM))
        action = self.policy(state)
        self.assertEqual(action.shape, (5, ACTION_DIM))

    def test_clipping(self):
        space = Box(np.array([-0.1, -0.1, -0.1]), np.array([0.1, 0.1, 0.1]))
        self.policy = DeterministicPolicy(self.model, self.optimizer, space, 0.5)
        state = State(torch.randn(1, STATE_DIM))
        action = self.policy(state).detach().numpy()
        np.testing.assert_array_almost_equal(action, np.array([[-0.1, -0.1, 0.1]]))

    def test_step_one(self):
        state = State(torch.randn(1, STATE_DIM))
        self.policy(state)
        self.policy.step()

    def test_converge(self):
        state = State(torch.randn(1, STATE_DIM))
        target = torch.tensor([1., 2., -1.])
        for _ in range(0, 100):
            action = self.policy.greedy(state)
            loss = torch.abs(target - action).mean()
            loss.backward()
            self.policy.step()
        self.assertTrue(loss < 0.1)

    def test_target(self):
        self.policy = DeterministicPolicy(self.model, self.optimizer, self.space, 0.5, target=FixedTarget(3))

        # choose initial action
        state = State(torch.ones(1, STATE_DIM))
        action = self.policy.greedy(state)
        tt.assert_equal(action, torch.zeros(1, ACTION_DIM))

        # run update step, make sure target network doesn't change
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # again...
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # third time, target should be updated
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_allclose(self.policy.eval(state), torch.tensor([[-0.686739, -0.686739, -0.686739]]))