class TestExperienceReplayBuffer(unittest.TestCase):
    def setUp(self):
        np.random.seed(1)
        random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = ExperienceReplayBuffer(5)

    def test_run(self):
        states = torch.arange(0, 20)
        actions = torch.arange(0, 20)
        rewards = torch.arange(0, 20)
        expected_samples = torch.tensor([
            [0, 0, 0],
            [1, 1, 0],
            [0, 1, 1],
            [3, 0, 0],
            [1, 4, 4],
            [1, 2, 4],
            [2, 4, 3],
            [4, 7, 4],
            [7, 4, 6],
            [6, 5, 6],
        ])
        expected_weights = np.ones((10, 3))
        actual_samples = []
        actual_weights = []
        for i in range(10):
            state = State(states[i].unsqueeze(0), torch.tensor([1]))
            next_state = State(states[i + 1].unsqueeze(0), torch.tensor([1]))
            self.replay_buffer.store(state, actions[i], rewards[i], next_state)
            sample = self.replay_buffer.sample(3)
            actual_samples.append(sample[0].features)
            actual_weights.append(sample[-1])
        tt.assert_equal(
            torch.cat(actual_samples).view(expected_samples.shape),
            expected_samples
        )
        np.testing.assert_array_equal(expected_weights, np.vstack(actual_weights))
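# The all-ones `expected_weights` in the test above come from uniform sampling:
# a plain experience replay buffer treats every stored transition as equally
# likely, so the importance weight attached to each sample is 1. A minimal,
# library-agnostic sketch of that behaviour (class and method names here are
# illustrative, not the all.memory.ExperienceReplayBuffer implementation):
import random
from collections import deque


class SimpleReplayBuffer:
    def __init__(self, size):
        # oldest transitions are evicted first once capacity is reached
        self.buffer = deque(maxlen=size)

    def store(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        # uniform sampling: every transition is equally likely, so the
        # importance-sampling weight of each sampled transition is 1.0
        batch = random.choices(self.buffer, k=batch_size)
        weights = [1.0] * batch_size
        return batch, weights


# usage: store (state, action, reward) placeholders, then sample a minibatch
buffer = SimpleReplayBuffer(5)
for t in range(10):
    buffer.store((t, t, float(t)))
batch, weights = buffer.sample(3)
assert weights == [1.0, 1.0, 1.0]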
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'])
    q = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return C51(
        q,
        replay_buffer,
        exploration=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            0,
            self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"],
            name="epsilon",
            writer=writer,
        ),
        discount_factor=self.hyperparameters["discount_factor"],
        minibatch_size=self.hyperparameters["minibatch_size"],
        replay_start_size=self.hyperparameters["replay_start_size"],
        update_frequency=self.hyperparameters["update_frequency"],
        writer=writer
    )
def _dqn(env, writer=DummyWriter()):
    _model = nature_dqn(env).to(device)
    _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        _model,
        _optimizer,
        env.action_space.n,
        target=FixedTarget(target_update_frequency),
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
    )
def _dqn(env, writer=DummyWriter()):
    model = fc_relu_q(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        target=FixedTarget(target_update_frequency),
        loss=mse_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DQN(
        q,
        policy,
        replay_buffer,
        discount_factor=discount_factor,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size
    )
def _c51(env, writer=DummyWriter()):
    model = fc_relu_dist_q(env, atoms=atoms).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return C51(
        q,
        replay_buffer,
        exploration=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer,
        ),
        discount_factor=discount_factor,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        writer=writer
    )
def test_store_device(self):
    if torch.cuda.is_available():
        self.replay_buffer = ExperienceReplayBuffer(5, device='cuda', store_device='cpu')
        states = torch.arange(0, 20).to('cuda')
        actions = torch.arange(0, 20).view((-1, 1)).to('cuda')
        rewards = torch.arange(0, 20).to('cuda')
        state = State(states[0])
        next_state = State(states[1], reward=rewards[1])
        self.replay_buffer.store(state, actions[0], next_state)
        sample = self.replay_buffer.sample(3)
        self.assertEqual(sample[0].device, torch.device('cuda'))
        self.assertEqual(self.replay_buffer.buffer[0][0].device, torch.device('cpu'))
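# The `device`/`store_device` split exercised above is a memory-offloading
# pattern: transitions live in CPU memory and only the sampled minibatch is
# moved to the GPU. A hedged sketch of the idea in plain PyTorch (names are
# illustrative, not the library's implementation):
import random

import torch


class OffloadedReplayBuffer:
    """Keeps transitions on store_device, returns samples on device."""

    def __init__(self, size, device='cuda', store_device='cpu'):
        self.size = size
        self.device = torch.device(device)
        self.store_device = torch.device(store_device)
        self.buffer = []

    def store(self, *transition):
        # move every tensor in the transition to the storage device (e.g. CPU RAM)
        self.buffer.append(tuple(t.to(self.store_device) for t in transition))
        self.buffer = self.buffer[-self.size:]

    def sample(self, batch_size):
        # only the sampled minibatch is transferred to the compute device
        batch = random.choices(self.buffer, k=batch_size)
        return [tuple(t.to(self.device) for t in transition) for transition in batch]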
def _dqn(env, writer=DummyWriter()):
    _model = model
    _optimizer = optimizer
    if _model is None:
        _model = conv_net(env, frames=agent_history_length).to(device)
    if _optimizer is None:
        _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        _model,
        _optimizer,
        env.action_space.n,
        target_update_frequency=target_update_frequency,
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        annealing_start=replay_start_size,
        annealing_time=final_exploration_frame - replay_start_size,
        initial_epsilon=initial_exploration,
        final_epsilon=final_exploration
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
        env,
        action_repeat=action_repeat,
        frame_stack=agent_history_length,
        noop_max=noop_max
    )
def _ddqn(env, writer=DummyWriter()):
    action_repeat = 1
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = SharedAutonomyPolicy(
        q,
        env.action_space.n,
        epsilon=0,
        pilot_tol=pilot_tol
    )
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            replay_buffer_size,
            alpha=alpha,
            beta=beta,
            device=device
        )
    else:
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return co_DDQN(
        q,
        policy,
        replay_buffer,
        loss=weighted_smooth_l1_loss,
        discount_factor=discount_factor,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'])
    q = QNetwork(
        self.model,
        optimizer,
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DQN(
        q,
        policy,
        replay_buffer,
        discount_factor=self.hyperparameters['discount_factor'],
        minibatch_size=self.hyperparameters['minibatch_size'],
        replay_start_size=self.hyperparameters['replay_start_size'],
        update_frequency=self.hyperparameters['update_frequency'],
    )
def _model_predictive_dqn(env, writer=None):
    # models
    feature_model = shared_feature_layers().to(device)
    value_model = value_head().to(device)
    reward_model = reward_head(env).to(device)
    generator_model = Generator(env).to(device)

    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    reward_optimizer = Adam(reward_model.parameters(), lr=lr, eps=eps)
    generator_optimizer = Adam(generator_model.parameters(), lr=lr, eps=eps)

    # approximators
    f = FeatureNetwork(feature_model, feature_optimizer, writer=writer)
    v = VNetwork(value_model, value_optimizer, writer=writer)
    r = QNetwork(reward_model, reward_optimizer, name='reward', writer=writer)
    g = Approximation(generator_model, generator_optimizer, name='generator', writer=writer)

    # replay buffer
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    # create agent
    agent = ModelPredictiveDQN(
        f, v, r, g,
        replay_buffer,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size
    )

    # apply agent wrappers for better atari performance
    return DeepmindAtariBody(agent, lazy_frames=True)
def _ddpg(env, writer=DummyWriter()):
    value_model = fc_value(env).to(device)
    value_optimizer = Adam(value_model.parameters(), lr=lr_q)
    q = QContinuous(
        value_model,
        value_optimizer,
        target=PolyakTarget(polyak_rate),
        writer=writer
    )
    policy_model = fc_policy(env).to(device)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
    policy = DeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        noise,
        target=PolyakTarget(polyak_rate),
        writer=writer
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DDPG(
        q,
        policy,
        replay_buffer,
        replay_start_size=replay_start_size,
        discount_factor=discount_factor,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size
    )
def _dqn(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    model = nature_dqn(env).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = QNetwork(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_step - replay_start_size,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        replay_buffer_size,
        device=device
    )
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            loss=smooth_l1_loss,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
        lazy_frames=True
    )
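# Several of the presets above anneal epsilon with LinearScheduler. The general
# pattern is linear interpolation after a warm-up offset; the sketch below is a
# standalone illustration of that pattern only (it does not claim to match the
# library's exact argument semantics, which differ between versions):
class LinearAnnealingSchedule:
    """Holds `initial` for `start` steps, then anneals linearly to `final` over `duration` steps."""

    def __init__(self, initial, final, start, duration):
        self.initial = initial
        self.final = final
        self.start = start
        self.duration = duration
        self.t = 0

    def value(self):
        progress = min(max(self.t - self.start, 0) / self.duration, 1.0)
        return self.initial + progress * (self.final - self.initial)

    def step(self):
        self.t += 1


# e.g. hold epsilon at 1.0 for an 80000-step warm-up, then anneal to 0.02
# over 250000 steps (hypothetical numbers, for illustration only)
epsilon = LinearAnnealingSchedule(1.0, 0.02, 80000, 250000)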
def test_run(self):
    np.random.seed(1)
    random.seed(1)
    torch.manual_seed(1)
    self.replay_buffer = ExperienceReplayBuffer(5)
    states = torch.arange(0, 20)
    actions = torch.arange(0, 20).view((-1, 1))
    rewards = torch.arange(0, 20)
    expected_samples = torch.tensor([
        [0, 0, 0],
        [1, 1, 0],
        [0, 1, 1],
        [3, 0, 0],
        [1, 4, 4],
        [1, 2, 4],
        [2, 4, 3],
        [4, 7, 4],
        [7, 4, 6],
        [6, 5, 6],
    ])
    expected_weights = np.ones((10, 3))
    actual_samples = []
    actual_weights = []
    for i in range(10):
        state = State(states[i])
        next_state = State(states[i + 1], reward=rewards[i])
        self.replay_buffer.store(state, actions[i], next_state)
        sample = self.replay_buffer.sample(3)
        actual_samples.append(sample[0].observation)
        actual_weights.append(sample[-1])
    tt.assert_equal(
        torch.cat(actual_samples).view(expected_samples.shape),
        expected_samples
    )
    np.testing.assert_array_equal(expected_weights, np.vstack(actual_weights))
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QNetwork(
        self.model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=self.hyperparameters['discount_factor'],
            loss=smooth_l1_loss,
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
        ),
        lazy_frames=True
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DeepmindAtariBody(
        C51(
            q,
            replay_buffer,
            exploration=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"],
                name="epsilon",
                writer=writer,
            ),
            discount_factor=self.hyperparameters["discount_factor"],
            minibatch_size=self.hyperparameters["minibatch_size"],
            replay_start_size=self.hyperparameters["replay_start_size"],
            update_frequency=self.hyperparameters["update_frequency"],
            writer=writer
        ),
        lazy_frames=True,
        episodic_lives=True
    )
class TestExperienceReplayBuffer(unittest.TestCase):
    def test_run(self):
        np.random.seed(1)
        random.seed(1)
        torch.manual_seed(1)
        self.replay_buffer = ExperienceReplayBuffer(5)
        states = torch.arange(0, 20)
        actions = torch.arange(0, 20).view((-1, 1))
        rewards = torch.arange(0, 20)
        expected_samples = torch.tensor([
            [0, 0, 0],
            [1, 1, 0],
            [0, 1, 1],
            [3, 0, 0],
            [1, 4, 4],
            [1, 2, 4],
            [2, 4, 3],
            [4, 7, 4],
            [7, 4, 6],
            [6, 5, 6],
        ])
        expected_weights = np.ones((10, 3))
        actual_samples = []
        actual_weights = []
        for i in range(10):
            state = State(states[i])
            next_state = State(states[i + 1], reward=rewards[i])
            self.replay_buffer.store(state, actions[i], next_state)
            sample = self.replay_buffer.sample(3)
            actual_samples.append(sample[0].observation)
            actual_weights.append(sample[-1])
        tt.assert_equal(
            torch.cat(actual_samples).view(expected_samples.shape),
            expected_samples
        )
        np.testing.assert_array_equal(expected_weights, np.vstack(actual_weights))

    def test_store_device(self):
        if torch.cuda.is_available():
            self.replay_buffer = ExperienceReplayBuffer(5, device='cuda', store_device='cpu')
            states = torch.arange(0, 20).to('cuda')
            actions = torch.arange(0, 20).view((-1, 1)).to('cuda')
            rewards = torch.arange(0, 20).to('cuda')
            state = State(states[0])
            next_state = State(states[1], reward=rewards[1])
            self.replay_buffer.store(state, actions[0], next_state)
            sample = self.replay_buffer.sample(3)
            self.assertEqual(sample[0].device, torch.device('cuda'))
            self.assertEqual(self.replay_buffer.buffer[0][0].device, torch.device('cpu'))
def _c51(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    model = nature_c51(env, atoms=atoms).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        scheduler=CosineAnnealingLR(optimizer, last_update),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        replay_buffer_size,
        device=device
    )
    return DeepmindAtariBody(
        C51(
            q,
            replay_buffer,
            exploration=LinearScheduler(
                initial_exploration,
                final_exploration,
                0,
                last_timestep,
                name="epsilon",
                writer=writer,
            ),
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
            writer=writer
        ),
        lazy_frames=True
    )
def _online_cacla(env, writer=DummyWriter()):
    value_model = models.critic(env, hidden1=hidden1, hidden2=hidden2).to(device)
    policy_model = models.actor(env, hidden1=hidden1, hidden2=hidden2).to(device)
    # feature_model = models.features(env.state_space.shape[0]).to(device)

    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
    # feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)
    # feature_optimizer = SGD(feature_model.parameters(), lr=lr_pi, momentum=0.9)

    policy = DeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        quiet=not log,
        clip_grad=1.0,
        writer=writer,
        normalise_inputs=True,
        box=env.state_space,
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        quiet=not log,
        writer=writer,
        normalise_inputs=True,
        box=env.state_space,
    )
    features = None  # FeatureNetwork(feature_model, feature_optimizer, writer=writer, normalize_input=False)

    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    # TODO - reintroduce TimeFeature wrapper
    return OnlineCACLA(
        features,
        v,
        policy,
        replay_buffer,
        env.action_space,
        log=log,
        writer=writer,
        discount_factor=discount_factor
    )
def _sac(env, writer=DummyWriter()):
    q_1_model = fc_q(env).to(device)
    q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q)
    q_1 = QContinuous(q_1_model, q_1_optimizer, writer=writer, name='q_1')

    q_2_model = fc_q(env).to(device)
    q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q)
    q_2 = QContinuous(q_2_model, q_2_optimizer, writer=writer, name='q_2')

    v_model = fc_v(env).to(device)
    v_optimizer = Adam(v_model.parameters(), lr=lr_v)
    v = VNetwork(
        v_model,
        v_optimizer,
        target=PolyakTarget(polyak_rate),
        writer=writer,
        name='v',
    )

    policy_model = fc_policy(env).to(device)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
    policy = SoftDeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        writer=writer
    )

    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    return SAC(
        policy,
        q_1,
        q_2,
        v,
        replay_buffer,
        entropy_target=(-env.action_space.shape[0] * entropy_target_scaling),
        lr_temperature=lr_temperature,
        replay_start_size=replay_start_size,
        discount_factor=discount_factor,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size,
        writer=writer
    )
def _fac(env, writer=DummyWriter()):
    value_model = models.critic(env, hidden1=hidden1, hidden2=hidden2).to(device)
    policy_model = models.actor(env, hidden1=hidden1, hidden2=hidden2).to(device)

    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)

    policy = DeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        quiet=not log,
        clip_grad=1.0,
        writer=writer,
        normalise_inputs=True,
        box=env.state_space,
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        quiet=not log,
        writer=writer,
        normalise_inputs=True,
        box=env.state_space,
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    # TODO - reintroduce TimeFeature wrapper
    return ForwardAC(
        v,
        policy,
        replay_buffer,
        env.action_space,
        log=log,
        trace_decay=trace_decay,
        writer=writer,
        discount_factor=discount_factor
    )
def _c51(env, writer=DummyWriter()):
    model = nature_c51(env, atoms=51).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        replay_buffer_size,
        device=device
    )
    return DeepmindAtariBody(
        C51(
            q,
            replay_buffer,
            exploration=LinearScheduler(
                initial_exploration,
                final_exploration,
                replay_start_size,
                final_exploration_frame,
                name="epsilon",
                writer=writer,
            ),
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
            writer=writer
        )
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')): n_updates = (train_steps - self.hyperparameters["replay_start_size"] ) / self.hyperparameters["update_frequency"] q_optimizer = Adam(self.q_model.parameters(), lr=self.hyperparameters["lr_q"]) q = QContinuous(self.q_model, q_optimizer, target=PolyakTarget( self.hyperparameters["polyak_rate"]), scheduler=CosineAnnealingLR(q_optimizer, n_updates), writer=writer) policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"]) policy = DeterministicPolicy( self.policy_model, policy_optimizer, self.action_space, target=PolyakTarget(self.hyperparameters["polyak_rate"]), scheduler=CosineAnnealingLR(policy_optimizer, n_updates), writer=writer) replay_buffer = ExperienceReplayBuffer( self.hyperparameters["replay_buffer_size"], device=self.device) return TimeFeature( DDPG( q, policy, replay_buffer, self.action_space, noise=self.hyperparameters["noise"], replay_start_size=self.hyperparameters["replay_start_size"], discount_factor=self.hyperparameters["discount_factor"], update_frequency=self.hyperparameters["update_frequency"], minibatch_size=self.hyperparameters["minibatch_size"], ))
def _ddpg(env, writer=DummyWriter()):
    final_anneal_step = (last_frame - replay_start_size) // update_frequency

    q_model = fc_q(env).to(device)
    q_optimizer = Adam(q_model.parameters(), lr=lr_q)
    q = QContinuous(
        q_model,
        q_optimizer,
        target=PolyakTarget(polyak_rate),
        scheduler=CosineAnnealingLR(q_optimizer, final_anneal_step),
        writer=writer
    )

    policy_model = fc_deterministic_policy(env).to(device)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
    policy = DeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        target=PolyakTarget(polyak_rate),
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        writer=writer
    )

    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    return TimeFeature(
        DDPG(
            q,
            policy,
            replay_buffer,
            env.action_space,
            noise=noise,
            replay_start_size=replay_start_size,
            discount_factor=discount_factor,
            update_frequency=update_frequency,
            minibatch_size=minibatch_size,
        )
    )
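# PolyakTarget(polyak_rate) in the DDPG and SAC presets refers to Polyak (soft)
# target-network averaging: after each update the target parameters move a small
# step toward the online parameters. A minimal sketch of the update rule,
# independent of the library's wrapper classes (the convention assumed here is
# that `rate` weights the online parameters):
import torch


@torch.no_grad()
def polyak_update(target_net, online_net, rate):
    """theta_target <- rate * theta_online + (1 - rate) * theta_target."""
    for target_param, online_param in zip(target_net.parameters(), online_net.parameters()):
        target_param.mul_(1.0 - rate).add_(rate * online_param)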
def _dqn(env, writer=DummyWriter()):
    model = build_model(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        target_update_frequency=target_update_frequency,
        loss=mse_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        initial_epsilon=initial_exploration,
        final_epsilon=final_exploration,
        annealing_time=final_exploration_frame
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DQN(
        q,
        policy,
        replay_buffer,
        discount_factor=discount_factor,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size
    )
def agent(self, writer=DummyWriter(), train_steps=float("inf")): # optimizers feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) reward_optimizer = Adam(self.reward_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) generator_optimizer = Adam(self.generator_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) # approximators f = FeatureNetwork(self.feature_model, feature_optimizer, writer=writer) v = VNetwork(self.value_model, value_optimizer, writer=writer) r = QNetwork(self.reward_model, reward_optimizer, name="reward", writer=writer) g = Approximation(self.generator_model, generator_optimizer, name="generator", writer=writer) # replay buffer replay_buffer = ExperienceReplayBuffer(self.hyperparameters["replay_buffer_size"], device=self.device) # create agent agent = ModelBasedDQN(f, v, r, g, replay_buffer, minibatch_size=self.hyperparameters["minibatch_size"], replay_start_size=self.hyperparameters["replay_start_size"] ) # apply atari wrappers for better performance return DeepmindAtariBody(agent, lazy_frames=True)
def _dqn(env, writers=None):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    n_agents = len(env.agents)
    n_actions = env.action_spaces['first_0'].n

    model = model_constructor(env).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = Approximation(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writers['first_0']
    )
    replay_buffer = ExperienceReplayBuffer(
        replay_buffer_size,
        store_device=device,
        device=device
    )

    def agent_constructor(writer):
        policy = GreedyPolicy(
            q,
            n_actions,
            epsilon=LinearScheduler(
                initial_exploration,
                final_exploration,
                replay_start_size,
                final_exploration_step - replay_start_size,
                name="epsilon",
                writer=writer
            )
        )
        return DeepmindAtariBody(
            DQN(
                q,
                policy,
                replay_buffer,
                discount_factor=discount_factor,
                loss=smooth_l1_loss,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
            ),
            lazy_frames=True
        )

    return MultiagentEncoder(
        IndependentMultiagent({
            agent: agent_constructor(writers[agent])
            for agent in env.agents
        }),
        env.agents,
        device
    )
def _sac(env, writer=DummyWriter()):
    final_anneal_step = (last_frame - replay_start_size) // update_frequency

    v_model = v_model_constructor(env).to(device)
    q_1_model = q1_model_constructor(env).to(device)
    q_2_model = q2_model_constructor(env).to(device)
    # quick and dirty implementation of parallel branch un/freeze
    policy_model = policy_model_constructor(env=env, train_parallel=train_parallel).to(device)

    if pretrained_models is not None:
        q_1_model = pretrained_models.q_1.model.to(device)
        q_2_model = pretrained_models.q_2.model.to(device)
        v_model = pretrained_models.v.model.to(device)
        policy_model = pretrained_models.policy.model.to(device)

    q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q)
    q_1 = QContinuousCtrlRep(
        q_1_model,
        q_1_optimizer,
        scheduler=CosineAnnealingLR(q_1_optimizer, final_anneal_step),
        target=FixedTarget(1000),
        writer=writer,
        name='q_1'
    )

    q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q)
    q_2 = QContinuousCtrlRep(
        q_2_model,
        q_2_optimizer,
        scheduler=CosineAnnealingLR(q_2_optimizer, final_anneal_step),
        target=FixedTarget(1000),
        writer=writer,
        name='q_2'
    )

    v_optimizer = Adam(v_model.parameters(), lr=lr_v)
    v = VNetworkCtrlRep(
        v_model,
        v_optimizer,
        scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
        target=PolyakTarget(polyak_rate),
        writer=writer,
        name='v',
    )

    policy_optimizer = Adam(filter(lambda p: p.requires_grad, policy_model.parameters()), lr=lr_pi)
    policy = SoftDeterministicPolicyCtrlRep(
        policy_model,
        policy_optimizer,
        env.action_space,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        target=FixedTarget(1000),
        writer=writer
    )

    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    return TimeFeature(
        SACCtrlRep(
            policy=policy,
            q_1=q_1,
            q_2=q_2,
            v=v,
            replay_buffer=replay_buffer,
            temperature_initial=temperature_initial,
            entropy_target=(-env.action_space.shape[0] * entropy_target_scaling),
            lr_temperature=lr_temperature,
            replay_start_size=replay_start_size,
            discount_factor=discount_factor,
            update_frequency=update_frequency,
            minibatch_size=minibatch_size,
            writer=writer
        )
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')): n_updates = (train_steps - self.hyperparameters["replay_start_size"] ) / self.hyperparameters["update_frequency"] q_1_optimizer = Adam(self.q_1_model.parameters(), lr=self.hyperparameters["lr_q"]) q_1 = QContinuous(self.q_1_model, q_1_optimizer, scheduler=CosineAnnealingLR(q_1_optimizer, n_updates), writer=writer, name='q_1') q_2_optimizer = Adam(self.q_2_model.parameters(), lr=self.hyperparameters["lr_q"]) q_2 = QContinuous(self.q_2_model, q_2_optimizer, scheduler=CosineAnnealingLR(q_2_optimizer, n_updates), writer=writer, name='q_2') v_optimizer = Adam(self.v_model.parameters(), lr=self.hyperparameters["lr_v"]) v = VNetwork( self.v_model, v_optimizer, scheduler=CosineAnnealingLR(v_optimizer, n_updates), target=PolyakTarget(self.hyperparameters["polyak_rate"]), writer=writer, name='v', ) policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"]) policy = SoftDeterministicPolicy(self.policy_model, policy_optimizer, self.action_space, scheduler=CosineAnnealingLR( policy_optimizer, n_updates), writer=writer) replay_buffer = ExperienceReplayBuffer( self.hyperparameters["replay_buffer_size"], device=self.device) return TimeFeature( SAC(policy, q_1, q_2, v, replay_buffer, temperature_initial=self. hyperparameters["temperature_initial"], entropy_target=( -self.action_space.shape[0] * self.hyperparameters["entropy_target_scaling"]), lr_temperature=self.hyperparameters["lr_temperature"], replay_start_size=self.hyperparameters["replay_start_size"], discount_factor=self.hyperparameters["discount_factor"], update_frequency=self.hyperparameters["update_frequency"], minibatch_size=self.hyperparameters["minibatch_size"], writer=writer))
def setUp(self):
    np.random.seed(1)
    random.seed(1)
    torch.manual_seed(1)
    self.replay_buffer = ExperienceReplayBuffer(5)
def setUp(self):
    np.random.seed(1)
    random.seed(1)
    torch.manual_seed(1)
    self.replay_buffer = NStepReplayBuffer(4, 0.5, ExperienceReplayBuffer(100))
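# NStepReplayBuffer(4, 0.5, ExperienceReplayBuffer(100)) wraps the base buffer so
# that each stored transition carries a 4-step return with discount 0.5 before it
# reaches the underlying buffer. A simplified sketch of the usual n-step logic
# (illustrative signature and names, episode boundaries ignored for brevity; not
# the library's exact implementation):
from collections import deque


class NStepWrapper:
    """Accumulates discounted n-step returns, then stores (s_t, a_t, R_t^n, s_{t+n})."""

    def __init__(self, steps, discount, base_buffer):
        self.steps = steps
        self.discount = discount
        self.base_buffer = base_buffer
        self.pending = deque()  # transitions still waiting for their n-step return

    def store(self, state, action, reward, next_state):
        self.pending.append([state, action, reward, next_state])
        # fold the newest reward into every older pending transition's return,
        # and bootstrap each of them from the latest observed state
        for age, item in enumerate(reversed(self.pending)):
            if age > 0:
                item[2] += (self.discount ** age) * reward
                item[3] = next_state
        # once the oldest transition has accumulated `steps` rewards,
        # hand it to the underlying buffer
        if len(self.pending) == self.steps:
            self.base_buffer.store(*self.pending.popleft())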