def _model_predictive_dqn(env, writer=None):
    """Construct a model-predictive DQN agent wrapped for Atari play.

    Builds the shared feature trunk, value/reward heads, and generator,
    gives each its own Adam optimizer, and assembles the agent around an
    experience-replay buffer.
    """
    def _adam(model):
        # every network shares the same Adam hyperparameters
        return Adam(model.parameters(), lr=lr, eps=eps)

    # networks
    shared = shared_feature_layers().to(device)
    values = value_head().to(device)
    rewards = reward_head(env).to(device)
    generator = Generator(env).to(device)

    # approximators
    f = FeatureNetwork(shared, _adam(shared), writer=writer)
    v = VNetwork(values, _adam(values), writer=writer)
    r = QNetwork(rewards, _adam(rewards), name='reward', writer=writer)
    g = Approximation(generator, _adam(generator), name='generator', writer=writer)

    # agent with replay buffer
    agent = ModelPredictiveDQN(
        f, v, r, g,
        ExperienceReplayBuffer(replay_buffer_size, device=device),
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
    )
    # Deepmind-style wrappers improve Atari performance
    return DeepmindAtariBody(agent, lazy_frames=True)
def _vpg(env, writer=None):
    """Build a Vanilla Policy Gradient agent for ``env``.

    Args:
        env: environment used to size the feature and policy networks.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call; the previous ``writer=DummyWriter()`` default was a
            mutable default argument, evaluated once and shared by all calls.

    Returns:
        A VPG agent.
    """
    if writer is None:
        writer = DummyWriter()
    # models
    feature_model = feature_model_constructor(env).to(device)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    # approximators
    features = FeatureNetwork(feature_model, feature_optimizer, writer=writer)
    v = VNetwork(value_model, value_optimizer, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, writer=writer)
    return VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size)
def agent(self, writer=None, train_steps=float('inf')):
    """Construct a VPG agent from this preset's models and hyperparameters.

    Args:
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (replaces the mutable default ``DummyWriter()``, which
            was evaluated once and shared across calls).
        train_steps: total training steps; unused here, kept for API parity.

    Returns:
        A VPG agent.
    """
    if writer is None:
        writer = DummyWriter()
    # optimizers: policy lr is shared by features and policy, value has its own
    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])
    # approximators
    features = FeatureNetwork(self.feature_model, feature_optimizer, clip_grad=self.hyperparameters["clip_grad"], writer=writer)
    v = VNetwork(self.value_model, value_optimizer, loss_scaling=self.hyperparameters["value_loss_scaling"], clip_grad=self.hyperparameters["clip_grad"], writer=writer)
    policy = SoftmaxPolicy(self.policy_model, policy_optimizer, clip_grad=self.hyperparameters["clip_grad"], writer=writer)
    return VPG(features, v, policy, discount_factor=self.hyperparameters["discount_factor"], min_batch_size=self.hyperparameters["min_batch_size"])
def _vac(envs, writer=None):
    """Build a Vanilla Actor-Critic agent with Atari wrappers.

    Args:
        envs: list of environments; the first sizes the policy head.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        A VAC agent wrapped in DeepmindAtariBody.
    """
    if writer is None:
        writer = DummyWriter()
    # models
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(envs[0]).to(device)
    feature_model = feature_model_constructor().to(device)
    # optimizers; features share the policy learning rate
    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)
    # approximators
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer,
    )
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    return DeepmindAtariBody(
        VAC(features, v, policy, discount_factor=discount_factor),
    )
def _a2c(envs, writer=None):
    """Build an A2C agent for a set of parallel environments.

    Args:
        envs: list of environments; the first sizes the networks.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        An A2C agent.
    """
    if writer is None:
        writer = DummyWriter()
    env = envs[0]
    # models
    feature_model = feature_model_constructor(env).to(device)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    # approximators
    # NOTE(review): the feature network is not given the writer, unlike
    # v/policy — presumably intentional (no feature logging); confirm.
    features = FeatureNetwork(
        feature_model, feature_optimizer, clip_grad=clip_grad)
    v = VNetwork(
        value_model,
        value_optimizer,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer
    )
    return A2C(
        features,
        v,
        policy,
        n_envs=n_envs,
        n_steps=n_steps,
        discount_factor=discount_factor,
        entropy_loss_scaling=entropy_loss_scaling,
        writer=writer
    )
def _ppo(envs, writer=None):
    """Build a PPO agent over fully-connected networks.

    Args:
        envs: list of environments; the first sizes the networks.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        A PPO agent.
    """
    if writer is None:
        writer = DummyWriter()
    env = envs[0]
    # models
    feature_model = fc_relu_features(env).to(device)
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)
    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    # approximators (feature network intentionally unlogged here)
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad)
    v = VNetwork(value_model, value_optimizer, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, clip_grad=clip_grad, writer=writer)
    return PPO(features, v, policy, epsilon=epsilon, epochs=epochs, lam=lam, minibatches=minibatches, n_envs=n_envs, n_steps=n_steps, discount_factor=discount_factor, entropy_loss_scaling=entropy_loss_scaling, writer=writer)
def _vpg(env, writer=None):
    """Build a VPG agent (fully-connected variant, legacy ``gamma=`` API).

    Args:
        env: environment used to size the networks and action space.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        A VPG agent.
    """
    if writer is None:
        writer = DummyWriter()
    # models
    feature_model = fc_relu_features(env).to(device)
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)
    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    # approximators
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    v = VNetwork(value_model, value_optimizer, clip_grad=clip_grad, writer=writer)
    # older SoftmaxPolicy API: action count passed positionally
    policy = SoftmaxPolicy(policy_model, policy_optimizer, env.action_space.n, entropy_loss_scaling=entropy_loss_scaling, clip_grad=clip_grad, writer=writer)
    return VPG(features, v, policy, gamma=gamma, min_batch_size=min_batch_size)
def agent(self, writer=None, train_steps=float('inf')):
    """Construct a VPG Atari agent with cosine-annealed learning rates.

    Args:
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).
        train_steps: expected number of training steps, used to size the
            LR schedules.

    Returns:
        A VPG agent wrapped in DeepmindAtariBody.
    """
    if writer is None:
        writer = DummyWriter()
    # one optimizer update happens per collected min_batch_size steps.
    # NOTE(review): with the default train_steps=inf this is inf, which
    # CosineAnnealingLR receives as T_max — confirm callers always pass
    # a finite train_steps.
    n_updates = train_steps / self.hyperparameters["min_batch_size"]
    # optimizers
    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"])
    # approximators, each with its own cosine LR schedule
    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    v = VNetwork(
        self.value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        loss_scaling=self.hyperparameters["value_loss_scaling"],
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    return DeepmindAtariBody(
        VPG(features, v, policy, discount_factor=self.hyperparameters["discount_factor"], min_batch_size=self.hyperparameters["min_batch_size"]),
    )
def agent(self, writer=None, train_steps=float('inf')):
    """Construct an A2C agent from this preset's models and hyperparameters.

    Args:
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).
        train_steps: total training steps; unused here, kept for API parity.

    Returns:
        An A2C agent.
    """
    if writer is None:
        writer = DummyWriter()
    # optimizers — all models share one learning rate
    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"])
    # approximators (feature network intentionally unlogged here)
    features = FeatureNetwork(self.feature_model, feature_optimizer, clip_grad=self.hyperparameters["clip_grad"])
    v = VNetwork(self.value_model, value_optimizer, clip_grad=self.hyperparameters["clip_grad"], writer=writer)
    policy = SoftmaxPolicy(self.policy_model, policy_optimizer, clip_grad=self.hyperparameters["clip_grad"], writer=writer)
    return A2C(
        features,
        v,
        policy,
        n_envs=self.hyperparameters["n_envs"],
        n_steps=self.hyperparameters["n_steps"],
        discount_factor=self.hyperparameters["discount_factor"],
        entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
        writer=writer)
def _ppo(envs, writer=None):
    """Build a PPO Atari agent with annealed learning rates and clip range.

    Args:
        envs: list of environments; the first sizes the policy head.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        A PPO agent wrapped in DeepmindAtariBody.
    """
    if writer is None:
        writer = DummyWriter()
    env = envs[0]
    # Update epoch * minibatches times per update,
    # but we only update once per n_steps,
    # with n_envs and 4 frames per step
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs * 4)
    # models
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_model = feature_model_constructor().to(device)
    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    # approximators, each cosine-annealed to final_anneal_step
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )
    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            # clip range is annealed linearly over the run
            epsilon=LinearScheduler(clip_initial, clip_final, 0, final_anneal_step, name='clip', writer=writer),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        ))
def _a2c(envs, writer=None):
    """Build an A2C Atari agent trained with RMSprop.

    Args:
        envs: list of environments; passed through to ParallelAtariBody.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        An A2C agent wrapped in ParallelAtariBody.
    """
    if writer is None:
        writer = DummyWriter()
    env = envs[0]
    # models
    feature_model = conv_features().to(device)
    value_model = value_net().to(device)
    policy_model = policy_net(env).to(device)
    # optimizers; the feature trunk gets a scaled learning rate
    feature_optimizer = RMSprop(
        feature_model.parameters(),
        alpha=alpha,
        lr=lr * feature_lr_scaling,
        eps=eps
    )
    value_optimizer = RMSprop(
        value_model.parameters(),
        alpha=alpha,
        lr=lr,
        eps=eps
    )
    policy_optimizer = RMSprop(
        policy_model.parameters(),
        alpha=alpha,
        lr=lr,
        eps=eps
    )
    # approximators
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad
    )
    v = ValueNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
    )
    return ParallelAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
        ),
        envs,
    )
def agent(self, writer=None, train_steps=float('inf')):
    """Construct a PPO Atari agent with annealed LR and clip range.

    Args:
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).
        train_steps: expected training steps, used to size the schedules.
            NOTE(review): with the default inf, the schedules receive an
            infinite horizon — confirm callers pass a finite value.

    Returns:
        A PPO agent wrapped in DeepmindAtariBody.
    """
    if writer is None:
        writer = DummyWriter()
    # epochs * minibatches optimizer updates per rollout of n_steps * n_envs
    n_updates = (train_steps
                 * self.hyperparameters['epochs']
                 * self.hyperparameters['minibatches']
                 / (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs']))
    # optimizers
    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    # approximators, each cosine-annealed over n_updates
    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer)
    v = VNetwork(
        self.value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        loss_scaling=self.hyperparameters["value_loss_scaling"],
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer)
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer)
    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            # clip range is annealed linearly over the run
            epsilon=LinearScheduler(self.hyperparameters["clip_initial"],
                                    self.hyperparameters["clip_final"],
                                    0,
                                    n_updates,
                                    name='clip',
                                    writer=writer),
            epochs=self.hyperparameters["epochs"],
            minibatches=self.hyperparameters["minibatches"],
            n_envs=self.hyperparameters["n_envs"],
            n_steps=self.hyperparameters["n_steps"],
            discount_factor=self.hyperparameters["discount_factor"],
            lam=self.hyperparameters["lam"],
            entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
            writer=writer,
        ))
def _ppo(envs, writer=None):
    """Build a PPO agent for continuous control (Gaussian policy).

    Args:
        envs: list of environments; the first sizes the actor-critic nets.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        A PPO agent wrapped in TimeFeature.
    """
    if writer is None:
        writer = DummyWriter()
    # epochs * minibatches optimizer updates per rollout of n_steps * n_envs
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs)
    env = envs[0]
    # models
    feature_model, value_model, policy_model = fc_actor_critic(env)
    feature_model.to(device)
    value_model.to(device)
    policy_model.to(device)
    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    # approximators, each cosine-annealed to final_anneal_step
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer)
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = GaussianPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )
    return TimeFeature(
        PPO(
            features,
            v,
            policy,
            # clip range is annealed linearly over the run
            epsilon=LinearScheduler(clip_initial, clip_final, 0, final_anneal_step, name='clip', writer=writer),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        ))
def _a2c(envs, writer=None):
    """Build an A2C agent with frame stacking and annealed entropy bonus.

    Args:
        envs: list of environments; the first sizes the policy head.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        An A2C agent wrapped in FrameStack (4 frames).
    """
    if writer is None:
        writer = DummyWriter()
    env = envs[0]
    # one optimizer update per rollout of n_steps * n_envs frames
    final_anneal_step = last_frame / (n_steps * n_envs)
    # models
    value_model = value_head().to(device)
    policy_model = policy_head(env).to(device)
    feature_model = conv_features().to(device)
    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    # approximators, each cosine-annealed to final_anneal_step
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(
            feature_optimizer,
            final_anneal_step,
        ),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(
            value_optimizer,
            final_anneal_step,
        ),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(
            policy_optimizer,
            final_anneal_step,
        ),
        clip_grad=clip_grad,
        writer=writer
    )
    return FrameStack(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            # entropy bonus decays linearly to zero over the run
            entropy_loss_scaling=LinearScheduler(entropy_loss_scaling, 0., 0, final_anneal_step, name="entropy_loss_scaling", writer=writer),
            writer=writer
        ),
        size=4
    )
def _a2c(envs, writer=None):
    """Build an A2C Atari agent (nature-CNN nets, annealed learning rates).

    Args:
        envs: list of environments; the first sizes the policy head.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        An A2C agent wrapped in DeepmindAtariBody.
    """
    if writer is None:
        writer = DummyWriter()
    env = envs[0]
    # one optimizer update per rollout; 4 frames are consumed per step
    final_anneal_step = last_frame / (n_steps * n_envs * 4)
    # models
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    feature_model = nature_features().to(device)
    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    # approximators, each cosine-annealed to final_anneal_step
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(
            feature_optimizer,
            final_anneal_step,
        ),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(
            value_optimizer,
            final_anneal_step,
        ),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(
            policy_optimizer,
            final_anneal_step,
        ),
        clip_grad=clip_grad,
        writer=writer
    )
    return DeepmindAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer
        ),
    )
def _vac(env, writer=None):
    """Build a Vanilla Actor-Critic agent (RMSprop, legacy ``gamma=`` API).

    Args:
        env: environment used to size the networks and action space.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        A VAC agent.
    """
    if writer is None:
        writer = DummyWriter()
    # models
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)
    feature_model = fc_relu_features(env).to(device)
    # optimizers; features share the policy learning rate
    value_optimizer = RMSprop(value_model.parameters(), lr=lr_v, alpha=alpha, eps=eps)
    policy_optimizer = RMSprop(policy_model.parameters(), lr=lr_pi, alpha=alpha, eps=eps)
    feature_optimizer = RMSprop(feature_model.parameters(), lr=lr_pi, alpha=alpha, eps=eps)
    # approximators (older SoftmaxPolicy API: action count positional)
    v = VNetwork(value_model, value_optimizer, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, env.action_space.n, writer=writer)
    features = FeatureNetwork(feature_model, feature_optimizer)
    return VAC(features, v, policy, gamma=discount_factor)
def _vac(env, writer=None):
    """Build a Vanilla Actor-Critic agent (Adam).

    Args:
        env: environment used to size the networks.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        A VAC agent.
    """
    if writer is None:
        writer = DummyWriter()
    # models
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_model = feature_model_constructor(env).to(device)
    # optimizers; features share the policy learning rate
    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)
    # approximators
    v = VNetwork(value_model, value_optimizer, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, writer=writer)
    features = FeatureNetwork(feature_model, feature_optimizer)
    return VAC(features, v, policy, discount_factor=discount_factor)
def _a2c(envs, writer=None):
    """Build an A2C Atari agent (nature-CNN nets, RMSprop).

    Args:
        envs: list of environments; the first sizes the policy head.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        An A2C agent wrapped in DeepmindAtariBody.
    """
    if writer is None:
        writer = DummyWriter()
    env = envs[0]
    # models
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(envs[0]).to(device)
    feature_model = nature_features().to(device)
    # optimizers
    feature_optimizer = RMSprop(
        feature_model.parameters(),
        alpha=alpha,
        lr=lr,
        eps=eps
    )
    value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    policy_optimizer = RMSprop(
        policy_model.parameters(),
        alpha=alpha,
        lr=lr,
        eps=eps
    )
    # approximators (older SoftmaxPolicy API: action count positional)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    return DeepmindAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
        ),
    )
def _vpg_atari(env, writer=None):
    """Build a VPG Atari agent (RMSprop, legacy ``gamma=`` API).

    Args:
        env: environment used to size the networks and action space.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        A VPG agent wrapped in DeepmindAtariBody.
    """
    if writer is None:
        writer = DummyWriter()
    # models
    feature_model = nature_features().to(device)
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    # optimizers; the feature trunk gets a scaled learning rate
    feature_optimizer = RMSprop(feature_model.parameters(), alpha=alpha, lr=lr * feature_lr_scaling, eps=eps)
    value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    policy_optimizer = RMSprop(policy_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    # approximators (older SoftmaxPolicy API: action count positional)
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    v = VNetwork(value_model, value_optimizer, loss_scaling=value_loss_scaling, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
    )
    return DeepmindAtariBody(
        VPG(features, v, policy, gamma=discount_factor, min_batch_size=min_batch_size),
    )
def _vpg_atari(env, writer=None):
    """Build a VPG Atari agent (Adam + cosine-annealed learning rates).

    Relies on the enclosing scope's ``final_anneal_step`` for the schedules.

    Args:
        env: environment used to size the networks.
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).

    Returns:
        A VPG agent wrapped in DeepmindAtariBody (episodic lives enabled).
    """
    if writer is None:
        writer = DummyWriter()
    # models
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    feature_model = nature_features().to(device)
    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    # approximators, each cosine-annealed to final_anneal_step
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(
            feature_optimizer,
            final_anneal_step,
        ),
        clip_grad=clip_grad,
        writer=writer)
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(
            value_optimizer,
            final_anneal_step,
        ),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer)
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(
            policy_optimizer,
            final_anneal_step,
        ),
        clip_grad=clip_grad,
        writer=writer)
    return DeepmindAtariBody(
        VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size),
        episodic_lives=True)
def agent(self, writer=None, train_steps=float("inf")):
    """Construct a model-based DQN Atari agent from this preset.

    Args:
        writer: optional logging writer. Defaults to a fresh ``DummyWriter``
            per call (fixes the shared mutable-default-argument pitfall).
        train_steps: total training steps; unused here, kept for API parity.

    Returns:
        A ModelBasedDQN agent wrapped in DeepmindAtariBody.
    """
    if writer is None:
        writer = DummyWriter()
    # optimizers — every network shares the same lr/eps
    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    reward_optimizer = Adam(self.reward_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    generator_optimizer = Adam(self.generator_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    # approximators
    f = FeatureNetwork(self.feature_model, feature_optimizer, writer=writer)
    v = VNetwork(self.value_model, value_optimizer, writer=writer)
    r = QNetwork(self.reward_model, reward_optimizer, name="reward", writer=writer)
    g = Approximation(self.generator_model, generator_optimizer, name="generator", writer=writer)
    # replay buffer
    replay_buffer = ExperienceReplayBuffer(self.hyperparameters["replay_buffer_size"], device=self.device)
    # create agent
    agent = ModelBasedDQN(f, v, r, g, replay_buffer,
                          minibatch_size=self.hyperparameters["minibatch_size"],
                          replay_start_size=self.hyperparameters["replay_start_size"]
                          )
    # apply atari wrappers for better performance
    return DeepmindAtariBody(agent, lazy_frames=True)
def test_agent(self):
    """Return an evaluation-mode VAC agent built from copies of the models.

    Deep copies keep the test agent isolated from further training updates.
    """
    frozen_features = FeatureNetwork(copy.deepcopy(self.feature_model))
    frozen_policy = SoftmaxPolicy(copy.deepcopy(self.policy_model))
    agent = VACTestAgent(frozen_features, frozen_policy)
    return DeepmindAtariBody(agent)
class DiversityLearner:
    """Soft-actor-critic-style learner conditioned on diversity targets.

    Maintains a target-conditioned policy, twin Q heads, and a V head with
    a Polyak-averaged target network; the entropy temperature is adapted
    online toward ``entropy_target`` in :meth:`learn_step`.
    """

    def __init__(
            self,
            model_fn,
            model_features,
            logger,
            device,
            num_targets,
            max_learn_steps,
            num_actions,
            obs_preproc,
            discount_factor=0.99,
            entropy_target=-2,
            lr_value=1e-3,
            lr_pi=1e-4,
            # Training settings
            polyak_rate=0.005,
            # Replay Buffer settings
            replay_start_size=5000,
            replay_buffer_size=1e6,
            # Exploration settings
            temperature_initial=0.1,
            lr_temperature=1e-5,
            entropy_target_scaling=1.,
    ):
        # no-op writer; all approximators log through it
        self.writer = writer = DummyWriter()
        eps = 1e-5
        self.discount_factor = discount_factor
        self.entropy_target = entropy_target
        # temperature is adapted online in learn_step
        self.temperature = temperature_initial
        self.lr_temperature = lr_temperature
        self.logger = logger
        self.device = device
        self.num_targets = num_targets
        self.max_learn_steps = max_learn_steps
        self.num_actions = num_actions
        # LR schedules anneal over the full training run
        final_anneal_step = (max_learn_steps)
        # target-conditioned policy and its learner
        self.policy = DiversityPolicy(model_fn, model_features, num_actions,
                                      num_targets, obs_preproc, device)
        self.policy = self.policy.to(device)
        self.obs_preproc = obs_preproc
        policy_optimizer = Adam(self.policy.parameters(), lr=lr_pi, eps=eps)
        self.policy_learner = SoftmaxPolicy(self.policy,
                                            policy_optimizer,
                                            scheduler=CosineAnnealingLR(
                                                policy_optimizer,
                                                final_anneal_step),
                                            writer=writer)
        # twin Q heads and the V head share one feature trunk
        value_feature_model = model_fn().to(device)
        q_models = [
            DuelingQValueLayer(model_features, num_targets,
                               num_actions).to(device) for i in range(2)
        ]
        v_model = ValueLayer(model_features, num_targets,
                             num_actions).to(device)
        feature_optimizer = Adam(value_feature_model.parameters(),
                                 lr=lr_value,
                                 eps=eps)
        q_optimizers = [
            Adam(q_models[i].parameters(), lr=lr_value, eps=eps)
            for i in range(2)
        ]
        v_optimizer = Adam(v_model.parameters(), lr=lr_value, eps=eps)
        self.features = FeatureNetwork(
            value_feature_model,
            feature_optimizer,
            scheduler=CosineAnnealingLR(
                feature_optimizer,
                final_anneal_step,
            ),
            # clip_grad=clip_grad,
            writer=writer)
        self.qs = [
            QContinuous(q_models[i],
                        q_optimizers[i],
                        scheduler=CosineAnnealingLR(q_optimizers[i],
                                                    final_anneal_step),
                        writer=writer,
                        name=f'q_{i}') for i in range(2)
        ]
        self.v = VNetwork(
            v_model,
            v_optimizer,
            scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
            # Polyak-averaged target network, used for Q-learning targets
            target=PolyakTarget(polyak_rate),
            writer=writer,
            name='v',
        )

    def learn_step(self, idxs, transition_batch, weights):
        """Run one SAC-style update from a batch of replay transitions.

        Args:
            idxs: replay-buffer indices of the sampled transitions (unused).
            transition_batch: tuple of (prev_obs, target_vector, action,
                env_reward, done, next_obs) arrays.
            weights: per-sample importance weights. NOTE(review): converted
                to a tensor but never applied to the losses — confirm
                whether they should scale the MSE terms.
        """
        Otm1, targ_vec, old_action, env_rew, done, Ot = transition_batch
        batch_size = len(Ot)
        obsm1 = self.obs_preproc(torch.tensor(Otm1, device=self.device))
        # NOTE(review): targ_vec is prepared but not used below (only a
        # commented-out prediction_reward referenced it) — confirm.
        targ_vec = torch.tensor(targ_vec, device=self.device)
        actions = torch.tensor(old_action, device=self.device)
        rewards = torch.tensor(env_rew, device=self.device)
        done = torch.tensor(done, device=self.device).float().to(self.device)
        next_obs = self.obs_preproc(torch.tensor(Ot, device=self.device))
        weights = torch.tensor(weights, device=self.device)
        states = StateArray(
            {
                'observation': obsm1,
                'reward': rewards,
                'done': done,
            },
            shape=(batch_size, ))
        # NOTE(review): next_states reuses obsm1 as its observation even
        # though next_obs is computed above and never used — this looks
        # like a bug (next-state features equal current-state features);
        # confirm against the original training results before changing.
        next_states = StateArray(
            {
                'observation': obsm1,
                'reward': torch.zeros(batch_size, device=self.device),
                'done': torch.zeros(batch_size, device=self.device),
                'mask': torch.ones(batch_size, device=self.device),
            },
            shape=(batch_size, ))
        # compute targets without tracking gradients.
        # NOTE(review): the exact extent of this no_grad block was
        # ambiguous in the (whitespace-mangled) source; if feature
        # gradients are expected from the Q/V losses below,
        # value_feature1 may belong outside it — confirm.
        with torch.no_grad():
            distribution = self.policy_learner(states)
            _log_probs = distribution.log_prob(actions).detach().squeeze()
            value_feature1 = self.features(states)
            value_feature2 = self.features(next_states)
            _actions = distribution.sample()
            # standard bootstrapped Q target from the Polyak-averaged V
            q_targets = rewards + self.discount_factor * self.v.target(
                value_feature2).detach()
            # soft value target: min of twin Qs minus entropy bonus
            v_targets = torch.min(
                self.qs[0].target(value_feature1, _actions),
                self.qs[1].target(value_feature1, _actions),
            ) - self.temperature * _log_probs
        # update Q and V-functions
        for i in range(2):
            self.qs[i].reinforce(
                mse_loss(self.qs[i](value_feature1, actions), q_targets))
        self.v.reinforce(mse_loss(self.v(value_feature1), v_targets))
        # update policy: maximize Q (detached) plus entropy
        distribution = self.policy_learner(states)
        _actions2 = distribution.sample()
        _log_probs2 = distribution.log_prob(_actions2).squeeze()
        loss = (-self.qs[0](value_feature1, _actions2).detach() +
                self.temperature * _log_probs2).mean()
        self.policy_learner.reinforce(loss)
        self.features.reinforce()
        self.qs[0].zero_grad()
        # adjust temperature toward the entropy target
        temperature_grad = (_log_probs + self.entropy_target).mean()
        self.temperature += self.lr_temperature * temperature_grad.detach(
        ).cpu().numpy()
def __init__(
        self,
        model_fn,
        model_features,
        logger,
        device,
        num_targets,
        max_learn_steps,
        num_actions,
        obs_preproc,
        discount_factor=0.99,
        entropy_target=-2,
        lr_value=1e-3,
        lr_pi=1e-4,
        # Training settings
        polyak_rate=0.005,
        # Replay Buffer settings
        replay_start_size=5000,
        replay_buffer_size=1e6,
        # Exploration settings
        temperature_initial=0.1,
        lr_temperature=1e-5,
        entropy_target_scaling=1.,
):
    """Build the target-conditioned policy, twin Q heads, and V head.

    Args:
        model_fn: factory for the shared feature trunk network.
        model_features: feature-layer spec passed to the policy/value heads.
        logger: external logger handle (stored, not used here).
        device: torch device for all models.
        num_targets: number of diversity targets to condition on.
        max_learn_steps: total learn steps; sizes the cosine LR schedules.
        num_actions: size of the discrete action space.
        obs_preproc: observation preprocessing callable.
        discount_factor: reward discount (gamma).
        entropy_target: target policy entropy for temperature adaptation.
        lr_value / lr_pi: learning rates for value and policy networks.
        polyak_rate: Polyak averaging rate for the V target network.
        replay_start_size / replay_buffer_size: replay settings (stored by
            callers; not referenced in this constructor).
        temperature_initial / lr_temperature: initial entropy temperature
            and its adaptation rate.
        entropy_target_scaling: unused here — NOTE(review): confirm.
    """
    # no-op writer; all approximators log through it
    self.writer = writer = DummyWriter()
    eps = 1e-5
    self.discount_factor = discount_factor
    self.entropy_target = entropy_target
    # temperature is adapted online during learning
    self.temperature = temperature_initial
    self.lr_temperature = lr_temperature
    self.logger = logger
    self.device = device
    self.num_targets = num_targets
    self.max_learn_steps = max_learn_steps
    self.num_actions = num_actions
    # LR schedules anneal over the full training run
    final_anneal_step = (max_learn_steps)
    # target-conditioned policy and its learner
    self.policy = DiversityPolicy(model_fn, model_features, num_actions,
                                  num_targets, obs_preproc, device)
    self.policy = self.policy.to(device)
    self.obs_preproc = obs_preproc
    policy_optimizer = Adam(self.policy.parameters(), lr=lr_pi, eps=eps)
    self.policy_learner = SoftmaxPolicy(self.policy,
                                        policy_optimizer,
                                        scheduler=CosineAnnealingLR(
                                            policy_optimizer,
                                            final_anneal_step),
                                        writer=writer)
    # twin Q heads and the V head share one feature trunk
    value_feature_model = model_fn().to(device)
    q_models = [
        DuelingQValueLayer(model_features, num_targets,
                           num_actions).to(device) for i in range(2)
    ]
    v_model = ValueLayer(model_features, num_targets,
                         num_actions).to(device)
    feature_optimizer = Adam(value_feature_model.parameters(),
                             lr=lr_value,
                             eps=eps)
    q_optimizers = [
        Adam(q_models[i].parameters(), lr=lr_value, eps=eps)
        for i in range(2)
    ]
    v_optimizer = Adam(v_model.parameters(), lr=lr_value, eps=eps)
    self.features = FeatureNetwork(
        value_feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(
            feature_optimizer,
            final_anneal_step,
        ),
        # clip_grad=clip_grad,
        writer=writer)
    self.qs = [
        QContinuous(q_models[i],
                    q_optimizers[i],
                    scheduler=CosineAnnealingLR(q_optimizers[i],
                                                final_anneal_step),
                    writer=writer,
                    name=f'q_{i}') for i in range(2)
    ]
    self.v = VNetwork(
        v_model,
        v_optimizer,
        scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
        # Polyak-averaged target network, used for Q-learning targets
        target=PolyakTarget(polyak_rate),
        writer=writer,
        name='v',
    )
def test_agent(self):
    """Return an evaluation-mode VPG agent built from copies of the models.

    Deep copies keep the test agent isolated from further training updates.
    """
    frozen_features = FeatureNetwork(copy.deepcopy(self.feature_model))
    frozen_policy = SoftmaxPolicy(copy.deepcopy(self.policy_model))
    return VPGTestAgent(frozen_features, frozen_policy)
class NStepAdvantageBufferTest(unittest.TestCase):
    """Unit tests for NStepAdvantageBuffer's n-step advantage computation."""

    def setUp(self):
        # fixed seed so the tiny linear networks get deterministic weights
        torch.manual_seed(1)
        self.features = FeatureNetwork(nn.Linear(1, 2), None)
        self.v = VNetwork(nn.Linear(2, 1), None)

    def _compute_expected_advantages(self, states, returns, next_states,
                                     lengths):
        # advantage = n-step return + gamma^n * V(s') - V(s), gamma = 0.5
        return (returns + (0.5**lengths) *
                self.v.eval(self.features.eval(next_states)) -
                self.v.eval(self.features.eval(states)))

    def test_rollout(self):
        # n_steps=2 over 3 parallel envs, no episode terminations
        buffer = NStepAdvantageBuffer(self.v,
                                      self.features,
                                      2,
                                      3,
                                      discount_factor=0.5)
        actions = torch.ones((3))
        states = State(torch.arange(0, 12).unsqueeze(1))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        states, _, advantages = buffer.advantages(states[6:9])
        expected_states = State(torch.arange(0, 6).unsqueeze(1))
        # both stored timesteps bootstrap from the same final states
        expected_next_states = State(
            torch.cat((torch.arange(6, 9), torch.arange(6, 9))).unsqueeze(1))
        expected_returns = torch.tensor([0.5, 0.5, 0.5, 1, 1, 1]).float()
        expected_lengths = torch.tensor([2., 2, 2, 1, 1, 1])
        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(expected_states,
                                              expected_returns,
                                              expected_next_states,
                                              expected_lengths))

    def test_rollout_with_nones(self):
        # n_steps=3 over 3 envs with terminations (done=0) mid-rollout
        buffer = NStepAdvantageBuffer(self.v,
                                      self.features,
                                      3,
                                      3,
                                      discount_factor=0.5)
        done = torch.ones(12)
        done[5] = 0
        done[7] = 0
        done[9] = 0
        states = State(torch.arange(0, 12).unsqueeze(1), done)
        actions = torch.ones((3))
        buffer.store(states[0:3], actions, torch.zeros(3))
        buffer.store(states[3:6], actions, torch.ones(3))
        buffer.store(states[6:9], actions, 2 * torch.ones(3))
        states, actions, advantages = buffer.advantages(states[9:12])
        expected_states = State(torch.arange(0, 9).unsqueeze(1), done[0:9])
        # terminal states truncate the n-step lookahead
        expected_next_done = torch.zeros(9)
        expected_next_done[5] = 1
        expected_next_done[7] = 1
        expected_next_done[8] = 1
        expected_next_states = State(
            torch.tensor([9, 7, 5, 9, 7, 11, 9, 10, 11]).unsqueeze(1),
            expected_next_done)
        expected_returns = torch.tensor([1, 0.5, 0, 2, 1, 2, 2, 2,
                                         2]).float()
        expected_lengths = torch.tensor([3, 2, 1, 2, 1, 2, 1, 1, 1]).float()
        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(expected_states,
                                              expected_returns,
                                              expected_next_states,
                                              expected_lengths))

    def test_multi_rollout(self):
        # the buffer should reset correctly between consecutive rollouts
        buffer = NStepAdvantageBuffer(self.v,
                                      self.features,
                                      2,
                                      2,
                                      discount_factor=0.5)
        raw_states = State(torch.arange(0, 12).unsqueeze(1))
        actions = torch.ones((2))
        buffer.store(raw_states[0:2], actions, torch.ones(2))
        buffer.store(raw_states[2:4], actions, torch.ones(2))
        states, actions, advantages = buffer.advantages(raw_states[4:6])
        expected_states = State(torch.arange(0, 4).unsqueeze(1))
        expected_returns = torch.tensor([1.5, 1.5, 1, 1])
        expected_next_states = State(torch.tensor([4, 5, 4, 5]).unsqueeze(1))
        expected_lengths = torch.tensor([2., 2, 1, 1])
        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(expected_states,
                                              expected_returns,
                                              expected_next_states,
                                              expected_lengths))
        # second rollout picks up where the first left off
        buffer.store(raw_states[4:6], actions, torch.ones(2))
        buffer.store(raw_states[6:8], actions, torch.ones(2))
        states, actions, advantages = buffer.advantages(raw_states[8:10])
        expected_states = State(torch.arange(4, 8).unsqueeze(1))
        self.assert_states_equal(states, expected_states)
        tt.assert_allclose(
            advantages,
            self._compute_expected_advantages(
                expected_states, torch.tensor([1.5, 1.5, 1, 1]),
                State(torch.tensor([8, 9, 8, 9]).unsqueeze(1)),
                torch.tensor([2., 2, 1, 1])))

    def assert_array_equal(self, actual, expected):
        # element-wise equality with a readable failure message
        for i, exp in enumerate(expected):
            self.assertEqual(actual[i],
                             exp,
                             msg=(("\nactual: %s\nexpected: %s") %
                                  (actual, expected)))

    def assert_states_equal(self, actual, expected):
        # compare both the raw observations and the done/mask flags
        tt.assert_almost_equal(actual.raw, expected.raw)
        tt.assert_equal(actual.mask, expected.mask)
def setUp(self):
    """Build a small deterministic feature extractor and value head."""
    # Seed first so the linear layers' random initialization is reproducible.
    torch.manual_seed(1)
    # Construction order matters: each nn.Linear consumes RNG state.
    feature_layer = nn.Linear(1, 2)
    value_layer = nn.Linear(2, 1)
    # Optimizers are None: these fixtures are only used for forward passes.
    self.features = FeatureNetwork(feature_layer, None)
    self.v = VNetwork(value_layer, None)
class GeneralizedAdvantageBufferTest(unittest.TestCase):
    """Tests for GeneralizedAdvantageBuffer (GAE) advantage estimation.

    All expected numeric values are tied to the fixed RNG seed; gamma = 0.5
    and lambda = 0.5 throughout, so the GAE mixing weight is
    gamma * lambda = 0.25.
    """

    def setUp(self):
        # Seed before constructing layers so value estimates are deterministic.
        torch.manual_seed(1)
        # No optimizers (None): tests only run forward passes.
        self.features = FeatureNetwork(nn.Linear(1, 2), None)
        self.v = VNetwork(nn.Linear(2, 1), None)

    def _compute_expected_advantages(self, states, returns, next_states, lengths):
        """Reference n-step advantage: R + gamma^len * V(s') - V(s), gamma = 0.5."""
        return (returns + (0.5**lengths) * self.v.eval(self.features.eval(next_states)) - self.v.eval(self.features.eval(states)))

    def test_simple(self):
        # n_steps=2, n_envs=1: a single-environment two-step rollout.
        buffer = GeneralizedAdvantageBuffer(self.v, self.features, 2, 1, discount_factor=0.5, lam=0.5)
        actions = torch.ones((1))
        states = State(torch.arange(0, 3).unsqueeze(1))
        rewards = torch.tensor([1., 2, 4])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])
        values = self.v.eval(self.features.eval(states))
        # Seed-dependent value estimates for states 0, 1, 2.
        tt.assert_almost_equal(values, torch.tensor([0.1826, -0.3476, -0.8777]), decimal=3)
        # One-step TD errors: r_t + gamma * V(s_{t+1}) - V(s_t).
        td_errors = torch.zeros(2)
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(td_errors, torch.tensor([0.6436, 1.909]), decimal=3)
        # GAE: discounted sum of TD errors with weight gamma * lam = 0.25.
        advantages = torch.zeros(2)
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(advantages, torch.tensor([1.121, 1.909]), decimal=3)
        _states, _actions, _advantages = buffer.advantages(states[2])
        tt.assert_almost_equal(_advantages, advantages)
        tt.assert_equal(_actions, torch.tensor([1, 1]))

    def test_parallel(self):
        # n_steps=2, n_envs=2: same GAE computation vectorized over two envs.
        buffer = GeneralizedAdvantageBuffer(self.v, self.features, 2, 2, discount_factor=0.5, lam=0.5)
        actions = torch.ones((2))
        # One State per timestep, each holding both environments' observations.
        states = [
            State(torch.tensor([[0], [3]])),
            State(torch.tensor([[1], [4]])),
            State(torch.tensor([[2], [5]])),
        ]
        rewards = torch.tensor([[1., 1], [2, 1], [4, 1]])
        buffer.store(states[0], actions, rewards[0])
        buffer.store(states[1], actions, rewards[1])
        values = self.v.eval(self.features.eval(State.from_list(states))).view(3, -1)
        tt.assert_almost_equal(values, torch.tensor([
            [0.183, -1.408],
            [-0.348, -1.938],
            [-0.878, -2.468]
        ]), decimal=3)
        # Per-env one-step TD errors, shape (timestep, env).
        td_errors = torch.zeros(2, 2)
        td_errors[0] = rewards[0] + 0.5 * values[1] - values[0]
        td_errors[1] = rewards[1] + 0.5 * values[2] - values[1]
        tt.assert_almost_equal(td_errors, torch.tensor([
            [0.6436, 1.439],
            [1.909, 1.704]
        ]), decimal=3)
        advantages = torch.zeros(2, 2)
        advantages[0] = td_errors[0] + 0.25 * td_errors[1]
        advantages[1] = td_errors[1]
        tt.assert_almost_equal(advantages, torch.tensor([
            [1.121, 1.865],
            [1.909, 1.704]
        ]), decimal=3)
        _states, _actions, _advantages = buffer.advantages(states[2])
        # The buffer flattens the (timestep, env) grid into one dimension.
        tt.assert_almost_equal(_advantages, advantages.view(-1))

    def assert_array_equal(self, actual, expected):
        """Element-wise equality with a diagnostic message on failure."""
        for i, exp in enumerate(expected):
            self.assertEqual(actual[i], exp, msg=(("\nactual: %s\nexpected: %s") % (actual, expected)))

    def assert_states_equal(self, actual, expected):
        """Compare two State objects by raw observation tensor and mask."""
        tt.assert_almost_equal(actual.raw, expected.raw)
        tt.assert_equal(actual.mask, expected.mask)
def _ppo(envs, writer=DummyWriter()):
    """Construct a PPO agent with Nature-CNN heads and Atari preprocessing.

    Args:
        envs: list of (parallel) environments; the first is used to size
            the policy's action space.
        writer: metrics writer passed to every approximation.

    Returns:
        The PPO agent wrapped in DeepmindAtariBody.
    """
    env = envs[0]

    # Networks, moved to the configured device.
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(envs[0]).to(device)
    feature_model = nature_features().to(device)

    # One Adam optimizer per network, sharing lr/eps hyperparameters.
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

    def _cosine_schedule(optimizer):
        # Anneal each learning rate from lr down to lr * min_lr_scale.
        return CosineAnnealingLR(optimizer, final_anneal_step, eta_min=lr * min_lr_scale)

    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=_cosine_schedule(feature_optimizer),
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=_cosine_schedule(value_optimizer),
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=_cosine_schedule(policy_optimizer),
    )

    # The PPO clipping parameter is itself annealed over training.
    clip_schedule = LinearScheduler(
        clip_initial,
        clip_final,
        0,
        final_anneal_step,
        name='clip',
        writer=writer
    )
    agent = PPO(
        features,
        v,
        policy,
        epsilon=clip_schedule,
        epochs=epochs,
        minibatches=minibatches,
        n_envs=n_envs,
        n_steps=n_steps,
        discount_factor=discount_factor,
        lam=lam,
    )
    # Standard Atari wrappers (frame preprocessing etc.).
    return DeepmindAtariBody(agent)
def test_agent(self):
    """Assemble the evaluation-time model-based agent from the stored models.

    Returns:
        The test agent wrapped in DeepmindAtariBody for Atari preprocessing.
    """
    # Optimizers are None: the test agent only performs inference.
    features = FeatureNetwork(self.feature_model, None)
    value = VNetwork(self.value_model, None)
    reward = QNetwork(self.reward_model, None)
    generator = Approximation(self.generator_model, None)
    agent = ModelBasedTestAgent(
        features,
        value,
        reward,
        generator,
        self.hyperparameters["discount_factor"]
    )
    return DeepmindAtariBody(agent)