def _vqn(envs, writer=DummyWriter()):
    env = envs[0]
    model = nature_ddqn(env).to(device)
    optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    return DeepmindAtariBody(
        VQN(q, policy, gamma=discount_factor),
    )
def __init__(
        self,
        model,
        optimizer=None,
        checkpointer=None,
        clip_grad=0,
        device=None,
        loss_scaling=1,
        name='approximation',
        scheduler=None,
        target=None,
        writer=DummyWriter(),
):
    self.model = model
    self.device = device if device else next(model.parameters()).device
    self._target = target or TrivialTarget()
    self._scheduler = scheduler
    self._target.init(model)
    self._updates = 0
    self._optimizer = optimizer
    self._loss_scaling = loss_scaling
    self._cache = []
    self._clip_grad = clip_grad
    self._writer = writer
    self._name = name
    if checkpointer is None:
        checkpointer = DummyCheckpointer()
    self._checkpointer = checkpointer
    self._checkpointer.init(
        self.model,
        os.path.join(writer.log_dir, name + '.pt')
    )
def _dqn(env, writer=DummyWriter()):
    _model = nature_dqn(env).to(device)
    _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        _model,
        _optimizer,
        env.action_space.n,
        target=FixedTarget(target_update_frequency),
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
    )
def _ddpg(env, writer=DummyWriter()):
    value_model = fc_value(env).to(device)
    value_optimizer = Adam(value_model.parameters(), lr=lr_q)
    q = QContinuous(
        value_model,
        value_optimizer,
        target=PolyakTarget(polyak_rate),
        writer=writer
    )
    policy_model = fc_policy(env).to(device)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
    policy = DeterministicPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        noise,
        target=PolyakTarget(polyak_rate),
        writer=writer
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DDPG(
        q,
        policy,
        replay_buffer,
        replay_start_size=replay_start_size,
        discount_factor=discount_factor,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size
    )
def _rainbow(env, writer=DummyWriter()):
    model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        writer=writer,
    )
    # replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    replay_buffer = NStepReplayBuffer(n_steps, discount_factor, replay_buffer)
    return Rainbow(
        q,
        replay_buffer,
        exploration=0.,
        discount_factor=discount_factor ** n_steps,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        writer=writer,
    )
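# Note on the `discount_factor ** n_steps` argument above (illustrative, standard
# n-step reasoning rather than a quote from the library): an n-step buffer stores
# transitions whose reward already sums n consecutive rewards discounted by gamma,
# so only the bootstrap term remains to be discounted, and it must be discounted by
# gamma ** n_steps:
#
#   G = r_0 + gamma * r_1 + ... + gamma**(n-1) * r_{n-1} + gamma**n * max_a Q(s_n, a)
#
# Passing discount_factor ** n_steps to the agent applies exactly that gamma**n factor.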
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = train_steps / self.hyperparameters['n_envs']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QNetwork(
        self.model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        writer=writer
    )
    policy = ParallelGreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            0,
            self.hyperparameters["final_exploration_step"] / self.hyperparameters["n_envs"],
            name="exploration",
            writer=writer
        )
    )
    return VQN(q, policy, discount_factor=self.hyperparameters['discount_factor'])
def _vsarsa(envs, writer=DummyWriter()):
    env = envs[0]
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(model, optimizer, writer=writer)
    policy = ParallelGreedyPolicy(q, env.action_space.n, epsilon=epsilon)
    return VSarsa(q, policy, discount_factor=discount_factor)
def agent(self, writer=DummyWriter(), train_steps=float('inf')): n_updates = train_steps / self.hyperparameters["min_batch_size"] feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"]) policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) features = FeatureNetwork( self.feature_model, feature_optimizer, scheduler=CosineAnnealingLR(feature_optimizer, n_updates), clip_grad=self.hyperparameters["clip_grad"], writer=writer ) v = VNetwork( self.value_model, value_optimizer, scheduler=CosineAnnealingLR(value_optimizer, n_updates), loss_scaling=self.hyperparameters["value_loss_scaling"], clip_grad=self.hyperparameters["clip_grad"], writer=writer ) policy = SoftmaxPolicy( self.policy_model, policy_optimizer, scheduler=CosineAnnealingLR(policy_optimizer, n_updates), clip_grad=self.hyperparameters["clip_grad"], writer=writer ) return DeepmindAtariBody( VPG(features, v, policy, discount_factor=self.hyperparameters["discount_factor"], min_batch_size=self.hyperparameters["min_batch_size"]), )
def agent(self, writer=DummyWriter(), train_steps=float('inf')): feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"]) value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"]) policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"]) features = FeatureNetwork(self.feature_model, feature_optimizer, clip_grad=self.hyperparameters["clip_grad"]) v = VNetwork(self.value_model, value_optimizer, clip_grad=self.hyperparameters["clip_grad"], writer=writer) policy = SoftmaxPolicy(self.policy_model, policy_optimizer, clip_grad=self.hyperparameters["clip_grad"], writer=writer) return A2C( features, v, policy, n_envs=self.hyperparameters["n_envs"], n_steps=self.hyperparameters["n_steps"], discount_factor=self.hyperparameters["discount_factor"], entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"], writer=writer)
def _vqn(envs, writer=DummyWriter()):
    env = envs[0]
    model = fc_relu_q(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(model, optimizer, writer=writer)
    policy = GreedyPolicy(q, env.action_space.n, epsilon=epsilon)
    return VQN(q, policy, discount_factor=discount_factor)
def _vpg(env, writer=DummyWriter()):
    feature_model = feature_model_constructor(env).to(device)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        writer=writer
    )
    return VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size)
def _ddqn(env, writer=DummyWriter()):
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    return DDQN(
        q,
        policy,
        replay_buffer,
        discount_factor=discount_factor,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size
    )
def _vac(envs, writer=DummyWriter()):
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(envs[0]).to(device)
    feature_model = feature_model_constructor().to(device)
    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer,
    )
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    return DeepmindAtariBody(
        VAC(features, v, policy, discount_factor=discount_factor),
    )
def _vpg(env, writer=DummyWriter()):
    feature_model = fc_relu_features(env).to(device)
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    v = VNetwork(value_model, value_optimizer, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    return VPG(features, v, policy, gamma=gamma, min_batch_size=min_batch_size)
def __init__(self,
             features,
             v,
             policy,
             buffer,
             action_space,
             discount_factor=0.99,
             sigma=1.0,
             sigma_decay=0.9995,
             sigma_min=0.1,
             n_iter=100,
             minibatch_size=32,
             log=True,
             writer=DummyWriter()):
    self.features = features
    self.v = v
    self.policy = policy
    self.replay_buffer = buffer
    self.minibatch_size = minibatch_size
    self.discount_factor = discount_factor
    self._log = log
    self.writer = writer
    self.sigma = sigma
    self.sigma_decay = sigma_decay
    self.sigma_min = sigma_min
    self.n_iter = n_iter
    self._features = None
    self._action = None
    self._state = None
    self._tde = None
    self._action_low = torch.tensor(action_space.low, device=policy.device).float()
    self._action_high = torch.tensor(action_space.high, device=policy.device).float()
def __init__(self,
             model,
             optimizer,
             clip_grad=0,
             loss_scaling=1,
             loss=mse_loss,
             name='approximation',
             scheduler=None,
             target=None,
             writer=DummyWriter(),
             checkpointer=None):
    self.model = model
    self.device = next(model.parameters()).device
    self._target = target or TrivialTarget()
    self._scheduler = scheduler
    self._target.init(model)
    self._updates = 0
    self._optimizer = optimizer
    self._loss = loss
    self._loss_scaling = loss_scaling
    self._cache = []
    self._clip_grad = clip_grad
    self._writer = writer
    self._name = name
    if checkpointer is None:
        checkpointer = PeriodicCheckpointer(DEFAULT_CHECKPOINT_FREQUENCY)
    self._checkpointer = checkpointer
    self._checkpointer.init(self.model, os.path.join(writer.log_dir, name + '.pt'))
def _vsarsa(envs, writer=DummyWriter()):
    env = envs[0]
    model = fc_relu_q(env).to(device)
    optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps)
    q = QNetwork(model, optimizer, env.action_space.n, writer=writer)
    policy = GreedyPolicy(q, env.action_space.n, epsilon=epsilon)
    return VSarsa(q, policy, gamma=gamma)
def _ddqn(env, writer=DummyWriter()):
    action_repeat = 1
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = SharedAutonomyPolicy(
        q,
        env.action_space.n,
        epsilon=0,
        pilot_tol=pilot_tol
    )
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            replay_buffer_size,
            alpha=alpha,
            beta=beta,
            device=device
        )
    else:
        replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return co_DDQN(
        q,
        policy,
        replay_buffer,
        loss=weighted_smooth_l1_loss,
        discount_factor=discount_factor,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency
    )
def _dqn(env, writer=DummyWriter()):
    model = fc_relu_q(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        target=FixedTarget(target_update_frequency),
        loss=mse_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DQN(
        q,
        policy,
        replay_buffer,
        discount_factor=discount_factor,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size
    )
def _c51(env, writer=DummyWriter()):
    model = fc_relu_dist_q(env, atoms=atoms).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return C51(
        q,
        replay_buffer,
        exploration=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer,
        ),
        discount_factor=discount_factor,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        writer=writer
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'])
    q = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return C51(
        q,
        replay_buffer,
        exploration=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            0,
            self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"],
            name="epsilon",
            writer=writer,
        ),
        discount_factor=self.hyperparameters["discount_factor"],
        minibatch_size=self.hyperparameters["minibatch_size"],
        replay_start_size=self.hyperparameters["replay_start_size"],
        update_frequency=self.hyperparameters["update_frequency"],
        writer=writer
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'])
    q = QNetwork(
        self.model,
        optimizer,
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DQN(
        q,
        policy,
        replay_buffer,
        discount_factor=self.hyperparameters['discount_factor'],
        minibatch_size=self.hyperparameters['minibatch_size'],
        replay_start_size=self.hyperparameters['replay_start_size'],
        update_frequency=self.hyperparameters['update_frequency'],
    )
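# Illustration (not part of the preset above): the exploration schedules passed to the
# greedy policies are simple linear anneals. A minimal stand-alone sketch, assuming the
# scheduler holds the initial value until `start` steps have elapsed and then
# interpolates to the final value over `duration` steps; the function name and default
# values below are hypothetical, chosen only for illustration:
def _linear_epsilon_sketch(t, initial=1.0, final=0.02, start=5000, duration=250000):
    # before the decay window begins, explore at the initial rate
    if t < start:
        return initial
    # fraction of the decay window completed, capped at 1
    fraction = min((t - start) / duration, 1.0)
    return initial + fraction * (final - initial)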
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    feature_model = feature_model_constructor(env).to(device)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad)
    v = VNetwork(
        value_model,
        value_optimizer,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer
    )
    return A2C(
        features,
        v,
        policy,
        n_envs=n_envs,
        n_steps=n_steps,
        discount_factor=discount_factor,
        entropy_loss_scaling=entropy_loss_scaling,
        writer=writer
    )
def __init__(self,
             policy,
             q_1,
             q_2,
             v,
             replay_buffer,
             discount_factor=0.99,
             entropy_target=-2.,
             lr_temperature=1e-4,
             minibatch_size=32,
             replay_start_size=5000,
             temperature_initial=0.1,
             update_frequency=1,
             writer=DummyWriter()):
    # objects
    self.policy = policy
    self.v = v
    self.q_1 = q_1
    self.q_2 = q_2
    self.replay_buffer = replay_buffer
    self.writer = writer
    # hyperparameters
    self.discount_factor = discount_factor
    self.entropy_target = entropy_target
    self.lr_temperature = lr_temperature
    self.minibatch_size = minibatch_size
    self.replay_start_size = replay_start_size
    self.temperature = temperature_initial
    self.update_frequency = update_frequency
    # private
    self._state = None
    self._action = None
    self._frames_seen = 0
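# Note on `entropy_target`, `lr_temperature`, and `temperature_initial` above
# (illustration, hedged; not necessarily this library's exact update rule): soft
# actor-critic implementations commonly adjust the temperature so that the policy's
# entropy tracks the target, for example
#   temperature += lr_temperature * mean(log_pi + entropy_target)
# i.e. when entropy (-log_pi) falls below the target, the temperature rises and the
# entropy bonus grows; when entropy exceeds the target, the temperature shrinks.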
def _vqn(envs, writer=DummyWriter()):
    action_repeat = 4
    final_exploration_timestep = final_exploration_frame / action_repeat
    env = envs[0]
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        writer=writer
    )
    policy = ParallelGreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            final_exploration_timestep,
            name="epsilon",
            writer=writer
        )
    )
    return DeepmindAtariBody(
        VQN(q, policy, discount_factor=discount_factor),
    )
def __init__(
        self,
        q_dist,
        replay_buffer,
        discount_factor=0.99,
        eps=1e-5,
        exploration=0.02,
        minibatch_size=32,
        replay_start_size=5000,
        update_frequency=1,
        writer=DummyWriter(),
):
    # objects
    self.q_dist = q_dist
    self.replay_buffer = replay_buffer
    self.writer = writer
    # hyperparameters
    self.eps = eps
    self.exploration = exploration
    self.replay_start_size = replay_start_size
    self.update_frequency = update_frequency
    self.minibatch_size = minibatch_size
    self.discount_factor = discount_factor
    # private
    self._state = None
    self._action = None
    self._frames_seen = 0
def __init__(self,
             features,
             v,
             policy,
             discount_factor=0.99,
             entropy_loss_scaling=0.01,
             epochs=4,
             epsilon=0.2,
             lam=0.95,
             minibatches=4,
             n_envs=None,
             n_steps=4,
             writer=DummyWriter()):
    if n_envs is None:
        raise RuntimeError("Must specify n_envs.")
    # objects
    self.features = features
    self.v = v
    self.policy = policy
    self.writer = writer
    # hyperparameters
    self.discount_factor = discount_factor
    self.entropy_loss_scaling = entropy_loss_scaling
    self.epochs = epochs
    self.epsilon = epsilon
    self.lam = lam
    self.minibatches = minibatches
    self.n_envs = n_envs
    self.n_steps = n_steps
    # private
    self._states = None
    self._actions = None
    self._batch_size = n_envs * n_steps
    self._buffer = self._make_buffer()
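# Illustration (not the agent's internal update code): `epsilon` in the constructor
# above is the PPO clip parameter. A minimal sketch of the clipped surrogate objective
# it controls, assuming `ratios` = pi_new(a|s) / pi_old(a|s) and `advantages` are
# precomputed tensors; the function name is hypothetical:
import torch

def _ppo_clip_loss_sketch(ratios, advantages, epsilon=0.2):
    # take the pessimistic minimum of the unclipped and clipped surrogate terms,
    # then negate so it can be minimized with gradient descent
    unclipped = ratios * advantages
    clipped = torch.clamp(ratios, 1 - epsilon, 1 + epsilon) * advantages
    return -torch.min(unclipped, clipped).mean()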
def _ppo(envs, writer=DummyWriter()):
    env = envs[0]
    feature_model = fc_relu_features(env).to(device)
    value_model = fc_value_head().to(device)
    policy_model = fc_policy_head(env).to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr)
    value_optimizer = Adam(value_model.parameters(), lr=lr)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr)
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad)
    v = VNetwork(value_model, value_optimizer, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(policy_model, policy_optimizer, clip_grad=clip_grad, writer=writer)
    return PPO(
        features,
        v,
        policy,
        epsilon=epsilon,
        epochs=epochs,
        lam=lam,
        minibatches=minibatches,
        n_envs=n_envs,
        n_steps=n_steps,
        discount_factor=discount_factor,
        entropy_loss_scaling=entropy_loss_scaling,
        writer=writer
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')): feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"]) policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) features = FeatureNetwork(self.feature_model, feature_optimizer, clip_grad=self.hyperparameters["clip_grad"], writer=writer) v = VNetwork(self.value_model, value_optimizer, loss_scaling=self.hyperparameters["value_loss_scaling"], clip_grad=self.hyperparameters["clip_grad"], writer=writer) policy = SoftmaxPolicy(self.policy_model, policy_optimizer, clip_grad=self.hyperparameters["clip_grad"], writer=writer) return VPG(features, v, policy, discount_factor=self.hyperparameters["discount_factor"], min_batch_size=self.hyperparameters["min_batch_size"])
def _ppo(envs, writer=DummyWriter()):
    env = envs[0]
    # The optimizers step epochs * minibatches times per PPO update, but an update
    # happens only once every n_steps timesteps across n_envs environments, with
    # 4 frames per timestep (frame skip), so the schedulers anneal over this many steps:
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs * 4)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_model = feature_model_constructor().to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )
    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(clip_initial, clip_final, 0, final_anneal_step, name='clip', writer=writer),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        ))
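# Worked example of the final_anneal_step formula above (hypothetical values, not taken
# from the preset): with last_frame=40_000_000, epochs=4, minibatches=4, n_steps=128,
# n_envs=8, and 4 frames per step, the schedulers would anneal over
# 40_000_000 * 4 * 4 / (128 * 8 * 4) = 156_250 optimizer steps.
assert 40_000_000 * 4 * 4 / (128 * 8 * 4) == 156_250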