def __init__(self, env, test_env, log_dir, num_steps=5 * (10 ** 7),
             batch_size=32, N=32, num_cosines=64, ent_coef=0, kappa=1.0,
             quantile_lr=5e-5, fraction_lr=2.5e-9, memory_size=10 ** 6,
             gamma=0.99, multi_step=1, update_interval=4,
             target_update_interval=10000, start_steps=50000,
             epsilon_train=0.01, epsilon_eval=0.001,
             epsilon_decay_steps=250000, double_q_learning=False,
             dueling_net=False, noisy_net=False, use_per=False,
             log_interval=100, eval_interval=250000, num_eval_steps=125000,
             max_episode_steps=27000, grad_cliping=None, cuda=True, seed=0):
    super(FQFAgent, self).__init__(
        env, test_env, log_dir, num_steps, batch_size, memory_size, gamma,
        multi_step, update_interval, target_update_interval, start_steps,
        epsilon_train, epsilon_eval, epsilon_decay_steps, double_q_learning,
        dueling_net, noisy_net, use_per, log_interval, eval_interval,
        num_eval_steps, max_episode_steps, grad_cliping, cuda, seed)

    # Online network.
    self.online_net = FQF(
        num_channels=env.observation_space.shape[0],
        num_actions=self.num_actions,
        N=N,
        num_cosines=num_cosines,
        dueling_net=dueling_net,
        noisy_net=noisy_net).to(self.device)
    # Target network.
    self.target_net = FQF(
        num_channels=env.observation_space.shape[0],
        num_actions=self.num_actions,
        N=N,
        num_cosines=num_cosines,
        dueling_net=dueling_net,
        noisy_net=noisy_net,
        target=True).to(self.device)

    # Copy parameters of the learning network to the target network.
    self.update_target()
    # Disable calculations of gradients of the target network.
    disable_gradients(self.target_net)

    self.fraction_optim = RMSprop(
        self.online_net.fraction_net.parameters(),
        lr=fraction_lr, alpha=0.95, eps=0.00001)

    self.quantile_optim = Adam(
        list(self.online_net.dqn_net.parameters())
        + list(self.online_net.cosine_net.parameters())
        + list(self.online_net.quantile_net.parameters()),
        lr=quantile_lr, eps=1e-2 / batch_size)

    # NOTE: The author reported that training of the fraction proposal
    # network is unstable, and that the value distribution occasionally
    # degenerates into a deterministic one (e.g. in 1 out of 20 seeds).
    # The entropy of the value distribution can therefore be used as a
    # regularizer to stabilize (but possibly slow down) training.
    self.ent_coef = ent_coef
    self.N = N
    self.num_cosines = num_cosines
    self.kappa = kappa
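# --- Hedged sketch (not part of the constructor above) -----------------
# Illustrates how `ent_coef` could act as the entropy regularizer
# mentioned in the NOTE: the mean entropy of the proposed fraction
# distribution is subtracted from the fraction loss. The names
# `fraction_loss` and `entropies` are assumptions for illustration; the
# actual loss computation lives in the agent's learning step.
def regularized_fraction_loss(fraction_loss, entropies, ent_coef):
    # Subtracting the mean entropy encourages the proposed fractions to
    # stay spread out, so the value distribution is less likely to
    # collapse into a (near-)deterministic one.
    return fraction_loss - ent_coef * entropies.mean()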
def __init__(self, env, test_env, log_dir, num_steps=5 * (10 ** 7),
             batch_size=32, N=200, kappa=1.0, lr=5e-5, memory_size=10 ** 6,
             gamma=0.99, multi_step=1, update_interval=4,
             target_update_interval=10000, start_steps=50000,
             epsilon_train=0.01, epsilon_eval=0.001,
             epsilon_decay_steps=250000, double_q_learning=False,
             dueling_net=False, noisy_net=False, use_per=False,
             log_interval=100, eval_interval=250000, num_eval_steps=125000,
             max_episode_steps=27000, grad_cliping=None, cuda=True, seed=0):
    super(QRDQNAgent, self).__init__(
        env, test_env, log_dir, num_steps, batch_size, memory_size, gamma,
        multi_step, update_interval, target_update_interval, start_steps,
        epsilon_train, epsilon_eval, epsilon_decay_steps, double_q_learning,
        dueling_net, noisy_net, use_per, log_interval, eval_interval,
        num_eval_steps, max_episode_steps, grad_cliping, cuda, seed)

    # Online network.
    self.online_net = QRDQN(
        num_channels=env.observation_space.shape[0],
        num_actions=self.num_actions,
        N=N,
        dueling_net=dueling_net,
        noisy_net=noisy_net).to(self.device)
    # Target network.
    self.target_net = QRDQN(
        num_channels=env.observation_space.shape[0],
        num_actions=self.num_actions,
        N=N,
        dueling_net=dueling_net,
        noisy_net=noisy_net).to(self.device)

    # Copy parameters of the learning network to the target network.
    self.update_target()
    # Disable calculations of gradients of the target network.
    disable_gradients(self.target_net)

    self.optim = Adam(
        self.online_net.parameters(),
        lr=lr, eps=1e-2 / batch_size)

    # Fixed fractions: midpoints of N evenly spaced quantile fractions.
    taus = torch.arange(
        0, N + 1, device=self.device, dtype=torch.float32) / N
    self.tau_hats = ((taus[1:] + taus[:-1]) / 2.0).view(1, N)

    self.N = N
    self.kappa = kappa
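# --- Hedged sketch (not part of the constructor above) -----------------
# Shows how the fixed fractions `tau_hats` and the threshold `kappa`
# typically enter the quantile Huber loss of QR-DQN. The helper name,
# shapes, and reduction order are assumptions for illustration only.
import torch

def quantile_huber_loss(td_errors, tau_hats, kappa=1.0):
    # td_errors: (batch_size, N, N) pairwise TD errors between target
    # and current quantile estimates; tau_hats: (1, N) fixed fractions.
    abs_errors = td_errors.abs()
    # Element-wise Huber loss with threshold kappa.
    huber = torch.where(
        abs_errors <= kappa,
        0.5 * td_errors ** 2,
        kappa * (abs_errors - 0.5 * kappa))
    # Asymmetric weight |tau_hat - 1{td_error < 0}| from quantile regression.
    weight = (tau_hats[..., None] - (td_errors.detach() < 0).float()).abs()
    return (weight * huber / kappa).sum(dim=1).mean()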
def __init__(self, env, test_env, log_dir, num_steps=5 * (10**7),
             batch_size=32, N=64, N_dash=64, K=32, num_cosines=64,
             kappa=1.0, lr=5e-5, memory_size=10**6, gamma=0.99,
             multi_step=1, update_interval=4, target_update_interval=10000,
             start_steps=50000, epsilon_train=0.01, epsilon_eval=0.001,
             epsilon_decay_steps=250000, double_q_learning=False,
             dueling_net=False, noisy_net=False, use_per=False,
             log_interval=100, eval_interval=250000, num_eval_steps=125000,
             max_episode_steps=27000, grad_cliping=None, cuda=True, seed=0):
    super(IQNAgent, self).__init__(
        env, test_env, log_dir, num_steps, batch_size, memory_size, gamma,
        multi_step, update_interval, target_update_interval, start_steps,
        epsilon_train, epsilon_eval, epsilon_decay_steps, double_q_learning,
        dueling_net, noisy_net, use_per, log_interval, eval_interval,
        num_eval_steps, max_episode_steps, grad_cliping, cuda, seed)

    # Online network.
    self.online_net = IQN(
        num_channels=env.observation_space.shape[0],
        num_actions=self.num_actions,
        K=K,
        num_cosines=num_cosines,
        dueling_net=dueling_net,
        noisy_net=noisy_net).to(self.device)
    # Target network.
    self.target_net = IQN(
        num_channels=env.observation_space.shape[0],
        num_actions=self.num_actions,
        K=K,
        num_cosines=num_cosines,
        dueling_net=dueling_net,
        noisy_net=noisy_net).to(self.device)

    # Copy parameters of the learning network to the target network.
    self.update_target()
    # Disable calculations of gradients of the target network.
    disable_gradients(self.target_net)

    self.optim = Adam(
        self.online_net.parameters(),
        lr=lr, eps=1e-2 / batch_size)

    self.N = N
    self.N_dash = N_dash
    self.K = K
    self.num_cosines = num_cosines
    self.kappa = kappa
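# --- Hedged sketch (not part of the constructor above) -----------------
# Illustrates how the three fraction counts are typically used in IQN:
# N fractions for the online quantiles and N_dash for the target
# quantiles (both resampled uniformly at every update), while K
# fractions estimate Q-values for greedy action selection. The helper
# name is an assumption for illustration.
import torch

def sample_fractions(batch_size, N, N_dash, K, device):
    taus = torch.rand(batch_size, N, device=device)            # online quantiles
    taus_dash = torch.rand(batch_size, N_dash, device=device)  # target quantiles
    taus_k = torch.rand(batch_size, K, device=device)          # action selection
    # Q(s, a) is then approximated as the mean over the K quantile values,
    # e.g. q = quantiles.mean(dim=1) for quantiles of shape
    # (batch_size, K, num_actions).
    return taus, taus_dash, taus_k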