def __init__(self, n_action, init_epsilon, final_epsilon, gamma, buffer_size,
             batch_size, replace_iter, annealing, learning_rate, ctx):
    self.n_action = n_action
    self.epsilon = init_epsilon
    self.init_epsilon = init_epsilon
    self.final_epsilon = final_epsilon
    # discount factor
    self.gamma = gamma
    # memory buffer size
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    # replace the parameters of the target network every `replace_iter` time steps
    self.replace_iter = replace_iter
    # the number of steps over which epsilon is linearly annealed to its minimum value
    self.annealing = annealing
    self.learning_rate = learning_rate
    self.ctx = ctx
    self.total_steps = 0
    # replay memory, backed by a deque
    self.replay_buffer = MemoryBuffer(self.buffer_size, ctx)
    # build the main and target networks and initialize their parameters
    self.target_network = DoubleQNetwork(n_action)
    self.main_network = DoubleQNetwork(n_action)
    self.target_network.collect_params().initialize(init.Xavier(), ctx=ctx)
    self.main_network.collect_params().initialize(init.Xavier(), ctx=ctx)
    # only the main network is optimized; the target network is updated by copying
    self.optimizer = gluon.Trainer(self.main_network.collect_params(), 'adam',
                                   {'learning_rate': self.learning_rate})
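# A minimal sketch (not from the source) of the epsilon-greedy action selection
# with linear annealing that the attributes above imply; the method name
# `choose_action` is an assumption, and `import random` plus
# `from mxnet import nd` are assumed at module level.
def choose_action(self, state):
    if random.random() < self.epsilon:
        # explore: pick a random action
        action = random.randint(0, self.n_action - 1)
    else:
        # exploit: pick the action with the highest predicted Q-value
        state = nd.array([state], ctx=self.ctx)
        action = int(nd.argmax(self.main_network(state), axis=1).asscalar())
    # linearly anneal epsilon from init_epsilon to final_epsilon over `annealing` steps
    if self.epsilon > self.final_epsilon:
        self.epsilon -= (self.init_epsilon - self.final_epsilon) / self.annealing
    return action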
def __init__(self, action_dim, action_bound, actor_learning_rate,
             critic_learning_rate, batch_size, memory_size, gamma, tau,
             explore_steps, policy_update, policy_noise, explore_noise,
             noise_clip, ctx):
    self.action_dim = action_dim
    self.action_bound = nd.array(action_bound, ctx=ctx)
    self.actor_learning_rate = actor_learning_rate
    self.critic_learning_rate = critic_learning_rate
    self.batch_size = batch_size
    self.memory_size = memory_size
    # discount factor
    self.gamma = gamma
    # soft-update coefficient for the target networks
    self.tau = tau
    # number of purely exploratory steps before training begins
    self.explore_steps = explore_steps
    # delayed policy updates: train the actor once every `policy_update` critic updates
    self.policy_update = policy_update
    # noise added to the target policy's action (target policy smoothing)
    self.policy_noise = policy_noise
    # noise added to actions during environment exploration
    self.explore_noise = explore_noise
    self.noise_clip = noise_clip
    self.ctx = ctx

    # one actor and two critics (TD3), each with a target copy
    self.main_actor_network = Actor(action_dim, self.action_bound)
    self.target_actor_network = Actor(action_dim, self.action_bound)
    self.main_critic_network1 = Critic()
    self.target_critic_network1 = Critic()
    self.main_critic_network2 = Critic()
    self.target_critic_network2 = Critic()
    for network in (self.main_actor_network, self.target_actor_network,
                    self.main_critic_network1, self.target_critic_network1,
                    self.main_critic_network2, self.target_critic_network2):
        network.collect_params().initialize(init=init.Xavier(), ctx=ctx)

    # only the main networks are optimized; the targets are soft-updated
    self.actor_optimizer = gluon.Trainer(
        self.main_actor_network.collect_params(),
        'adam', {'learning_rate': self.actor_learning_rate})
    self.critic1_optimizer = gluon.Trainer(
        self.main_critic_network1.collect_params(),
        'adam', {'learning_rate': self.critic_learning_rate})
    self.critic2_optimizer = gluon.Trainer(
        self.main_critic_network2.collect_params(),
        'adam', {'learning_rate': self.critic_learning_rate})

    self.total_steps = 0
    self.total_train_steps = 0
    self.memory_buffer = MemoryBuffer(buffer_size=self.memory_size, ctx=ctx)
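# A minimal sketch (not from the source) of the soft target-network update
# implied by `tau` above, i.e. target <- tau * main + (1 - tau) * target.
# The method name `soft_update` is an assumption; it relies on the main and
# target networks having structurally identical parameter dicts so that
# their sorted keys align.
def soft_update(self, main_network, target_network):
    main_params = main_network.collect_params()
    target_params = target_network.collect_params()
    for main_key, target_key in zip(sorted(main_params.keys()),
                                    sorted(target_params.keys())):
        target_params[target_key].set_data(
            self.tau * main_params[main_key].data()
            + (1.0 - self.tau) * target_params[target_key].data())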
def __init__(self, act_dim, env_dim, act_range, buffer_size=20000,
             gamma=0.99, lr=0.00005, tau=0.001):
    """Initialization"""
    # environment and DDPG hyperparameters
    self.act_dim = act_dim
    self.act_range = act_range
    self.env_dim = env_dim
    self.gamma = gamma
    self.lr = lr
    # create actor and critic networks; the actor uses a 10x smaller learning rate
    self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
    self.critic = Critic(self.env_dim, act_dim, lr, tau)
    self.buffer = MemoryBuffer(buffer_size)
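# Hypothetical usage (not from the source): constructing the agent for a
# Gym continuous-control task. The class name `DDPG` and the environment id
# are assumptions; the dimensions and action range are read off the
# environment's spaces.
import gym

env = gym.make('Pendulum-v1')
agent = DDPG(act_dim=env.action_space.shape[0],
             env_dim=env.observation_space.shape,
             act_range=env.action_space.high[0])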
def __init__(self,
             capacity_per_level=500000,
             warmup_steps=100000,
             n_frames=4,
             n_atoms=51,
             v_min=-1,
             v_max=0,
             gamma=.99,
             device='cuda',
             batch_size=48,
             lr=0.0000625 * 2,
             lr_decay=0.99,
             update_target_net_every=25000,
             train_every=6,
             frame_skip=4,
             disable_noisy_after=2000000,
             super_hexagon_path='C:\\Program Files (x86)\\Steam\\steamapps\\common\\Super Hexagon\\superhexagon.exe',
             run_afap=True):

    # training objects
    self.memory_buffer = MemoryBuffer(
        capacity_per_level,
        SuperHexagonInterface.n_levels,
        n_frames,
        SuperHexagonInterface.frame_size,
        SuperHexagonInterface.frame_size_cropped,
        gamma,
        device=device)
    self.net = Network(n_frames, SuperHexagonInterface.n_actions, n_atoms).to(device)
    self.target_net = Network(n_frames, SuperHexagonInterface.n_actions, n_atoms).to(device)
    self.target_net.load_state_dict(self.net.state_dict())
    self.optimizer = torch.optim.Adam(self.net.parameters(), lr=lr, eps=1.5e-4)
    self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
        self.optimizer, ExpLrDecay(lr_decay, min_factor=.1))

    # parameters
    self.batch_size = batch_size
    self.update_target_net_every = update_target_net_every
    self.train_every = train_every
    self.frame_skip = frame_skip
    self.disable_noisy_after = disable_noisy_after
    self.warmup_steps = warmup_steps
    self.gamma = gamma
    self.device = device

    # parameters for distributional
    self.n_atoms = n_atoms
    self.v_min = v_min
    self.v_max = v_max
    self.delta_z = (v_max - v_min) / (n_atoms - 1)
    self.support = torch.linspace(v_min, v_max, n_atoms, dtype=torch.float, device=device)
    self.offset = torch.arange(0, batch_size * n_atoms, n_atoms, device=device).view(-1, 1)
    self.m = torch.empty((batch_size, n_atoms), device=device)

    # debug and logging stuff
    self.list_steps_alive = [[] for _ in range(SuperHexagonInterface.n_levels)]
    self.longest_run = [(0, 0)] * SuperHexagonInterface.n_levels
    self.total_simulated_steps = [0] * SuperHexagonInterface.n_levels
    self.losses = []
    self.kls = []
    self.times = []
    self.iteration = 0

    self.super_hexagon_path = super_hexagon_path
    self.run_afap = run_afap
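# A minimal sketch (not from the source) of the categorical (C51) projection
# that the `support`, `offset`, `delta_z`, and `m` buffers above are set up
# for; the method name and argument names are assumptions. `rewards` and
# `nonterminal` have shape (batch_size,), and `next_dist` holds the target
# network's probabilities for the greedy next action, shape (batch_size, n_atoms).
def project_distribution(self, rewards, next_dist, nonterminal):
    # Bellman-update every atom of the support, then clamp to [v_min, v_max]
    tz = (rewards.unsqueeze(1)
          + nonterminal.unsqueeze(1) * self.gamma * self.support.unsqueeze(0))
    tz = tz.clamp(self.v_min, self.v_max)
    # fractional position of each projected atom on the fixed support
    b = (tz - self.v_min) / self.delta_z
    lower, upper = b.floor().long(), b.ceil().long()
    # when b lands exactly on an atom, lower == upper; nudge so no mass is lost
    lower[(upper > 0) & (lower == upper)] -= 1
    upper[(lower < self.n_atoms - 1) & (lower == upper)] += 1
    # distribute each atom's probability mass onto its two nearest support points
    self.m.zero_()
    self.m.view(-1).index_add_(0, (lower + self.offset).view(-1),
                               (next_dist * (upper.float() - b)).view(-1))
    self.m.view(-1).index_add_(0, (upper + self.offset).view(-1),
                               (next_dist * (b - lower.float())).view(-1))
    return self.m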