# Shared imports for the MXNet/Gluon agents in this section (DoubleDQN, DDPG, TD3).
# MemoryBuffer and the network classes (DoubleQNetwork, ActorNetwork, CriticNetwork,
# Actor, Critic) are defined elsewhere in the project.
import random
from functools import reduce

from mxnet import nd, gluon, init, autograd
from mxnet.gluon import loss as gloss


class DoubleDQN:
    def __init__(self, n_action, init_epsilon, final_epsilon, gamma,
                 buffer_size, batch_size, replace_iter, annealing,
                 learning_rate, ctx):
        self.n_action = n_action
        self.epsilon = init_epsilon
        self.init_epsilon = init_epsilon
        self.final_epsilon = final_epsilon
        # discount factor
        self.gamma = gamma
        # memory buffer size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        # replace the parameters of the target network every T time steps
        self.replace_iter = replace_iter
        # the number of steps it takes to linearly anneal epsilon to its minimum value
        self.annealing = annealing
        self.learning_rate = learning_rate
        self.ctx = ctx
        self.total_steps = 0
        self.replay_buffer = MemoryBuffer(self.buffer_size, ctx)  # uses a deque

        # build the networks
        self.target_network = DoubleQNetwork(n_action)
        self.main_network = DoubleQNetwork(n_action)
        # initialize the parameters
        self.target_network.collect_params().initialize(init.Xavier(), ctx=ctx)
        self.main_network.collect_params().initialize(init.Xavier(), ctx=ctx)
        # only the main network is optimized
        self.optimizer = gluon.Trainer(self.main_network.collect_params(), 'adam',
                                       {'learning_rate': self.learning_rate})

    def choose_action(self, state):
        state = nd.array([state], ctx=self.ctx)
        if nd.random.uniform(0, 1) > self.epsilon:
            # choose the best action
            q_value = self.main_network(state)
            action = int(nd.argmax(q_value, axis=1).asnumpy())
        else:
            # random choice
            action = random.choice(range(self.n_action))
        # anneal epsilon
        self.epsilon = max(
            self.final_epsilon,
            self.epsilon - (self.init_epsilon - self.final_epsilon) / self.annealing)
        self.total_steps += 1
        return action

    def update(self):
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = \
            self.replay_buffer.sample(self.batch_size)
        with autograd.record():
            # get Q(s, a) from the main network
            all_current_q_value = self.main_network(state_batch)
            main_q_value = nd.pick(all_current_q_value, action_batch)

            # different from DQN: select the greedy action for the next state with
            # the main network, then evaluate it with the target network
            all_next_q_value = self.target_network(next_state_batch).detach()  # only the main network gets gradients
            max_action = nd.argmax(self.main_network(next_state_batch), axis=1)
            target_q_value = nd.pick(all_next_q_value, max_action).detach()
            target_q_value = reward_batch + (1 - done_batch) * self.gamma * target_q_value

            # record the loss
            loss = gloss.L2Loss()
            value_loss = loss(target_q_value, main_q_value)
        self.main_network.collect_params().zero_grad()
        value_loss.backward()
        self.optimizer.step(batch_size=self.batch_size)

    def replace_parameters(self):
        self.main_network.save_parameters('Double_DQN_temp_params')
        self.target_network.load_parameters('Double_DQN_temp_params')
        print('Double_DQN parameters replaced')

    def save_parameters(self):
        self.target_network.save_parameters('Double_DQN_target_network_parameters')
        self.main_network.save_parameters('Double_DQN_main_network_parameters')

    def load_parameters(self):
        self.target_network.load_parameters('Double_DQN_target_network_parameters')
        self.main_network.load_parameters('Double_DQN_main_network_parameters')
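A minimal usage sketch (not part of the original source) for the DoubleDQN agent above: a classic Gym control loop that stores transitions, updates every step once enough experience has been collected, and syncs the target network every replace_iter steps. The store(...) call on the replay buffer is a hypothetical name, since the MemoryBuffer implementation is not shown here; the pre-0.26 Gym step/reset API is assumed.

import gym
import mxnet as mx

env = gym.make('CartPole-v1')
agent = DoubleDQN(n_action=env.action_space.n, init_epsilon=1.0, final_epsilon=0.05,
                  gamma=0.99, buffer_size=10000, batch_size=32, replace_iter=1000,
                  annealing=10000, learning_rate=1e-3, ctx=mx.cpu())

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.store(state, action, reward, next_state, done)  # hypothetical buffer API
        state = next_state
        if agent.total_steps > agent.batch_size:
            agent.update()
        if agent.total_steps % agent.replace_iter == 0:
            agent.replace_parameters()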
class DDPG:
    def __init__(self, action_dim, action_bound, actor_learning_rate,
                 critic_learning_rate, batch_size, memory_size, gamma, tau,
                 explore_steps, explore_noise, noise_clip, ctx):
        self.action_dim = action_dim
        self.action_bound = nd.array(action_bound, ctx=ctx)
        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.gamma = gamma
        self.tau = tau
        self.explore_steps = explore_steps
        self.explore_noise = explore_noise
        self.noise_clip = noise_clip
        self.ctx = ctx
        self.total_steps = 0
        self.memory_buffer = MemoryBuffer(self.memory_size, ctx=ctx)

        self.target_actor_network = ActorNetwork(self.action_dim, self.action_bound)
        self.main_actor_network = ActorNetwork(self.action_dim, self.action_bound)
        self.target_critic_network = CriticNetwork()
        self.main_critic_network = CriticNetwork()

        self.target_actor_network.collect_params().initialize(init=init.Xavier(), ctx=ctx)
        self.target_critic_network.collect_params().initialize(init=init.Xavier(), ctx=ctx)
        self.main_actor_network.collect_params().initialize(init=init.Xavier(), ctx=ctx)
        self.main_critic_network.collect_params().initialize(init=init.Xavier(), ctx=ctx)

        self.actor_optimizer = gluon.Trainer(
            self.main_actor_network.collect_params(), 'adam',
            {'learning_rate': self.actor_learning_rate})
        self.critic_optimizer = gluon.Trainer(
            self.main_critic_network.collect_params(), 'adam',
            {'learning_rate': self.critic_learning_rate})

    def choose_action_train(self, state):
        state = nd.array([state], ctx=self.ctx)
        action = self.main_actor_network(state)
        # no noise clip
        noise = nd.normal(loc=0, scale=self.explore_noise, shape=action.shape, ctx=self.ctx)
        action += noise
        clipped_action = self.action_clip(action)
        return clipped_action

    def choose_action_evaluate(self, state):
        state = nd.array([state], ctx=self.ctx)
        action = self.main_actor_network(state)
        return action

    def action_clip(self, action):
        low_bound = [
            float(self.action_bound[i][0].asnumpy()) for i in range(self.action_dim)
        ]
        high_bound = [
            float(self.action_bound[i][1].asnumpy()) for i in range(self.action_dim)
        ]
        bound = list(zip(low_bound, high_bound))
        # clip and reshape
        action_list = [
            nd.clip(action[:, i], bound[i][0], bound[i][1]).reshape(-1, 1)
            for i in range(self.action_dim)
        ]
        # concat
        clipped_action = reduce(nd.concat, action_list)
        return clipped_action.squeeze()

    def soft_update(self, target_network, main_network):
        target_parameters = target_network.collect_params().keys()
        main_parameters = main_network.collect_params().keys()
        d = zip(target_parameters, main_parameters)
        for x, y in d:
            target_network.collect_params()[x].data()[:] = \
                target_network.collect_params()[x].data() * (1 - self.tau) + \
                main_network.collect_params()[y].data() * self.tau

    def update(self):
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = \
            self.memory_buffer.sample(self.batch_size)

        # --------------- optimize critic ------------------
        with autograd.record():
            next_action_batch = self.target_actor_network(next_state_batch)
            next_q = self.target_critic_network(next_state_batch, next_action_batch).squeeze()
            target_q = reward_batch + (1 - done_batch) * self.gamma * next_q
            current_q = self.main_critic_network(state_batch, action_batch)
            loss = gloss.L2Loss()
            value_loss = loss(target_q.detach(), current_q)
        self.main_critic_network.collect_params().zero_grad()
        value_loss.backward()
        self.critic_optimizer.step(self.batch_size)

        # --------------- optimize actor -------------------
        with autograd.record():
            pred_action_batch = self.main_actor_network(state_batch)
            actor_loss = -nd.mean(self.main_critic_network(state_batch, pred_action_batch))
        self.main_actor_network.collect_params().zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step(1)

        self.soft_update(self.target_actor_network, self.main_actor_network)
        self.soft_update(self.target_critic_network, self.main_critic_network)

    def save(self):
        self.main_actor_network.save_parameters('DDPG Pendulum Main Actor.params')
        self.target_actor_network.save_parameters('DDPG Pendulum Target Actor.params')
        self.main_critic_network.save_parameters('DDPG Pendulum Main Critic.params')
        self.target_critic_network.save_parameters('DDPG Pendulum Target Critic.params')

    def load(self):
        self.main_actor_network.load_parameters('DDPG Pendulum Main Actor.params')
        self.target_actor_network.load_parameters('DDPG Pendulum Target Actor.params')
        self.main_critic_network.load_parameters('DDPG Pendulum Main Critic.params')
        self.target_critic_network.load_parameters('DDPG Pendulum Target Critic.params')
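A usage sketch (not part of the original source) for the Gluon DDPG agent above: uniform random actions for the first explore_steps environment steps, then noisy policy actions with one gradient update per step. action_bound is one (low, high) pair per action dimension, matching action_clip; the buffer's store(...) method is a hypothetical name and the classic Gym step/reset API is assumed.

import gym
import mxnet as mx

env = gym.make('Pendulum-v1')
bound = list(zip(env.action_space.low, env.action_space.high))  # one (low, high) per dimension
agent = DDPG(action_dim=env.action_space.shape[0], action_bound=bound,
             actor_learning_rate=1e-4, critic_learning_rate=1e-3,
             batch_size=64, memory_size=100000, gamma=0.99, tau=0.005,
             explore_steps=1000, explore_noise=0.1, noise_clip=0.5, ctx=mx.cpu())

for episode in range(100):
    state = env.reset()
    done = False
    while not done:
        if agent.total_steps < agent.explore_steps:
            action = env.action_space.sample()  # pure exploration at the start
        else:
            action = agent.choose_action_train(state).asnumpy().reshape(-1)
        next_state, reward, done, _ = env.step(action)
        agent.memory_buffer.store(state, action, reward, next_state, done)  # hypothetical buffer API
        agent.total_steps += 1
        state = next_state
        if agent.total_steps > agent.explore_steps:
            agent.update()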
import os
import pickle
from time import time

import numpy as np
import torch
import torch.nn.functional as F

# Project-local modules (defined elsewhere): MemoryBuffer, Network, ExpLrDecay,
# SuperHexagonInterface.


class Trainer:
    def __init__(
            self,
            capacity_per_level=500000,
            warmup_steps=100000,
            n_frames=4,
            n_atoms=51,
            v_min=-1,
            v_max=0,
            gamma=.99,
            device='cuda',
            batch_size=48,
            lr=0.0000625 * 2,
            lr_decay=0.99,
            update_target_net_every=25000,
            train_every=6,
            frame_skip=4,
            disable_noisy_after=2000000,
            super_hexagon_path='C:\\Program Files (x86)\\Steam\\steamapps\\common\\Super Hexagon\\superhexagon.exe',
            run_afap=True):

        # training objects
        self.memory_buffer = MemoryBuffer(
            capacity_per_level,
            SuperHexagonInterface.n_levels,
            n_frames,
            SuperHexagonInterface.frame_size,
            SuperHexagonInterface.frame_size_cropped,
            gamma,
            device=device)
        self.net = Network(n_frames, SuperHexagonInterface.n_actions, n_atoms).to(device)
        self.target_net = Network(n_frames, SuperHexagonInterface.n_actions, n_atoms).to(device)
        self.target_net.load_state_dict(self.net.state_dict())
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=lr, eps=1.5e-4)
        self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer, ExpLrDecay(lr_decay, min_factor=.1))

        # parameters
        self.batch_size = batch_size
        self.update_target_net_every = update_target_net_every
        self.train_every = train_every
        self.frame_skip = frame_skip
        self.disable_noisy_after = disable_noisy_after
        self.warmup_steps = warmup_steps
        self.gamma = gamma
        self.device = device

        # parameters for the distributional head
        self.n_atoms = n_atoms
        self.v_min = v_min
        self.v_max = v_max
        self.delta_z = (v_max - v_min) / (n_atoms - 1)
        self.support = torch.linspace(v_min, v_max, n_atoms, dtype=torch.float, device=device)
        self.offset = torch.arange(0, batch_size * n_atoms, n_atoms, device=device).view(-1, 1)
        self.m = torch.empty((batch_size, n_atoms), device=device)

        # debug and logging stuff
        self.list_steps_alive = [[] for _ in range(SuperHexagonInterface.n_levels)]
        self.longest_run = [(0, 0)] * SuperHexagonInterface.n_levels
        self.total_simulated_steps = [0] * SuperHexagonInterface.n_levels
        self.losses = []
        self.kls = []
        self.times = []
        self.iteration = 0

        self.super_hexagon_path = super_hexagon_path
        self.run_afap = run_afap

    def warmup(self, game, log_every):
        t = True
        for i in range(1, self.warmup_steps + 1):
            if i % log_every == 0:
                print('Warmup', i)
            if t:
                self.total_simulated_steps[game.level] += game.simulated_steps
                if self.total_simulated_steps[game.level] > self.total_simulated_steps[game.level - 1]:
                    game.select_level((game.level + 1) % 6)
                f, fc = game.reset()
                self.memory_buffer.insert_first(game.level, f, fc)
            a = np.random.randint(0, 3)
            (f, fc), r, t = game.step(a)
            self.memory_buffer.insert(game.level, a, r, t, f, fc)
        return t

    def train(self, save_every=50000, save_name='trainer', log_every=1000):
        game = SuperHexagonInterface(self.frame_skip,
                                     self.super_hexagon_path,
                                     run_afap=self.run_afap,
                                     allow_game_restart=True)

        # if the trainer was loaded, select the level that was played the least
        if any(x != 0 for x in self.total_simulated_steps):
            game.select_level(np.argmin(self.total_simulated_steps).item())

        # init state
        f, fc = np.zeros(game.frame_size, dtype=bool), np.zeros(game.frame_size_cropped, dtype=bool)
        sf = torch.zeros((1, 4, *game.frame_size), device=self.device)
        sfc = torch.zeros((1, 4, *game.frame_size_cropped), device=self.device)
        t = True

        # run warmup if necessary
        if self.iteration == 0:
            if os.path.exists('warmup_buffer.npz'):
                self.memory_buffer.load_warmup('warmup_buffer.npz')
            else:
                t = self.warmup(game, log_every)
                self.memory_buffer.save_warmup('warmup_buffer.npz')

        # training loop
        last_time = time()
        save_when_terminal = False
        while True:
            self.iteration += 1

            # disable noisy layers
            if self.iteration == self.disable_noisy_after:
                self.net.eval()
                self.target_net.eval()

            # log
            if self.iteration % log_every == 0 and all(len(l) > 0 for l in self.list_steps_alive):
                print(
                    f'{self.iteration} | '
                    f'{[round(np.mean(np.array(l[-100:])[:, 1]) / 60, 2) for l in self.list_steps_alive]}s | '
                    f'{[round(r[1] / 60, 2) for r in self.longest_run]}s | '
                    f'{self.total_simulated_steps} | '
                    f'{time() - last_time:.2f}s | '
                    f'{np.mean(self.losses[-log_every:])} | '
                    f'{np.mean(self.kls[-log_every:])} | '
                    f'{self.lr_scheduler.get_last_lr()[0]} | '
                    f'{game.level}')

            # indicate that the trainer should be saved the next time the agent dies
            if self.iteration % save_every == 0:
                save_when_terminal = True

            # update target net
            if self.iteration % self.update_target_net_every == 0:
                self.lr_scheduler.step()
                self.target_net.load_state_dict(self.net.state_dict())

            # if terminal
            if t:
                # select the next level if this level was played at least as long as the previous level
                if self.total_simulated_steps[game.level] > self.total_simulated_steps[game.level - 1]:
                    game.select_level((game.level + 1) % 6)
                f, fc = game.reset()
                self.memory_buffer.insert_first(game.level, f, fc)
                sf.zero_()
                sfc.zero_()

            # update state
            sf[0, 1:] = sf[0, :-1].clone()
            sfc[0, 1:] = sfc[0, :-1].clone()
            sf[0, 0] = torch.from_numpy(f).to(self.device)
            sfc[0, 0] = torch.from_numpy(fc).to(self.device)

            # train
            if self.iteration % self.train_every == 0:
                loss, kl = self.train_batch()
                self.losses.append(loss)
                self.kls.append(kl)

            # act
            with torch.no_grad():
                self.net.reset_noise()
                a = (self.net(sf, sfc) * self.support).sum(dim=2).argmax(dim=1).item()
            (f, fc), r, t = game.step(a)
            self.memory_buffer.insert(game.level, a, r, t, f, fc)

            # if terminal
            if t:
                if game.steps_alive > self.longest_run[game.level][1]:
                    self.longest_run[game.level] = (self.iteration, game.steps_alive)
                self.list_steps_alive[game.level].append((self.iteration, game.steps_alive))
                self.total_simulated_steps[game.level] += game.simulated_steps
                self.times.append(time() - last_time)

                if save_when_terminal:
                    print('saving...')
                    for _ in range(60):
                        game.game.step(False)
                    self.save(save_name)
                    for _ in range(60):
                        game.game.step(False)
                    save_when_terminal = False

    def train_batch(self):
        # sample a minibatch
        f, fc, a, r, t, f1, fc1 = self.memory_buffer.make_batch(self.batch_size)

        # compute the target q distribution (categorical projection)
        with torch.no_grad():
            self.target_net.reset_noise()
            qdn = self.target_net(f1, fc1)
            an = (qdn * self.support).sum(dim=2).argmax(dim=1)

            Tz = (r.unsqueeze(1) + t.logical_not().unsqueeze(1) * self.gamma * self.support).clamp_(
                self.v_min, self.v_max)
            b = (Tz - self.v_min) / self.delta_z
            l = b.floor().long()
            u = b.ceil().long()
            l[(u > 0) & (l == u)] -= 1
            u[(l == u)] += 1

            vdn = qdn.gather(
                1, an.view(-1, 1, 1).expand(self.batch_size, -1, self.n_atoms)
            ).view(self.batch_size, self.n_atoms)

            self.m.zero_()
            self.m.view(-1).index_add_(0, (l + self.offset).view(-1), (vdn * (u - b)).view(-1))
            self.m.view(-1).index_add_(0, (u + self.offset).view(-1), (vdn * (b - l)).view(-1))

        # forward and backward pass
        qld = self.net(f, fc, log=True)
        vld = qld.gather(
            1, a.view(-1, 1, 1).expand(self.batch_size, -1, self.n_atoms)
        ).view(self.batch_size, self.n_atoms)
        loss = -torch.sum(self.m * vld, dim=1).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        kl = F.kl_div(vld.detach(), self.m, reduction='batchmean')
        return loss.detach().item(), kl.item()

    def save(self, file_name='trainer'):
        # first back up the last save file in case anything goes wrong
        file_name_backup = file_name + '_backup'
        if os.path.exists(file_name):
            os.rename(file_name, file_name_backup)

        # save this object
        with open(file_name, 'wb') as f:
            pickle.dump(self, f)

        # remove the backup if nothing went wrong
        if os.path.exists(file_name_backup):
            os.remove(file_name_backup)

    @staticmethod
    def load(file_name='trainer'):
        with open(file_name, 'rb') as f:
            ret = pickle.load(f)
        assert ret.memory_buffer.last_was_terminal
        return ret
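A short sketch (not part of the original source) of how the Trainer above might be launched or resumed; since save() pickles the whole object, resuming is just unpickling it with Trainer.load.

import os

if __name__ == '__main__':
    if os.path.exists('trainer'):
        trainer = Trainer.load('trainer')   # resume from the pickled trainer
    else:
        trainer = Trainer()                 # default hyperparameters, device='cuda'
    trainer.train(save_every=50000, save_name='trainer', log_every=1000)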
class TD3:
    def __init__(self, action_dim, action_bound, actor_learning_rate,
                 critic_learning_rate, batch_size, memory_size, gamma, tau,
                 explore_steps, policy_update, policy_noise, explore_noise,
                 noise_clip, ctx):
        self.action_dim = action_dim
        self.action_bound = nd.array(action_bound, ctx=ctx)
        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.gamma = gamma
        self.tau = tau
        self.explore_steps = explore_steps
        self.policy_update = policy_update
        self.policy_noise = policy_noise
        self.explore_noise = explore_noise
        self.noise_clip = noise_clip
        self.ctx = ctx

        self.main_actor_network = Actor(action_dim, self.action_bound)
        self.target_actor_network = Actor(action_dim, self.action_bound)
        self.main_critic_network1 = Critic()
        self.target_critic_network1 = Critic()
        self.main_critic_network2 = Critic()
        self.target_critic_network2 = Critic()

        self.main_actor_network.collect_params().initialize(init=init.Xavier(), ctx=ctx)
        self.target_actor_network.collect_params().initialize(init=init.Xavier(), ctx=ctx)
        self.main_critic_network1.collect_params().initialize(init=init.Xavier(), ctx=ctx)
        self.target_critic_network1.collect_params().initialize(init=init.Xavier(), ctx=ctx)
        self.main_critic_network2.collect_params().initialize(init=init.Xavier(), ctx=ctx)
        self.target_critic_network2.collect_params().initialize(init=init.Xavier(), ctx=ctx)

        self.actor_optimizer = gluon.Trainer(
            self.main_actor_network.collect_params(), 'adam',
            {'learning_rate': self.actor_learning_rate})
        self.critic1_optimizer = gluon.Trainer(
            self.main_critic_network1.collect_params(), 'adam',
            {'learning_rate': self.critic_learning_rate})
        self.critic2_optimizer = gluon.Trainer(
            self.main_critic_network2.collect_params(), 'adam',
            {'learning_rate': self.critic_learning_rate})

        self.total_steps = 0
        self.total_train_steps = 0
        self.memory_buffer = MemoryBuffer(buffer_size=self.memory_size, ctx=ctx)

    def choose_action_train(self, state):
        state = nd.array([state], ctx=self.ctx)
        action = self.main_actor_network(state)
        # no noise clip
        noise = nd.normal(loc=0, scale=self.explore_noise, shape=action.shape, ctx=self.ctx)
        action += noise
        clipped_action = self.action_clip(action)
        return clipped_action

    # use this to choose actions when evaluating the agent
    def choose_action_evaluate(self, state):
        state = nd.array([state], ctx=self.ctx)
        action = self.main_actor_network(state)
        return action

    # After adding noise to an action, clip it back into the valid action bounds.
    # (There is probably a simpler way to do this.)
    def action_clip(self, action):
        low_bound = [
            float(self.action_bound[i][0].asnumpy()) for i in range(self.action_dim)
        ]
        high_bound = [
            float(self.action_bound[i][1].asnumpy()) for i in range(self.action_dim)
        ]
        bound = list(zip(low_bound, high_bound))
        # clip and reshape
        action_list = [
            nd.clip(action[:, i], bound[i][0], bound[i][1]).reshape(-1, 1)
            for i in range(self.action_dim)
        ]
        # concat
        clipped_action = reduce(nd.concat, action_list)
        return clipped_action.squeeze()

    def soft_update(self, target_network, main_network):
        target_parameters = target_network.collect_params().keys()
        main_parameters = main_network.collect_params().keys()
        d = zip(target_parameters, main_parameters)
        for x, y in d:
            target_network.collect_params()[x].data()[:] = \
                target_network.collect_params()[x].data() * (1 - self.tau) + \
                main_network.collect_params()[y].data() * self.tau

    def update(self):
        self.total_train_steps += 1
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = \
            self.memory_buffer.sample(self.batch_size)

        # -------------- optimize the critic networks --------------
        with autograd.record():
            # choose the next action according to the target policy network
            next_action_batch = self.target_actor_network(next_state_batch)
            noise = nd.normal(loc=0, scale=self.policy_noise,
                              shape=next_action_batch.shape, ctx=self.ctx)
            # clip the target policy smoothing noise
            noise = nd.clip(noise, a_min=-self.noise_clip, a_max=self.noise_clip)
            next_action_batch = next_action_batch + noise
            clipped_action = self.action_clip(next_action_batch)

            # target q value: take the minimum of the two target critics
            target_q_value1 = self.target_critic_network1(next_state_batch, clipped_action)
            target_q_value2 = self.target_critic_network2(next_state_batch, clipped_action)
            target_q_value = nd.minimum(target_q_value1, target_q_value2).squeeze()
            target_q_value = reward_batch + (1.0 - done_batch) * (self.gamma * target_q_value)

            # current q values
            current_q_value1 = self.main_critic_network1(state_batch, action_batch)
            current_q_value2 = self.main_critic_network2(state_batch, action_batch)
            loss = gloss.L2Loss()
            value_loss1 = loss(current_q_value1, target_q_value.detach())
            value_loss2 = loss(current_q_value2, target_q_value.detach())
        self.main_critic_network1.collect_params().zero_grad()
        value_loss1.backward()
        self.critic1_optimizer.step(self.batch_size)
        self.main_critic_network2.collect_params().zero_grad()
        value_loss2.backward()
        self.critic2_optimizer.step(self.batch_size)

        # -------------- optimize the actor network (delayed) --------------
        if self.total_train_steps % self.policy_update == 0:
            with autograd.record():
                pred_action_batch = self.main_actor_network(state_batch)
                actor_loss = -nd.mean(self.main_critic_network1(state_batch, pred_action_batch))
            self.main_actor_network.collect_params().zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step(1)

            self.soft_update(self.target_actor_network, self.main_actor_network)
            self.soft_update(self.target_critic_network1, self.main_critic_network1)
            self.soft_update(self.target_critic_network2, self.main_critic_network2)

    def save(self):
        self.main_actor_network.save_parameters('TD3 LunarLander main actor network.params')
        self.target_actor_network.save_parameters('TD3 LunarLander target actor network.params')
        # distinct file names so the two critics do not overwrite each other
        self.main_critic_network1.save_parameters('TD3 LunarLander main critic network 1.params')
        self.main_critic_network2.save_parameters('TD3 LunarLander main critic network 2.params')
        self.target_critic_network1.save_parameters('TD3 LunarLander target critic network 1.params')
        self.target_critic_network2.save_parameters('TD3 LunarLander target critic network 2.params')

    def load(self):
        self.main_actor_network.load_parameters('TD3 LunarLander main actor network.params')
        self.target_actor_network.load_parameters('TD3 LunarLander target actor network.params')
        self.main_critic_network1.load_parameters('TD3 LunarLander main critic network 1.params')
        self.main_critic_network2.load_parameters('TD3 LunarLander main critic network 2.params')
        self.target_critic_network1.load_parameters('TD3 LunarLander target critic network 1.params')
        self.target_critic_network2.load_parameters('TD3 LunarLander target critic network 2.params')
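A usage sketch (not part of the original source) for the TD3 agent, mirroring the DDPG loop: uniform random actions during the initial explore_steps, then noisy policy actions; update() trains the critics every call and delays the actor and target updates by policy_update steps as implemented above. The buffer's store(...) name is hypothetical and the classic Gym step/reset API is assumed.

import gym
import mxnet as mx

env = gym.make('LunarLanderContinuous-v2')
bound = list(zip(env.action_space.low, env.action_space.high))
agent = TD3(action_dim=env.action_space.shape[0], action_bound=bound,
            actor_learning_rate=3e-4, critic_learning_rate=3e-4,
            batch_size=100, memory_size=100000, gamma=0.99, tau=0.005,
            explore_steps=10000, policy_update=2, policy_noise=0.2,
            explore_noise=0.1, noise_clip=0.5, ctx=mx.cpu())

for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        if agent.total_steps < agent.explore_steps:
            action = env.action_space.sample()
        else:
            action = agent.choose_action_train(state).asnumpy().reshape(-1)
        next_state, reward, done, _ = env.step(action)
        agent.memory_buffer.store(state, action, reward, next_state, done)  # hypothetical buffer API
        agent.total_steps += 1
        state = next_state
        if agent.total_steps > agent.explore_steps:
            agent.update()  # critics every step, actor every policy_update steps
agent.save()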
class DDPG: """ Deep Deterministic Policy Gradient (DDPG) Helper Class""" def __init__(self, act_dim, env_dim, act_range, buffer_size=20000, gamma=0.99, lr=0.00005, tau=0.001): """Initialization""" # Environment and A2C parameters self.act_dim = act_dim self.act_range = act_range self.env_dim = env_dim self.gamma = gamma self.lr = lr # Create actor and critic networks self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau) self.critic = Critic(self.env_dim, act_dim, lr, tau) self.buffer = MemoryBuffer(buffer_size) def policy_action(self, s): """Use the actor to predict value""" return self.actor.predict(s)[0] def bellman(self, rewards, q_values, dones): """Use the Bellman Equation to compute the critic target""" critic_target = np.asarray(q_values) for i in range(q_values.shape[0]): if dones[i]: critic_target[i] = rewards[i] else: critic_target[i] = rewards[i] + self.gamma * q_values[i] return critic_target def memorize(self, state, action, reward, done, new_state): """ Store experience in memory buffer""" self.buffer.memorize(state, action, reward, done, new_state) def sample_batch(self, batch_size): return self.buffer.sample_batch(batch_size) def update_models(self, states, actions, critic_target): """ Update actor and critic networks from sampled experience""" # Train critic self.critic.train_on_batch(states, actions, critic_target) # Q-Value Gradients under Current Policy actions = self.actor.model.predict(states) grads = self.critic.gradients(states, actions) # Train actor self.actor.train(states, actions, np.array( grads).reshape((-1, self.act_dim))) # Transfer weights to target networks at rate Tau self.actor.transfer_weights() self.critic.transfer_weights() def train(self, env, summary_writer, nb_episodes=12, batch_size=64, render=False, gather_train_stats=False): results = [] # First, gather experience tqdm_e = tqdm(range(nb_episodes), desc='Score', leave=True, unit=" episodes") for e in tqdm_e: # Reset episode time, cumul_reward, done = 0, 0, False old_state = env.reset() actions, states, rewards = [], [], [] noise = OrnsteinUhlenbeckProcess(size=self.act_dim) while not done: if render: env.render() # Actor picks an action (following the deterministic policy) a = self.policy_action(old_state) # Clip continuous values to be valid w.r.t. 
environment a = np.clip(a+noise.generate(time), - self.act_range, self.act_range) # Retrieve new state, reward, and whether the state is terminal new_state, r, done, _ = env.step(a) # Add outputs to memory buffer self.memorize(old_state, a, r, done, new_state) # Sample experience from buffer states, actions, rewards, dones, new_states, _ = self.sample_batch( batch_size) # Predict target q-values using target networks q_values = self.critic.target_predict( [new_states, self.actor.target_predict(new_states)]) # Compute critic target critic_target = self.bellman(rewards, q_values, dones) # Train both networks on sampled batch, update target networks self.update_models(states, actions, critic_target) # Update current state old_state = new_state cumul_reward += r time += 1 # Gather stats every episode for plotting if(gather_train_stats): mean, stdev = gather_stats(self, env) results.append([e, mean, stdev]) # Export results for Tensorboard score = tf_summary('score', cumul_reward) summary_writer.add_summary(score, global_step=e) summary_writer.flush() # Display score tqdm_e.set_description("Score: " + str(cumul_reward)) tqdm_e.refresh() return results def save_weights(self, path): path += '_LR_{}'.format(self.lr) self.actor.save(path) self.critic.save(path) def load_weights(self, path_actor, path_critic): self.critic.load_weights(path_critic) self.actor.load_weights(path_actor)
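A usage sketch (not part of the original source) for the Keras DDPG class. Because train() calls summary_writer.add_summary(), a TensorFlow 1.x-style FileWriter is assumed; env_dim is passed here as the observation shape, but it should match whatever this project's Actor and Critic constructors expect, and the pre-0.26 Gym API is assumed as in the loop above.

import gym
import tensorflow as tf

env = gym.make('Pendulum-v0')
agent = DDPG(act_dim=env.action_space.shape[0],
             env_dim=env.observation_space.shape,
             act_range=env.action_space.high[0])

summary_writer = tf.summary.FileWriter('./logs')  # TF1-style writer to match add_summary()
results = agent.train(env, summary_writer, nb_episodes=200, batch_size=64, render=False)
agent.save_weights('./models/ddpg')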