class TD3(object):
    """Classes implementing TD3 and DDPG off-policy learners

         Parameters:
               args (object): Parameter class
    """

    def to_cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.critic.cuda()

    def __init__(self, args):
        self.args = args

        self.actor = Actor(args)
        self.actor.apply(utils.init_weights)
        self.actor_target = Actor(args)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(args)
        self.critic.apply(utils.init_weights)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        self.hard_update(self.actor_target, self.actor)  # Make sure target starts with the same weights
        self.hard_update(self.critic_target, self.critic)

        self.actor_target.cuda()
        self.critic_target.cuda()
        self.actor.cuda()
        self.critic.cuda()
        self.num_critic_updates = 0

        # Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

             Parameters:
                   tensor (tensor): tensor
                   tracker (object): logger

             Returns:
                   None
        """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, dpp, num_epoch=1):
        """Runs a step of Bellman update and policy gradient using a batch of experiences

             Parameters:
                   state_batch (tensor): Current States
                   next_state_batch (tensor): Next States
                   action_batch (tensor): Actions
                   reward_batch (tensor): Rewards
                   done_batch (tensor): Done batch
                   dpp (bool): Use DPP-shaped advantage for the policy update
                   num_epoch (int): Number of learning iterations to run with the same data

             Returns:
                   None
        """
        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            # Compute next q-val, next_v and target
            with torch.no_grad():
                # Policy Noise
                policy_noise = np.random.normal(0, self.args.policy_noise,
                                                (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -self.args.policy_noise_clip, self.args.policy_noise_clip)

                # Compute next action_batch
                next_action_batch = self.actor_target.forward(next_state_batch) + policy_noise.cuda()
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                # Compute Q-val and value of next state masking by done
                q1, q2, next_val = self.critic_target.forward(next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2
                next_val = (1 - done_batch) * next_val
                next_q = torch.min(q1, q2)

                # Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)
                target_val = reward_batch + (self.gamma * next_val)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward((state_batch), (action_batch))
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)
            dt = dt + self.loss(current_val, target_val)
            self.compute_stats(current_val, self.val)
            dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())

            dt.backward()
            self.critic_optim.step()
            self.num_critic_updates += 1

            # Delayed Actor Update
            if self.num_critic_updates % self.args.policy_ups_freq == 0:

                actor_actions = self.actor.forward(state_batch)
                if dpp:
                    policy_loss = -self.shape_dpp(self.critic, self.actor, state_batch, self.args.sensor_model)
                else:
                    Q1, Q2, val = self.critic.forward(state_batch, actor_actions)
                    policy_loss = -(Q1 - val)

                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                self.actor_optim.zero_grad()
                policy_loss.backward(retain_graph=True)

                if self.args.action_loss:
                    action_loss = torch.abs(actor_actions - 0.5)
                    self.compute_stats(action_loss, self.action_loss)
                    action_loss = action_loss.mean() * self.args.action_loss_w
                    action_loss.backward()
                    # if self.action_loss[-1] > self.policy_loss[-1]: self.args.action_loss_w *= 0.9  # Decay action_w loss if action loss is larger than policy gradient loss

                self.actor_optim.step()

            if self.num_critic_updates % self.args.policy_ups_freq == 0:
                self.soft_update(self.actor_target, self.actor, self.tau)
                self.soft_update(self.critic_target, self.critic, self.tau)

    def soft_update(self, target, source, tau):
        """Soft update from target network to source

             Parameters:
                   target (object): A pytorch model
                   source (object): A pytorch model
                   tau (float): Tau parameter

             Returns:
                   None
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def hard_update(self, target, source):
        """Hard update (clone) from target network to source

             Parameters:
                   target (object): A pytorch model
                   source (object): A pytorch model

             Returns:
                   None
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def shape_dpp(self, critic, actor, state, sensor_model):
        Q1, _, val = critic((state), actor((state)))
        original_T = Q1 - val
        all_adv = [original_T]

        state = utils.to_numpy(state.cpu())
        # mid_index = int(180 / self.args.angle_res)
        coupling = self.args.coupling
        max_ind = int(360 / self.args.angle_res)

        perturb_index = [np.argwhere(state[i, 0:max_ind] != -1).flatten() for i in range(len(state))]
        for i, entry in enumerate(perturb_index):
            np.random.shuffle(entry)
            if len(entry) < coupling:
                perturb_index[i] = np.tile(entry, (coupling, 1)).flatten()

        for coupling_mag in range(coupling):
            empty_ind = [int(entry[coupling_mag]) for entry in perturb_index]

            if sensor_model == 'density':
                for i, ind in enumerate(empty_ind):
                    state[i, ind] = 1.0
            elif sensor_model == 'closest':
                for i, ind in enumerate(empty_ind):
                    state[i, ind] = 1.0

            shaped_state = utils.to_tensor(state).cuda()
            Q1, _, val = critic((shaped_state), actor((shaped_state)))
            adv = (Q1 - val) / (coupling_mag + 1)
            all_adv.append(adv)

        all_adv = torch.cat(all_adv, 1)
        dpp_max = torch.max(all_adv, 1)[0].unsqueeze(1)
        with torch.no_grad():
            normalizer = dpp_max / original_T
        return original_T * normalizer
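# ----------------------------------------------------------------------------
# Illustrative sketch (not used by the class above) of the clipped double-Q
# target with target policy smoothing that the critic update assembles.
# The toy networks and random tensors below are hypothetical stand-ins for the
# repo's Actor/Critic; they only demonstrate how target_q is formed.
# ----------------------------------------------------------------------------
def _td3_target_sketch():
    import torch
    import torch.nn as nn

    batch, state_dim, action_dim = 8, 10, 3
    gamma, policy_noise, noise_clip = 0.99, 0.2, 0.5

    # Toy stand-ins for actor_target / the twin target critics
    actor_target = nn.Sequential(nn.Linear(state_dim, action_dim), nn.Sigmoid())
    q1_net = nn.Linear(state_dim + action_dim, 1)
    q2_net = nn.Linear(state_dim + action_dim, 1)

    next_state = torch.randn(batch, state_dim)
    reward = torch.randn(batch, 1)
    done = torch.zeros(batch, 1)

    with torch.no_grad():
        # Target policy smoothing: clipped Gaussian noise on the target action
        noise = (torch.randn(batch, action_dim) * policy_noise).clamp(-noise_clip, noise_clip)
        next_action = (actor_target(next_state) + noise).clamp(0, 1)

        sa = torch.cat([next_state, next_action], dim=1)
        q1, q2 = q1_net(sa), q2_net(sa)

        # Done masking and clipped double-Q: keep the pessimistic estimate
        next_q = (1 - done) * torch.min(q1, q2)
        target_q = reward + gamma * next_q
    return target_q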
class TD3(object):
    """Classes implementing TD3 and DDPG off-policy learners"""

    def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau,
                 savetag, foldername, actualize, use_gpu, init_w=True):

        self.algo_name = algo_name
        self.gamma = gamma
        self.tau = tau
        self.total_update = 0
        self.agent_id = id
        self.actualize = actualize
        self.use_gpu = use_gpu
        self.tracker = utils.Tracker(foldername,
                                     ['q_' + savetag, 'qloss_' + savetag, 'policy_loss_' + savetag,
                                      'alz_score' + savetag, 'alz_policy' + savetag],
                                     '.csv', save_iteration=1000, conv_size=1000)

        # Initialize actors
        self.policy = Actor(state_dim, action_dim, hidden_size, policy_type='DeterministicPolicy')
        if init_w:
            self.policy.apply(utils.init_weights)
        self.policy_target = Actor(state_dim, action_dim, hidden_size, policy_type='DeterministicPolicy')
        utils.hard_update(self.policy_target, self.policy)
        self.policy_optim = Adam(self.policy.parameters(), actor_lr)

        self.critic = QNetwork(state_dim, action_dim, hidden_size)
        if init_w:
            self.critic.apply(utils.init_weights)
        self.critic_target = QNetwork(state_dim, action_dim, hidden_size)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        if actualize:
            self.ANetwork = ActualizationNetwork(state_dim, action_dim, hidden_size)
            if init_w:
                self.ANetwork.apply(utils.init_weights)
            self.actualize_optim = Adam(self.ANetwork.parameters(), critic_lr)
            self.actualize_lr = 0.2
            if use_gpu:
                self.ANetwork.cuda()

        self.loss = nn.MSELoss()

        if use_gpu:
            self.policy_target.cuda()
            self.critic_target.cuda()
            self.policy.cuda()
            self.critic.cuda()
        self.num_critic_updates = 0

        # Statistics Tracker
        # self.action_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
        self.policy_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
        self.q_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
        self.q = {'min': None, 'max': None, 'mean': None, 'std': None}
        self.alz_score = {'min': None, 'max': None, 'mean': None, 'std': None}
        self.alz_policy = {'min': None, 'max': None, 'mean': None, 'std': None}
        # self.val = {'min': None, 'max': None, 'mean': None, 'std': None}
        # self.value_loss = {'min': None, 'max': None, 'mean': None, 'std': None}

    def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch,
                          global_reward, num_epoch=1, **kwargs):
        """Runs a step of Bellman update and policy gradient using a batch of experiences

             Parameters:
                   state_batch (tensor): Current States
                   next_state_batch (tensor): Next States
                   action_batch (tensor): Actions
                   reward_batch (tensor): Rewards
                   done_batch (tensor): Done batch
                   global_reward (tensor): Global rewards (used by the Actualization Network)
                   num_epoch (int): Number of learning iterations to run with the same data

             Returns:
                   None
        """
        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)
            global_reward = torch.cat(global_reward)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            # Compute next q-val, next_v and target
            with torch.no_grad():
                # Policy Noise
                policy_noise = np.random.normal(0, kwargs['policy_noise'],
                                                (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])

                # Compute next action_batch
                next_action_batch = self.policy_target.clean_action(next_state_batch, return_only_action=True) + \
                    (policy_noise.cuda() if self.use_gpu else policy_noise)
                next_action_batch = torch.clamp(next_action_batch, -1, 1)

                # Compute Q-val and value of next state masking by done
                q1, q2 = self.critic_target.forward(next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2
                # next_val = (1 - done_batch) * next_val

                # Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min':
                    next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG':
                    next_q = q1
                elif self.algo_name == 'TD3_max':
                    next_q = torch.max(q1, q2)

                # Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)
                # if self.args.use_advantage: target_val = reward_batch + (self.gamma * next_val)

            if self.actualize:
                ########## Actualization Network Update ##########
                current_Ascore = self.ANetwork.forward(state_batch, action_batch)
                utils.compute_stats(current_Ascore, self.alz_score)
                target_Ascore = (self.actualize_lr) * (global_reward * 10.0) + (1 - self.actualize_lr) * current_Ascore.detach()
                actualize_loss = self.loss(target_Ascore, current_Ascore).mean()

            self.critic_optim.zero_grad()
            current_q1, current_q2 = self.critic.forward((state_batch), (action_batch))
            utils.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)
            # if self.args.use_advantage:
            #     dt = dt + self.loss(current_val, target_val)
            #     utils.compute_stats(current_val, self.val)
            if self.algo_name == 'TD3' or self.algo_name == 'TD3_max':
                dt = dt + self.loss(current_q2, target_q)
            utils.compute_stats(dt, self.q_loss)

            # if self.args.critic_constraint:
            #     if dt.item() > self.args.critic_constraint_w:
            #         dt = dt * (abs(self.args.critic_constraint_w / dt.item()))
            dt.backward()
            self.critic_optim.step()
            self.num_critic_updates += 1

            if self.actualize:
                self.actualize_optim.zero_grad()
                actualize_loss.backward()
                self.actualize_optim.step()

            # Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

                actor_actions = self.policy.clean_action(state_batch, return_only_action=False)

                # # Trust Region constraint
                # if self.args.trust_region_actor:
                #     with torch.no_grad(): old_actor_actions = self.actor_target.forward(state_batch)
                #     actor_actions = action_batch - old_actor_actions

                Q1, Q2 = self.critic.forward(state_batch, actor_actions)

                # if self.args.use_advantage: policy_loss = -(Q1 - val)
                policy_loss = -Q1
                utils.compute_stats(-policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                ### Actualize Policy Update
                if self.actualize:
                    A1 = self.ANetwork.forward(state_batch, actor_actions)
                    utils.compute_stats(A1, self.alz_policy)
                    policy_loss += -A1.mean() * 0.1

                self.policy_optim.zero_grad()

                policy_loss.backward(retain_graph=True)
                # nn.utils.clip_grad_norm_(self.actor.parameters(), 10)
                # if self.args.action_loss:
                #     action_loss = torch.abs(actor_actions - 0.5)
                #     utils.compute_stats(action_loss, self.action_loss)
                #     action_loss = action_loss.mean() * self.args.action_loss_w
                #     action_loss.backward()
                #     # if self.action_loss[-1] > self.policy_loss[-1]: self.args.action_loss_w *= 0.9  # Decay action_w loss if action loss is larger than policy gradient loss
                self.policy_optim.step()

            # if self.args.hard_update:
            #     if self.num_critic_updates % self.args.hard_update_freq == 0:
            #         if self.num_critic_updates % self.args.policy_ups_freq == 0: self.hard_update(self.actor_target, self.actor)
            #         self.hard_update(self.critic_target, self.critic)

            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                utils.soft_update(self.policy_target, self.policy, self.tau)
                utils.soft_update(self.critic_target, self.critic, self.tau)

            self.total_update += 1
            if self.agent_id == 0:
                self.tracker.update([self.q['mean'], self.q_loss['mean'], self.policy_loss['mean'],
                                     self.alz_score['mean'], self.alz_policy['mean']], self.total_update)
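# ----------------------------------------------------------------------------
# Illustrative sketch of the Polyak averaging done by utils.hard_update /
# utils.soft_update above: theta_target <- tau * theta + (1 - tau) * theta_target.
# The two nn.Linear modules are hypothetical stand-ins for policy/policy_target.
# ----------------------------------------------------------------------------
def _soft_update_sketch(tau=0.005):
    import torch
    import torch.nn as nn

    source = nn.Linear(4, 2)
    target = nn.Linear(4, 2)

    # Hard update: clone the source weights into the target
    with torch.no_grad():
        for t_p, s_p in zip(target.parameters(), source.parameters()):
            t_p.copy_(s_p)

    # Perturb the source, then soft-update the target toward it
    with torch.no_grad():
        for s_p in source.parameters():
            s_p.add_(torch.randn_like(s_p))
        for t_p, s_p in zip(target.parameters(), source.parameters()):
            t_p.mul_(1.0 - tau).add_(tau * s_p)

    # The target has moved a fraction tau of the way toward the source
    return target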
class PPO(object):
    """Class implementing a PPO on-policy learner

         Parameters:
               args (object): Parameter class
    """

    def __init__(self, args):
        self.args = args

        self.actor = Actor(args)
        if args.init_w:
            self.actor.apply(utils.init_weights)
        self.actor_target = Actor(args)
        self.optim = Adam(self.actor.parameters(), lr=5e-4)

        self.vfunc = ValueFunc(args)
        if args.init_w:
            self.vfunc.apply(utils.init_weights)

        self.gamma = args.gamma
        self.loss = nn.SmoothL1Loss()  # nn.MSELoss()

        # self.actor.cuda(); self.vfunc.cuda()
        self.num_critic_updates = 0

        # Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def compute_gae(self, trajectory, gamma=0.99, tau=0.95):
        with torch.no_grad():
            values = []
            rewards = []
            masks = []
            states = []
            actions = []
            for entry in trajectory:
                states.append(torch.tensor(entry[0]))
                actions.append(torch.tensor(entry[1]))
                values.append(self.vfunc(torch.Tensor(entry[0])))
                rewards.append(torch.Tensor(entry[3]))
                masks.append(torch.Tensor(entry[5]))
            values.append(self.vfunc(torch.Tensor(entry[2])))  # bootstrap with the value of the last next-state

            gae = 0.0
            returns = []
            for step in reversed(range(len(rewards))):
                delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
                gae = delta + gamma * tau * masks[step] * gae
                returns.insert(0, gae + values[step])

            return states, actions, values, returns

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

             Parameters:
                   tensor (tensor): tensor
                   tracker (object): logger

             Returns:
                   None
        """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self, states, actions, log_probs, returns, advantages, ppo_epochs=8,
                          mini_batch_size=128, clip_param=0.2):
        """Runs PPO epochs of clipped policy and value updates using a batch of experiences

             Parameters:
                   states (tensor): Current States
                   actions (tensor): Actions
                   log_probs (tensor): Log-probabilities of the actions under the old policy
                   returns (tensor): Return (GAE) targets
                   advantages (tensor): Advantage estimates
                   ppo_epochs (int): Number of PPO epochs to run with the same data
                   mini_batch_size (int): Mini-batch size sampled each epoch
                   clip_param (float): PPO clipping parameter

             Returns:
                   None
        """
        for _ in range(ppo_epochs):
            ind = random.sample(range(len(states)), mini_batch_size)
            mini_s = states[ind]
            mini_a = actions[ind]
            mini_ret = returns[ind]
            mini_adv = advantages[ind]

            # PPO Update
            new_action, value = self.actor(mini_s), self.vfunc(mini_s)
            ratio = mini_a - new_action
            surr1 = ratio * mini_adv
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * mini_adv
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = (mini_ret - value).pow(2).mean()

            loss = 0.5 * critic_loss + actor_loss

            self.optim.zero_grad()
            loss.backward()
            self.optim.step()

    def soft_update(self, target, source, tau):
        """Soft update from target network to source

             Parameters:
                   target (object): A pytorch model
                   source (object): A pytorch model
                   tau (float): Tau parameter

             Returns:
                   None
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def hard_update(self, target, source):
        """Hard update (clone) from target network to source

             Parameters:
                   target (object): A pytorch model
                   source (object): A pytorch model

             Returns:
                   None
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
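# ----------------------------------------------------------------------------
# Illustrative sketch of Generalized Advantage Estimation as computed in
# compute_gae above, on a hand-built toy trajectory of scalar rewards/values.
# The numbers are arbitrary; it only shows the backward recursion
#   delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
#   gae_t   = delta_t + gamma * lam * mask_t * gae_{t+1}
# ----------------------------------------------------------------------------
def _gae_sketch(gamma=0.99, lam=0.95):
    rewards = [1.0, 0.0, 0.5, 1.0]
    values = [0.8, 0.7, 0.6, 0.5, 0.4]   # len(rewards) + 1: includes the bootstrap V(s_T)
    masks = [1.0, 1.0, 1.0, 0.0]         # 0 where the episode terminated

    gae = 0.0
    returns = []
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * lam * masks[t] * gae
        returns.insert(0, gae + values[t])
    return returns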
class Off_Policy_Algo(object):
    """Classes implementing TD3 and DDPG off-policy learners"""

    def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w=True):
        self.algo_name = algo_name
        self.gamma = gamma
        self.tau = tau
        self.HLoss = HLoss()

        # Initialize actors
        self.actor = Actor(state_dim, action_dim, wwid, self.algo_name)
        if init_w:
            self.actor.apply(utils.init_weights)
        self.actor_target = Actor(state_dim, action_dim, wwid, self.algo_name)
        utils.hard_update(self.actor_target, self.actor)
        self.actor_optim = Adam(self.actor.parameters(), actor_lr)

        self.critic = Critic(state_dim, action_dim)
        if init_w:
            self.critic.apply(utils.init_weights)
        self.critic_target = Critic(state_dim, action_dim)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        if torch.cuda.is_available():
            self.actor_target.cuda()
            self.critic_target.cuda()
            self.actor.cuda()
            self.critic.cuda()
        self.num_critic_updates = 0

        # Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def save_net(self, path):
        torch.save(self.actor.state_dict(), path)

    def act(self, state):
        return self.actor(state)

    def share_memory(self):
        self.actor.share_memory()
        self.actor_target.share_memory()
        self.critic.share_memory()
        self.critic_target.share_memory()

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

             Parameters:
                   tensor (tensor): tensor
                   tracker (object): logger

             Returns:
                   None
        """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, num_epoch=1, **kwargs):
        """Runs a step of Bellman update and policy gradient using a batch of experiences

             Parameters:
                   state_batch (tensor): Current States
                   next_state_batch (tensor): Next States
                   action_batch (tensor): Actions
                   reward_batch (tensor): Rewards
                   done_batch (tensor): Done batch
                   num_epoch (int): Number of learning iterations to run with the same data

             Returns:
                   None
        """
        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            # Compute next q-val, next_v and target
            with torch.no_grad():
                # Policy Noise
                policy_noise = np.random.normal(0, kwargs['policy_noise'],
                                                (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])

                # Compute next action_batch
                # next_action_batch = self.actor_target.turn_max_into_onehot(self.actor_target.Gumbel_softmax_sample_distribution(next_state_batch, use_cuda=True)) \
                #     if self.algo_name == 'dis' else self.actor_target.forward(next_state_batch) + policy_noise.cuda()  # this should use one-hot from logits
                next_action_batch = self.actor_target.turn_max_into_onehot(self.actor_target.forward(next_state_batch)) \
                    if self.algo_name == 'dis' else self.actor_target.forward(next_state_batch) + policy_noise.cuda()  # this should use one-hot from logits
                if random.random() < 0.0001:
                    print('off_policy line 114, changed next action batch')
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                # Compute Q-val and value of next state masking by done
                q1, q2, _ = self.critic_target.forward(next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2

                # Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min' or self.algo_name == 'dis':
                    next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG':
                    next_q = q1
                elif self.algo_name == 'TD3_max':
                    next_q = torch.max(q1, q2)

                # Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward((state_batch), (action_batch))  # here the action batch should be the soft version
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)
            if self.algo_name == 'TD3' or self.algo_name == 'TD3_max' or self.algo_name == 'dis':
                dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())
            # print(dt.item(), "off_policy_algo line 136")

            dt.backward()
            self.critic_optim.step()
            self.num_critic_updates += 1

            # Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

                actor_actions = self.actor.Gumbel_softmax_sample_distribution(state_batch, use_cuda=True) \
                    if self.algo_name == 'dis' else self.actor.forward(state_batch)
                # actor_actions = self.actor.forward(state_batch)
                # if random.random() < 0.001: print('actor action changed')
                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                # if self.args.use_advantage: policy_loss = -(Q1 - val)
                policy_loss = -Q1 + 0.1 * self.HLoss(actor_actions)  # HLoss is a single scalar, directly regularizing the logits?
                if random.random() < 0.0005:
                    print('added entropy regularization, off_policy_algo 161')

                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()
                # print(policy_loss, 'off_policy line 157')

                self.actor_optim.zero_grad()
                policy_loss.backward(retain_graph=True)
                self.actor_optim.step()

                # if random.random() <= 0.001:
                #     self.test_actor_gradient_descent(state_batch)

            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                utils.soft_update(self.actor_target, self.actor, self.tau)
                utils.soft_update(self.critic_target, self.critic, self.tau)

    def test_actor_gradient_descent(self, state_batch):
        # This method tests whether running gradient descent on the actor actually decreases the loss
        print("test_actor_gradient_descent, off_policy_algo line 179")
        for i in range(10):
            actor_actions = self.actor.forward(state_batch)
            print("logits_", self.actor.w_out(self.actor.logits(state_batch))[0])
            print("action_batch", actor_actions[0])
            Q1, Q2, val = self.critic.forward(state_batch, actor_actions)
            policy_loss = -Q1
            policy_loss = policy_loss.mean()
            print("policy_loss at i = ", i, " is ", policy_loss)
            self.actor_optim.zero_grad()
            policy_loss.backward(retain_graph=True)
            print("gradient_", self.actor.f1.bias.grad[0])
            self.actor_optim.step()
            print("bias_", self.actor.f1.bias[0])
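# ----------------------------------------------------------------------------
# Illustrative sketch of the discrete-action ('dis') path above. The repo's
# Gumbel_softmax_sample_distribution / turn_max_into_onehot are actor methods
# not shown here; this uses PyTorch's generic F.gumbel_softmax on hypothetical
# toy logits to show the idea: a differentiable soft sample for the actor
# update, and a hard one-hot (straight-through) for acting / the critic target.
# ----------------------------------------------------------------------------
def _gumbel_softmax_sketch():
    import torch
    import torch.nn.functional as F

    logits = torch.randn(8, 5, requires_grad=True)   # hypothetical policy logits

    soft_actions = F.gumbel_softmax(logits, tau=1.0, hard=False)  # differentiable sample
    hard_actions = F.gumbel_softmax(logits, tau=1.0, hard=True)   # one-hot, straight-through gradient

    # A deterministic one-hot from the logits (argmax), analogous to turn_max_into_onehot
    onehot = F.one_hot(logits.argmax(dim=1), num_classes=logits.size(1)).float()
    return soft_actions, hard_actions, onehot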
class Off_Policy_Algo(object):
    """Classes implementing TD3 and DDPG off-policy learners"""

    def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w=True):
        self.algo_name = algo_name
        self.gamma = gamma
        self.tau = tau

        # Initialize actors
        self.actor = Actor(state_dim, action_dim, wwid)
        if init_w:
            self.actor.apply(utils.init_weights)
        self.actor_target = Actor(state_dim, action_dim, wwid)
        utils.hard_update(self.actor_target, self.actor)
        self.actor_optim = Adam(self.actor.parameters(), actor_lr)

        self.critic = Critic(state_dim, action_dim)
        if init_w:
            self.critic.apply(utils.init_weights)
        self.critic_target = Critic(state_dim, action_dim)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        self.actor_target.cuda()
        self.critic_target.cuda()
        self.actor.cuda()
        self.critic.cuda()
        self.num_critic_updates = 0

        # Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

             Parameters:
                   tensor (tensor): tensor
                   tracker (object): logger

             Returns:
                   None
        """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, num_epoch=1, **kwargs):
        """Runs a step of Bellman update and policy gradient using a batch of experiences

             Parameters:
                   state_batch (tensor): Current States
                   next_state_batch (tensor): Next States
                   action_batch (tensor): Actions
                   reward_batch (tensor): Rewards
                   done_batch (tensor): Done batch
                   num_epoch (int): Number of learning iterations to run with the same data

             Returns:
                   None
        """
        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            # Compute next q-val, next_v and target
            with torch.no_grad():
                # Policy Noise
                policy_noise = np.random.normal(0, kwargs['policy_noise'],
                                                (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])

                # Compute next action_batch
                next_action_batch = self.actor_target.forward(next_state_batch) + policy_noise.cuda()
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                # Compute Q-val and value of next state masking by done
                q1, q2, _ = self.critic_target.forward(next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2

                # Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min':
                    next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG':
                    next_q = q1
                elif self.algo_name == 'TD3_max':
                    next_q = torch.max(q1, q2)

                # Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward((state_batch), (action_batch))
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)
            if self.algo_name == 'TD3' or self.algo_name == 'TD3_max':
                dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())

            dt.backward()
            self.critic_optim.step()
            self.num_critic_updates += 1

            # Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

                actor_actions = self.actor.forward(state_batch)
                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                # if self.args.use_advantage: policy_loss = -(Q1 - val)
                policy_loss = -Q1
                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                self.actor_optim.zero_grad()
                policy_loss.backward(retain_graph=True)
                self.actor_optim.step()

            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                utils.soft_update(self.actor_target, self.actor, self.tau)
                utils.soft_update(self.critic_target, self.critic, self.tau)
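# ----------------------------------------------------------------------------
# Illustrative sketch of the delayed actor update above: the policy is pushed
# uphill on the critic by minimizing -Q(s, pi(s)). Tiny nn stand-ins replace
# the repo's Actor/Critic; only the actor's parameters take the gradient step.
# ----------------------------------------------------------------------------
def _actor_update_sketch():
    import torch
    import torch.nn as nn
    from torch.optim import Adam

    state_dim, action_dim, batch = 6, 2, 16
    actor = nn.Sequential(nn.Linear(state_dim, action_dim), nn.Tanh())
    critic = nn.Linear(state_dim + action_dim, 1)
    actor_optim = Adam(actor.parameters(), lr=1e-3)

    state = torch.randn(batch, state_dim)
    q = critic(torch.cat([state, actor(state)], dim=1))

    policy_loss = -q.mean()          # ascend the critic's estimate
    actor_optim.zero_grad()
    policy_loss.backward()           # gradients flow through the critic into the actor
    actor_optim.step()               # only the actor's parameters are updated here
    return policy_loss.item()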
class TD3_DDPG(object):
    """Classes implementing TD3 and DDPG off-policy learners

         Parameters:
               args (object): Parameter class
    """

    def __init__(self, args):
        self.args = args
        self.algo = args.algo

        self.actor = Actor(args)
        if args.init_w:
            self.actor.apply(utils.init_weights)
        self.actor_target = Actor(args)
        self.actor_optim = Adam(self.actor.parameters(), lr=5e-5)

        self.critic = Critic(args)
        if args.init_w:
            self.critic.apply(utils.init_weights)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=5e-4)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        self.hard_update(self.actor_target, self.actor)  # Make sure target starts with the same weights
        self.hard_update(self.critic_target, self.critic)

        self.actor_target.cuda()
        self.critic_target.cuda()
        self.actor.cuda()
        self.critic.cuda()
        self.num_critic_updates = 0

        # Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def compute_stats(self, tensor, tracker):
        """Computes stats from intermediate tensors

             Parameters:
                   tensor (tensor): tensor
                   tracker (object): logger

             Returns:
                   None
        """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, num_epoch=1):
        """Runs a step of Bellman update and policy gradient using a batch of experiences

             Parameters:
                   state_batch (tensor): Current States
                   next_state_batch (tensor): Next States
                   action_batch (tensor): Actions
                   reward_batch (tensor): Rewards
                   done_batch (tensor): Done batch
                   num_epoch (int): Number of learning iterations to run with the same data

             Returns:
                   None
        """
        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            # Compute next q-val, next_v and target
            with torch.no_grad():
                # Policy Noise
                policy_noise = np.random.normal(0, self.args.policy_noise,
                                                (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -self.args.policy_noise_clip, self.args.policy_noise_clip)

                # Compute next action_batch
                next_action_batch = self.actor_target.forward(next_state_batch) + policy_noise.cuda()
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                # Compute Q-val and value of next state masking by done
                q1, q2, next_val = self.critic_target.forward(next_state_batch, next_action_batch)
                if self.args.use_done_mask:
                    q1 = (1 - done_batch) * q1
                    q2 = (1 - done_batch) * q2
                    next_val = (1 - done_batch) * next_val

                # Clamp Q-vals
                if self.args.q_clamp is not None:
                    q1 = torch.clamp(q1, -self.args.q_clamp, self.args.q_clamp)
                    q2 = torch.clamp(q2, -self.args.q_clamp, self.args.q_clamp)

                # Select which q to use as next-q (depends on algo)
                if self.algo == 'TD3' or self.algo == 'TD3_actor_min':
                    next_q = torch.min(q1, q2)
                elif self.algo == 'DDPG':
                    next_q = q1
                elif self.algo == 'TD3_max':
                    next_q = torch.max(q1, q2)

                # Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)
                if self.args.use_advantage:
                    target_val = reward_batch + (self.gamma * next_val)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward((state_batch), (action_batch))
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)
            if self.args.use_advantage:
                dt = dt + self.loss(current_val, target_val)
                self.compute_stats(current_val, self.val)
            if self.algo == 'TD3' or self.algo == 'TD3_max':
                dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())

            if self.args.critic_constraint:
                if dt.item() > self.args.critic_constraint_w:
                    dt = dt * (abs(self.args.critic_constraint_w / dt.item()))
            dt.backward()
            self.critic_optim.step()
            self.num_critic_updates += 1

            # Delayed Actor Update
            if self.num_critic_updates % self.args.policy_ups_freq == 0:

                actor_actions = self.actor.forward(state_batch)

                # Trust Region constraint
                if self.args.trust_region_actor:
                    with torch.no_grad():
                        old_actor_actions = self.actor_target.forward(state_batch)
                    actor_actions = action_batch - old_actor_actions

                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                if self.args.use_advantage:
                    policy_loss = -(Q1 - val)
                else:
                    policy_loss = -Q1
                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                self.actor_optim.zero_grad()

                policy_loss.backward(retain_graph=True)
                # nn.utils.clip_grad_norm_(self.actor.parameters(), 10)
                if self.args.action_loss:
                    action_loss = torch.abs(actor_actions - 0.5)
                    self.compute_stats(action_loss, self.action_loss)
                    action_loss = action_loss.mean() * self.args.action_loss_w
                    action_loss.backward()
                    # if self.action_loss[-1] > self.policy_loss[-1]: self.args.action_loss_w *= 0.9  # Decay action_w loss if action loss is larger than policy gradient loss
                self.actor_optim.step()

            if self.args.hard_update:
                if self.num_critic_updates % self.args.hard_update_freq == 0:
                    if self.num_critic_updates % self.args.policy_ups_freq == 0:
                        self.hard_update(self.actor_target, self.actor)
                    self.hard_update(self.critic_target, self.critic)
            else:
                if self.num_critic_updates % self.args.policy_ups_freq == 0:
                    self.soft_update(self.actor_target, self.actor, self.tau)
                    self.soft_update(self.critic_target, self.critic, self.tau)

    def soft_update(self, target, source, tau):
        """Soft update from target network to source

             Parameters:
                   target (object): A pytorch model
                   source (object): A pytorch model
                   tau (float): Tau parameter

             Returns:
                   None
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def hard_update(self, target, source):
        """Hard update (clone) from target network to source

             Parameters:
                   target (object): A pytorch model
                   source (object): A pytorch model

             Returns:
                   None
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
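# ----------------------------------------------------------------------------
# Illustrative sketch of the critic update above: regress Q(s, a) toward a
# fixed Bellman target with an MSE loss and an Adam step. Toy tensors and a
# single-layer Q-network stand in for the repo's Critic.
# ----------------------------------------------------------------------------
def _critic_update_sketch(gamma=0.99):
    import torch
    import torch.nn as nn
    from torch.optim import Adam

    state_dim, action_dim, batch = 6, 2, 16
    critic = nn.Linear(state_dim + action_dim, 1)
    critic_optim = Adam(critic.parameters(), lr=1e-3)
    mse = nn.MSELoss()

    state = torch.randn(batch, state_dim)
    action = torch.rand(batch, action_dim)
    reward = torch.randn(batch, 1)
    done = torch.zeros(batch, 1)
    next_q = torch.randn(batch, 1)                     # pretend output of the target critic

    target_q = reward + gamma * (1 - done) * next_q    # fixed target (no gradient flows through it)
    current_q = critic(torch.cat([state, action], dim=1))

    loss = mse(current_q, target_q)
    critic_optim.zero_grad()
    loss.backward()
    critic_optim.step()
    return loss.item()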