def __init__(self,
             actor_critic,
             value_loss_coef,
             entropy_coef,
             lr=None,
             eps=None,
             alpha=None,
             max_grad_norm=None,
             acktr=False):
    self.actor_critic = actor_critic
    self.acktr = acktr

    self.value_loss_coef = value_loss_coef
    self.entropy_coef = entropy_coef

    self.max_grad_norm = max_grad_norm

    if acktr:
        # KFACOptimizer expects the model itself, not a list of models.
        self.optimizer = KFACOptimizer(actor_critic)
    else:
        self.optimizer = optim.RMSprop(
            actor_critic.parameters(), lr, eps=eps, alpha=alpha)
def __init__(self,
             actor_critic,
             value_loss_coef,
             entropy_coef,
             lr=None,
             eps=None,
             alpha=None,
             max_grad_norm=None,
             acktr=False,
             gradient_noise=0.0):
    self.actor_critic = actor_critic
    self.acktr = acktr

    self.value_loss_coef = value_loss_coef
    self.entropy_coef = entropy_coef

    self.max_grad_norm = max_grad_norm

    if acktr:
        self.optimizer = KFACOptimizer(actor_critic)
    else:
        self.optimizer = optim.RMSprop(
            actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    self.gradient_noise = gradient_noise
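# Hedged sketch (assumption; this fork's update() is not shown here). A
# `gradient_noise` coefficient like the one stored above is commonly applied by
# adding zero-mean Gaussian noise to every gradient just before clipping and
# the optimizer step. Minimal standalone helper:

import torch


def add_gradient_noise(parameters, noise_std):
    """Add N(0, noise_std^2) noise in-place to the gradients of `parameters`."""
    if noise_std <= 0:
        return
    for p in parameters:
        if p.grad is not None:
            p.grad.add_(torch.randn_like(p.grad) * noise_std)

# Illustrative call order inside an update step (not the fork's code):
#   loss.backward()
#   add_gradient_noise(self.actor_critic.parameters(), self.gradient_noise)
#   nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.max_grad_norm)
#   self.optimizer.step()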
def __init__(self,
             actor_critic,
             value_loss_coef,
             entropy_coef,
             lr=None,
             lr_beta=None,
             reg_beta=None,
             eps=None,
             alpha=None,
             max_grad_norm=None,
             acktr=False):
    self.actor_critic = actor_critic
    self.acktr = acktr

    self.value_loss_coef = value_loss_coef
    self.entropy_coef = entropy_coef

    self.max_grad_norm = max_grad_norm

    # Split parameters so the beta net can get its own learning rate and
    # weight decay (built before the branch so the RMSprop case can use them).
    self.beta_actor_list = []
    self.param_list = []
    for name, param in actor_critic.named_parameters():
        if "base.beta_net_actor" in name:
            self.beta_actor_list.append(param)
        else:
            self.param_list.append(param)

    if acktr:
        self.optimizer = KFACOptimizer(actor_critic)
    else:
        # Pierre: separate learning rates for beta net and actor net
        self.optimizer = optim.RMSprop(
            [{'params': self.param_list},
             {'params': self.beta_actor_list, 'lr': lr_beta,
              'weight_decay': reg_beta}],
            lr, eps=eps, alpha=alpha)
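# Context (standard PyTorch behaviour, not specific to this fork): torch.optim
# optimizers accept a list of parameter groups, and any option set inside a
# group overrides the optimizer-wide default for those parameters. That is what
# the RMSprop call above relies on to give the beta net its own lr/weight decay.
# Minimal self-contained illustration with a made-up two-part model:

import torch.nn as nn
import torch.optim as optim

model = nn.ModuleDict({
    'actor': nn.Linear(4, 2),
    'beta_net_actor': nn.Linear(4, 1),
})
optimizer = optim.RMSprop(
    [{'params': model['actor'].parameters()},              # uses the default lr below
     {'params': model['beta_net_actor'].parameters(),
      'lr': 1e-4, 'weight_decay': 1e-3}],                   # group-specific options
    lr=7e-4, eps=1e-5, alpha=0.99)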
def __init__(self,
             actor_critic,
             value_loss_coef,
             entropy_coef,
             lr=None,
             eps=None,
             alpha=None,
             max_grad_norm=None,
             acktr=False,
             train_selfsup_attention=False):
    self.actor_critic = actor_critic
    self.acktr = acktr

    self.value_loss_coef = value_loss_coef
    self.entropy_coef = entropy_coef

    self.max_grad_norm = max_grad_norm
    self.train_selfsup_attention = train_selfsup_attention

    if acktr:
        self.optimizer = KFACOptimizer(actor_critic)
    else:
        self.optimizer = optim.RMSprop(
            actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    if self.train_selfsup_attention:
        self.selfsup_attention_optimizer = optim.Adam(
            actor_critic.base.selfsup_attention.parameters(), 0.001)
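# Hedged sketch (assumption; this fork's auxiliary training step is not shown
# here): keeping a second optimizer for a sub-module usually means stepping it
# on its own loss, independently of the main RMSprop/KFAC step. Standalone toy
# example with made-up modules:

import torch
import torch.nn as nn
import torch.optim as optim

policy_net = nn.Linear(8, 4)          # stands in for the actor-critic
aux_net = nn.Linear(8, 8)             # stands in for base.selfsup_attention

main_optimizer = optim.RMSprop(policy_net.parameters(), lr=7e-4)
aux_optimizer = optim.Adam(aux_net.parameters(), lr=0.001)

x = torch.randn(16, 8)
aux_loss = (aux_net(x) - x).pow(2).mean()   # e.g. a reconstruction-style loss
aux_optimizer.zero_grad()
aux_loss.backward()
aux_optimizer.step()                         # only the auxiliary module is updated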
def __init__(self,
             actor_critic,
             value_loss_coef,
             entropy_coef,
             lr=None,
             eps=None,
             alpha=None,
             max_grad_norm=None,
             acktr=False,
             path_recorder=None,
             cost_evaluator=None,
             arch_loss_coef=0):
    self.actor_critic = actor_critic
    self.acktr = acktr

    self.value_loss_coef = value_loss_coef
    self.entropy_coef = entropy_coef

    self.max_grad_norm = max_grad_norm

    self.path_recorder = path_recorder
    self.cost_evaluator = cost_evaluator
    self.arch_loss_coef = arch_loss_coef

    if acktr:
        self.optimizer = KFACOptimizer(actor_critic)
    else:
        self.optimizer = optim.RMSprop(
            actor_critic.parameters(), lr, eps=eps, alpha=alpha)
def init_optimizer(self):
    if self.acktr:
        self.optimizer = KFACOptimizer(self.actor_critic)
    else:
        self.optimizer = optim.RMSprop(
            self.actor_critic.parameters(), self.lr, eps=self.eps, alpha=self.alpha)
    self.schedulers = ExponentialLR(self.optimizer, self.lr_decay)
def __init__(self,
             actor_critic,
             value_loss_coef,
             entropy_coef,
             lr=None,
             filter_mem=None,
             eps=None,
             alpha=None,
             max_grad_norm=None,
             acktr=False):
    self.actor_critic = actor_critic
    self.acktr = acktr

    self.value_loss_coef = value_loss_coef
    self.entropy_coef = entropy_coef

    self.max_grad_norm = max_grad_norm

    # Keep the filter-net parameters in their own group (built before the
    # branch so the RMSprop case can use them).
    self.filter_list = []
    self.param_list = []
    for name, param in actor_critic.named_parameters():
        if "base.filter_net" in name:
            self.filter_list.append(param)
        else:
            self.param_list.append(param)

    if acktr:
        self.optimizer = KFACOptimizer(actor_critic)
    else:
        self.optimizer = optim.RMSprop(
            [{'params': self.param_list},
             {'params': self.filter_list}],
            lr, eps=eps, alpha=alpha)
def __init__(self,
             actor_critic,
             value_loss_coef,
             entropy_coef,
             lr=None,
             lr_beta=None,
             reg_beta=None,
             delib_center=0.5,
             eps=None,
             alpha=None,
             max_grad_norm=None,
             acktr=False):
    self.actor_critic = actor_critic
    self.acktr = acktr

    self.value_loss_coef = value_loss_coef
    self.entropy_coef = entropy_coef

    self.max_grad_norm = max_grad_norm
    self.reg_beta = reg_beta
    self.delib_center = delib_center

    # Give the beta value net its own parameter group and learning rate
    # (built before the branch so the RMSprop case can use them).
    self.beta_value_list = []
    self.param_list = []
    for name, param in actor_critic.named_parameters():
        if "base.beta_value_net" in name:
            self.beta_value_list.append(param)
        else:
            self.param_list.append(param)

    if acktr:
        self.optimizer = KFACOptimizer(actor_critic)
    else:
        self.optimizer = optim.RMSprop(
            [{'params': self.param_list},
             {'params': self.beta_value_list, 'lr': lr_beta}],
            lr, eps=eps, alpha=alpha)
class A2C_ACKTR():
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):
        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(
                actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    def update(self, rollouts):
        obs_shape = rollouts.obs.size()[2:]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()

        values, action_log_probs, dist_entropy, _, _ = self.actor_critic.evaluate_actions(
            rollouts.obs[:-1].view(-1, *obs_shape),
            rollouts.recurrent_hidden_states[0].view(
                -1, self.actor_critic.recurrent_hidden_state_size),
            rollouts.masks[:-1].view(-1, 1),
            rollouts.actions.view(-1, action_shape))

        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(advantages.detach() * action_log_probs).mean()

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Compute fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        self.optimizer.zero_grad()
        (value_loss * self.value_loss_coef + action_loss -
         dist_entropy * self.entropy_coef).backward()

        if self.acktr == False:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()

        return value_loss.item(), action_loss.item(), dist_entropy.item()
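# Hedged usage sketch (assumption; the driver loop is not part of this
# section). The class above only needs an actor-critic exposing
# evaluate_actions() and recurrent_hidden_state_size, plus a rollout buffer
# with obs, recurrent_hidden_states, masks, actions, rewards and returns.
# A typical outer loop, at pseudocode level:
#
#   agent = A2C_ACKTR(actor_critic, value_loss_coef=0.5, entropy_coef=0.01,
#                     lr=7e-4, eps=1e-5, alpha=0.99, max_grad_norm=0.5)
#   for update_idx in range(num_updates):
#       # collect num_steps transitions into `rollouts` from the vectorized envs
#       # compute rollouts.returns (e.g. discounted or GAE returns) from rewards
#       value_loss, action_loss, dist_entropy = agent.update(rollouts)
#       rollouts.after_update()   # hypothetical: carry the last obs/hidden state over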
class A2C_ACKTR():
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 filter_mem=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):
        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        # Keep the filter-net parameters in their own group (built before the
        # branch so the RMSprop case can use them).
        self.filter_list = []
        self.param_list = []
        for name, param in actor_critic.named_parameters():
            if "base.filter_net" in name:
                self.filter_list.append(param)
            else:
                self.param_list.append(param)

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(
                [{'params': self.param_list},
                 {'params': self.filter_list}],
                lr, eps=eps, alpha=alpha)

    def update(self, rollouts, value_prev_eval, filter_mem_latent_eval,
               filter_type, filter_mem):
        obs_shape = rollouts.obs.size()[2:]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()

        (values, action_log_probs, dist_entropy, _, value_prev_eval,
         filter_mem_latent_eval, att_list, grad_term) = self.actor_critic.evaluate_actions(
            rollouts.obs[:-1],
            rollouts.recurrent_hidden_states[0],
            rollouts.masks[:-1],
            rollouts.actions,
            rollouts.att_target,
            value_prev_eval=value_prev_eval,
            filter_mem_latent_eval=filter_mem_latent_eval,
            filter_type=filter_type,
            filter_mem=filter_mem)

        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        # advantages = rollouts.returns[:-1] - grad_term.unsqueeze(2)
        # value_loss = advantages.pow(2).mean()
        advantages = rollouts.returns[:-1] - values
        # Credit: TD error; assignment: FIR filter term.
        value_loss = (-2 * advantages * grad_term.unsqueeze(2)).mean()

        action_loss = -(advantages.detach() * action_log_probs).mean()

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Sampled fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        self.optimizer.zero_grad()
        (value_loss * self.value_loss_coef + action_loss -
         dist_entropy * self.entropy_coef).backward(retain_graph=True)

        if self.acktr == False:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()

        return (value_loss.item(), action_loss.item(), dist_entropy.item(),
                value_prev_eval, filter_mem_latent_eval, att_list)
class A2C_ACKTR():
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 lr_beta=None,
                 reg_beta=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):
        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        # Split parameters so the beta net gets its own learning rate and
        # weight decay (built before the branch so the RMSprop case can use them).
        self.beta_actor_list = []
        self.param_list = []
        for name, param in actor_critic.named_parameters():
            if "base.beta_net_actor" in name:
                self.beta_actor_list.append(param)
            else:
                self.param_list.append(param)

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            # Pierre: separate learning rates for beta net and actor net
            self.optimizer = optim.RMSprop(
                [{'params': self.param_list},
                 {'params': self.beta_actor_list, 'lr': lr_beta,
                  'weight_decay': reg_beta}],
                lr, eps=eps, alpha=alpha)

    def update(self, rollouts, eval_prev_mean):
        # Nishanth: modified shapes to make them compatible with the function call
        obs_shape = rollouts.obs.size()[2:]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()

        values, action_log_probs, dist_entropy, _, eval_prev_mean = self.actor_critic.evaluate_actions(
            rollouts.obs[:-1],
            rollouts.recurrent_hidden_states[0],
            rollouts.masks[:-1],
            rollouts.actions,
            eval_prev_mean)

        # Reshape to match rollouts.returns[:-1], as in the sibling variants.
        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(advantages.detach() * action_log_probs).mean()

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Sampled fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        self.optimizer.zero_grad()
        (value_loss * self.value_loss_coef + action_loss -
         dist_entropy * self.entropy_coef).backward(retain_graph=True)

        if self.acktr == False:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()

        return (value_loss.item(), action_loss.item(), dist_entropy.item(),
                eval_prev_mean)
class A2C_ACKTR():
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 lr_decay=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):
        self.algo = 'a2c'
        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.lr, self.lr_decay = lr, lr_decay
        self.eps, self.alpha = eps, alpha
        self.init_optimizer()

        self.max_grad_norm = max_grad_norm

    def init_optimizer(self):
        if self.acktr:
            self.optimizer = KFACOptimizer(self.actor_critic)
        else:
            self.optimizer = optim.RMSprop(
                self.actor_critic.parameters(), self.lr, eps=self.eps, alpha=self.alpha)
        self.schedulers = ExponentialLR(self.optimizer, self.lr_decay)

    def update(self, rollouts):
        obs_shape = rollouts.obs.size()[2:]
        feat_dim = rollouts.feats.size()[-1]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()

        values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions(
            rollouts.obs[:-1].view(-1, *obs_shape),
            rollouts.feats[:-1].view(-1, feat_dim),
            rollouts.recurrent_hidden_states[0].view(
                -1, self.actor_critic.recurrent_hidden_state_size),
            rollouts.active[:-1].view(-1),
            rollouts.actions.view(-1, action_shape))

        loss_mask = rollouts.active[:-1]  # srsohn
        divider = loss_mask.sum().detach()

        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = (rollouts.returns[:-1] - values) * loss_mask
        value_loss = advantages.pow(2).sum() / divider

        action_loss = -(advantages.detach() * action_log_probs).sum() / divider
        dist_entropy /= divider

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Sampled fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        self.optimizer.zero_grad()
        loss = (value_loss * self.value_loss_coef + action_loss -
                dist_entropy * self.entropy_coef)
        loss.backward()

        if self.acktr == False:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()
        self.schedulers.step()  # lr decay

        return (value_loss.item() * self.value_loss_coef, action_loss.item(),
                dist_entropy.item())
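# Context (standard PyTorch behaviour, not specific to this fork): ExponentialLR
# multiplies every parameter group's learning rate by gamma each time .step()
# is called, so calling self.schedulers.step() once per update, as above, gives
# lr_t = lr_0 * lr_decay ** t. Minimal standalone check:

import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ExponentialLR

net = nn.Linear(4, 2)
opt = optim.RMSprop(net.parameters(), lr=7e-4)
sched = ExponentialLR(opt, gamma=0.99)

for _ in range(3):
    opt.step()    # no gradients here; shown only to keep the usual call order
    sched.step()
print(opt.param_groups[0]['lr'])  # ~= 7e-4 * 0.99**3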
class A2C_ACKTR():
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 lr_beta=None,
                 reg_beta=None,
                 delib_center=0.5,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):
        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm
        self.reg_beta = reg_beta
        self.delib_center = delib_center

        # Give the beta value net its own parameter group and learning rate
        # (built before the branch so the RMSprop case can use them).
        self.beta_value_list = []
        self.param_list = []
        for name, param in actor_critic.named_parameters():
            if "base.beta_value_net" in name:
                self.beta_value_list.append(param)
            else:
                self.param_list.append(param)

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(
                [{'params': self.param_list},
                 {'params': self.beta_value_list, 'lr': lr_beta}],
                lr, eps=eps, alpha=alpha)

    def update(self, rollouts, eval_prev_value, eval_prev_rew):
        obs_shape = rollouts.obs.size()[2:]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()

        rewards = torch.cat(
            (eval_prev_rew.unsqueeze(0), rollouts.rewards.squeeze(2)))

        values, action_log_probs, dist_entropy, _, eval_prev_value, betas = self.actor_critic.evaluate_actions(
            rollouts.obs[:-1],
            rollouts.recurrent_hidden_states[0],
            rollouts.masks[:-1],
            rollouts.actions,
            eval_prev_value=eval_prev_value,
            eval_prev_rew=rewards)

        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(advantages.detach() * action_log_probs).mean()

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Sampled fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        # Deliberation-cost regularizer: pull the termination probabilities
        # (betas) toward delib_center.
        if self.reg_beta > 0:
            target_beta = torch.zeros_like(betas).fill_(self.delib_center)
            delib_loss = F.mse_loss(betas, target_beta)
        else:
            delib_loss = torch.zeros_like(value_loss)

        self.optimizer.zero_grad()
        (value_loss * self.value_loss_coef + action_loss -
         dist_entropy * self.entropy_coef +
         self.reg_beta * delib_loss).backward(retain_graph=True)

        # Clip after backward so the freshly computed gradients are the ones
        # being clipped.
        if self.acktr == False:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()

        return (value_loss.item(), action_loss.item(), dist_entropy.item(),
                eval_prev_value, delib_loss.item(), rewards[-1, :])
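# Minimal standalone illustration (an assumption about intent, based only on
# the code above): the regularizer is an MSE that pulls the predicted betas
# toward a fixed center (0.5 by default), scaled by reg_beta in the total loss.

import torch
import torch.nn.functional as F

betas = torch.tensor([0.1, 0.9, 0.5, 0.7])
delib_center = 0.5
target = torch.full_like(betas, delib_center)
delib_loss = F.mse_loss(betas, target)   # mean((betas - 0.5)^2)
print(delib_loss.item())                  # 0.09 for these values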
class A2C_ACKTR():
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False,
                 path_recorder=None,
                 cost_evaluator=None,
                 arch_loss_coef=0):
        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        self.path_recorder = path_recorder
        self.cost_evaluator = cost_evaluator
        self.arch_loss_coef = arch_loss_coef

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(
                actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    def update(self, rollouts):
        obs_shape = rollouts.obs.size()[2:]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()

        values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions(
            rollouts.obs[:-1].view(-1, *obs_shape),
            rollouts.recurrent_hidden_states[0].view(
                -1, self.actor_critic.recurrent_hidden_state_size),
            rollouts.masks[:-1].view(-1, 1),
            rollouts.actions.view(-1, action_shape))

        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(advantages.detach() * action_log_probs).mean()

        ### ARCH LOSS
        sampled, pruned = self.path_recorder.get_architectures(
            self.actor_critic.base.base.out_nodes)
        costs_s = self.cost_evaluator.get_costs(sampled)  # Sampled cost
        costs_p = self.cost_evaluator.get_costs(pruned)   # Pruned cost

        stacked_log_probas = torch.stack(
            self.actor_critic.base.base.log_probas)
        arch_reward = (value_loss * self.value_loss_coef +
                       action_loss) - self.arch_loss_coef * costs_p.mean()
        arch_loss = -(arch_reward * stacked_log_probas).mean()
        # print('Sampled={}, pruned={}'.format(costs_s, costs_p))
        ###

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Sampled fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        self.optimizer.zero_grad()
        # print('Params: {}'.format(self.actor_critic.base.probas))
        # print('Params: {}'.format(self.actor_critic.base.base.probas))
        (value_loss * self.value_loss_coef + action_loss -
         dist_entropy * self.entropy_coef + arch_loss).backward()

        if self.acktr == False:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()

        return value_loss.item(), action_loss.item(), dist_entropy.item()
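# Hedged illustration (an assumption about intent, based only on the ARCH LOSS
# block above): the term is a score-function / REINFORCE-style loss in which the
# log-probabilities of the sampled architecture decisions are weighted by a
# scalar "reward" that mixes the task loss with a pruned-architecture cost
# penalty. Toy standalone version with made-up numbers:

import torch

# log-probabilities of three sampled architecture decisions
log_probas = torch.tensor([-0.36, -0.92, -0.11], requires_grad=True)
task_term = torch.tensor(1.2)        # stands in for the value/action loss mix
pruned_cost = torch.tensor(0.3)      # stands in for costs_p.mean()
arch_loss_coef = 0.1

arch_reward = task_term - arch_loss_coef * pruned_cost
arch_loss = -(arch_reward * log_probas).mean()
arch_loss.backward()                 # populates log_probas.grad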