Example #1
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):

        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm
        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(actor_critic.parameters(),
                                           lr,
                                           eps=eps,
                                           alpha=alpha)
Example #2
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False,
                 gradient_noise=0.0):

        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(actor_critic.parameters(),
                                           lr,
                                           eps=eps,
                                           alpha=alpha)
        self.gradient_noise = gradient_noise
Example #3
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 lr_beta=None,
                 reg_beta=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):

        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        self.beta_actor_list = []
        self.param_list = []
        for name, param in actor_critic.named_parameters():
            if "base.beta_net_actor" in name:
                self.beta_actor_list.append(param)
            else:
                self.param_list.append(param)

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            # Pierre: separate learning rates for beta net and actor net
            self.optimizer = optim.RMSprop(
                [{'params': self.param_list},
                 {'params': self.beta_actor_list,
                  'lr': lr_beta, 'weight_decay': reg_beta}],
                lr, eps=eps, alpha=alpha)
Example #4
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False,
                 train_selfsup_attention=False):

        self.actor_critic = actor_critic
        self.acktr = acktr
        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef
        self.max_grad_norm = max_grad_norm
        self.train_selfsup_attention = train_selfsup_attention

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(actor_critic.parameters(),
                                           lr,
                                           eps=eps,
                                           alpha=alpha)

        if self.train_selfsup_attention:
            self.selfsup_attention_optimizer = optim.Adam(
                actor_critic.base.selfsup_attention.parameters(), 0.001)
Example #5
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False,
                 path_recorder=None,
                 cost_evaluator=None,
                 arch_loss_coef=0):

        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        self.path_recorder = path_recorder
        self.cost_evaluator = cost_evaluator
        self.arch_loss_coef = arch_loss_coef

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(actor_critic.parameters(),
                                           lr,
                                           eps=eps,
                                           alpha=alpha)
Example #6
    def init_optimizer(self):
        if self.acktr:
            self.optimizer = KFACOptimizer(self.actor_critic)
        else:
            self.optimizer = optim.RMSprop(self.actor_critic.parameters(),
                                           self.lr,
                                           eps=self.eps,
                                           alpha=self.alpha)
            self.schedulers = ExponentialLR(self.optimizer, self.lr_decay)
Example #7
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 filter_mem=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):

        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        self.filter_list = []
        self.param_list = []
        for name, param in actor_critic.named_parameters():
            if "base.filter_net" in name:
                self.filter_list.append(param)
            else:
                self.param_list.append(param)

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(
                [{'params': self.param_list},
                 {'params': self.filter_list}],
                lr, eps=eps, alpha=alpha)
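Examples #3 and #7 (and #8 below) build RMSprop parameter groups so that one sub-network gets its own learning rate and, in some cases, its own weight decay. A minimal standalone sketch of that pattern; the module split and hyperparameter values here are illustrative and not taken from any of the repos above:

import torch.nn as nn
import torch.optim as optim

# Two-layer stand-in model; the second layer plays the role of e.g. "base.beta_net_actor".
model = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 2))
main_params = list(model[0].parameters())
special_params = list(model[1].parameters())

# Parameters in the second group override the default lr and add weight decay;
# everything else falls back to the defaults given after the group list.
optimizer = optim.RMSprop(
    [{'params': main_params},
     {'params': special_params, 'lr': 1e-4, 'weight_decay': 1e-3}],
    lr=7e-4, eps=1e-5, alpha=0.99)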
Example #8
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 lr_beta=None,
                 reg_beta=None,
                 delib_center=0.5,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):

        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm
        self.reg_beta = reg_beta

        self.delib_center = delib_center

        self.beta_value_list = []
        self.param_list = []
        for name, param in actor_critic.named_parameters():
            if "base.beta_value_net" in name:
                self.beta_value_list.append(param)
            else:
                self.param_list.append(param)

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(
                [{'params': self.param_list},
                 {'params': self.beta_value_list, 'lr': lr_beta}],
                lr, eps=eps, alpha=alpha)
Example #9
class A2C_ACKTR():
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):

        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(
                actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    def update(self, rollouts):
        obs_shape = rollouts.obs.size()[2:]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()

        values, action_log_probs, dist_entropy, _, _ = self.actor_critic.evaluate_actions(
            rollouts.obs[:-1].view(-1, *obs_shape),
            rollouts.recurrent_hidden_states[0].view(
                -1, self.actor_critic.recurrent_hidden_state_size),
            rollouts.masks[:-1].view(-1, 1),
            rollouts.actions.view(-1, action_shape))

        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(advantages.detach() * action_log_probs).mean()

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Compute fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        self.optimizer.zero_grad()
        (value_loss * self.value_loss_coef + action_loss -
         dist_entropy * self.entropy_coef).backward()

        if not self.acktr:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()

        return value_loss.item(), action_loss.item(), dist_entropy.item()
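Example #9 above is the first complete variant, showing both the constructor and update(). Most of the examples share its core loss arithmetic: a squared-error critic loss on the advantage, a policy-gradient term weighted by the detached advantage, and an entropy bonus. A self-contained sketch of just that arithmetic on dummy tensors (shapes and coefficients are illustrative, not taken from the examples):

import torch

num_steps, num_processes = 5, 4
value_loss_coef, entropy_coef = 0.5, 0.01

returns = torch.randn(num_steps, num_processes, 1)                # bootstrapped returns R_t
values = torch.randn(num_steps, num_processes, 1, requires_grad=True)
action_log_probs = torch.randn(num_steps, num_processes, 1, requires_grad=True)
dist_entropy = torch.tensor(1.2, requires_grad=True)              # mean policy entropy

advantages = returns - values                                     # A_t = R_t - V(s_t)
value_loss = advantages.pow(2).mean()                             # critic regression loss
action_loss = -(advantages.detach() * action_log_probs).mean()    # policy-gradient surrogate

total_loss = (value_loss * value_loss_coef
              + action_loss
              - dist_entropy * entropy_coef)
total_loss.backward()                                             # in practice, gradients flow into the model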
Example #10
class A2C_ACKTR():
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 filter_mem=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):

        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        self.filter_list = []
        self.param_list = []
        for name, param in actor_critic.named_parameters():
            if "base.filter_net" in name:
                self.filter_list.append(param)
            else:
                self.param_list.append(param)

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(
                [{'params': self.param_list},
                 {'params': self.filter_list}],
                lr, eps=eps, alpha=alpha)

    def update(self, rollouts, value_prev_eval, filter_mem_latent_eval,
               filter_type, filter_mem):
        obs_shape = rollouts.obs.size()[2:]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()

        values, action_log_probs, dist_entropy, _, value_prev_eval, filter_mem_latent_eval, att_list, grad_term = self.actor_critic.evaluate_actions(
            rollouts.obs[:-1],
            rollouts.recurrent_hidden_states[0],
            rollouts.masks[:-1],
            rollouts.actions,
            rollouts.att_target,
            value_prev_eval=value_prev_eval,
            filter_mem_latent_eval=filter_mem_latent_eval,
            filter_type=filter_type,
            filter_mem=filter_mem)

        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        #advantages = rollouts.returns[:-1] - grad_term.unsqueeze(2)
        #value_loss = advantages.pow(2).mean()
        advantages = rollouts.returns[:-1] - values
        value_loss = (
            -2 * advantages *
            grad_term.unsqueeze(2)).mean()  # credit-TD error, assignment-FIR
        action_loss = -(advantages.detach() * action_log_probs).mean()

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Sampled fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        self.optimizer.zero_grad()
        (value_loss * self.value_loss_coef + action_loss -
         dist_entropy * self.entropy_coef).backward(retain_graph=True)

        if not self.acktr:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()

        return (value_loss.item(), action_loss.item(), dist_entropy.item(),
                value_prev_eval, filter_mem_latent_eval, att_list)
Example #11
class A2C_ACKTR():
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 lr_beta=None,
                 reg_beta=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):

        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        self.beta_actor_list = []
        self.param_list = []
        for name, param in actor_critic.named_parameters():
            if "base.beta_net_actor" in name:
                self.beta_actor_list.append(param)
            else:
                self.param_list.append(param)

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            # Pierre: separate learning rates for beta net and actor net
            self.optimizer = optim.RMSprop(
                [{'params': self.param_list},
                 {'params': self.beta_actor_list,
                  'lr': lr_beta, 'weight_decay': reg_beta}],
                lr, eps=eps, alpha=alpha)

    def update(self, rollouts, eval_prev_mean):
        # Nishanth: modified shape to make compatible to the function call
        obs_shape = rollouts.obs.size()[2:]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()
        
        values, action_log_probs, dist_entropy, _, eval_prev_mean = self.actor_critic.evaluate_actions(
            rollouts.obs[:-1],
            rollouts.recurrent_hidden_states[0],
            rollouts.masks[:-1],
            rollouts.actions,
            eval_prev_mean)

        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(advantages.detach() * action_log_probs).mean()

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Sampled fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        self.optimizer.zero_grad()
        (value_loss * self.value_loss_coef + action_loss -
         dist_entropy * self.entropy_coef).backward(retain_graph=True)

        if not self.acktr:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()

        return value_loss.item(), action_loss.item(), dist_entropy.item(), eval_prev_mean
Example #12
class A2C_ACKTR():
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 lr_decay=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):

        self.algo = 'a2c'
        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef
        self.lr, self.lr_decay = lr, lr_decay
        self.eps, self.alpha = eps, alpha
        self.init_optimizer()

        self.max_grad_norm = max_grad_norm

    def init_optimizer(self):
        if self.acktr:
            self.optimizer = KFACOptimizer(self.actor_critic)
        else:
            self.optimizer = optim.RMSprop(self.actor_critic.parameters(),
                                           self.lr,
                                           eps=self.eps,
                                           alpha=self.alpha)
            self.schedulers = ExponentialLR(self.optimizer, self.lr_decay)

    def update(self, rollouts):
        obs_shape = rollouts.obs.size()[2:]
        feat_dim = rollouts.feats.size()[-1]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()

        values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions(
            rollouts.obs[:-1].view(-1, *obs_shape),
            rollouts.feats[:-1].view(-1, feat_dim),
            rollouts.recurrent_hidden_states[0].view(
                -1, self.actor_critic.recurrent_hidden_state_size),
            rollouts.active[:-1].view(-1),
            rollouts.actions.view(-1, action_shape))

        loss_mask = rollouts.active[:-1]  # srsohn
        divider = loss_mask.sum().detach()
        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = (rollouts.returns[:-1] - values) * loss_mask
        value_loss = advantages.pow(2).sum() / divider

        action_loss = -(advantages.detach() * action_log_probs).sum() / divider

        dist_entropy /= divider

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Sampled fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        self.optimizer.zero_grad()

        loss = (value_loss * self.value_loss_coef + action_loss -
                dist_entropy * self.entropy_coef)
        loss.backward()

        if not self.acktr:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()
        self.schedulers.step()  # lr decay

        return (value_loss.item() * self.value_loss_coef,
                action_loss.item(), dist_entropy.item())
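Example #12 is the only variant here that decays the learning rate: it wraps the RMSprop optimizer in an ExponentialLR scheduler and steps it once per update. A standalone sketch of that pattern (the model, gamma, and tensor shapes are placeholders):

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ExponentialLR

model = nn.Linear(4, 2)
optimizer = optim.RMSprop(model.parameters(), lr=7e-4, eps=1e-5, alpha=0.99)
scheduler = ExponentialLR(optimizer, gamma=0.99)

for _ in range(3):                    # one pass per call to update()
    optimizer.zero_grad()
    model(torch.randn(8, 4)).sum().backward()
    optimizer.step()
    scheduler.step()                  # lr <- lr * gamma after every update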
Example #13
class A2C_ACKTR():
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 lr_beta=None,
                 reg_beta=None,
                 delib_center=0.5,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False):

        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm
        self.reg_beta = reg_beta

        self.delib_center = delib_center

        self.beta_value_list = []
        self.param_list = []
        for name, param in actor_critic.named_parameters():
            if "base.beta_value_net" in name:
                self.beta_value_list.append(param)
            else:
                self.param_list.append(param)

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(
                [{'params': self.param_list},
                 {'params': self.beta_value_list, 'lr': lr_beta}],
                lr, eps=eps, alpha=alpha)

    def update(self, rollouts, eval_prev_value, eval_prev_rew):
        obs_shape = rollouts.obs.size()[2:]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()
        rewards = torch.cat((eval_prev_rew.unsqueeze(0), rollouts.rewards.squeeze(2)))

        values, action_log_probs, dist_entropy, _ , eval_prev_value, betas = self.actor_critic.evaluate_actions(
            rollouts.obs[:-1],
            rollouts.recurrent_hidden_states[0],
            rollouts.masks[:-1],
            rollouts.actions,
            eval_prev_value=eval_prev_value,
            eval_prev_rew=rewards)

        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(advantages.detach() * action_log_probs).mean()

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Sampled fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        if self.reg_beta > 0:
            target_beta = torch.zeros_like(betas).fill_(self.delib_center)
            delib_loss = F.mse_loss(betas, target_beta)
        else:
            delib_loss = torch.zeros_like(value_loss)

        self.optimizer.zero_grad()
        (value_loss * self.value_loss_coef + action_loss -
         dist_entropy * self.entropy_coef +
         self.reg_beta * delib_loss).backward(retain_graph=True)

        if not self.acktr:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()

        return value_loss.item(), action_loss.item(), dist_entropy.item(), eval_prev_value, delib_loss.item(), rewards[-1,:]
Example #14
class A2C_ACKTR():
    def __init__(self,
                 actor_critic,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 eps=None,
                 alpha=None,
                 max_grad_norm=None,
                 acktr=False,
                 path_recorder=None,
                 cost_evaluator=None,
                 arch_loss_coef=0):

        self.actor_critic = actor_critic
        self.acktr = acktr

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm

        self.path_recorder = path_recorder
        self.cost_evaluator = cost_evaluator
        self.arch_loss_coef = arch_loss_coef

        if acktr:
            self.optimizer = KFACOptimizer(actor_critic)
        else:
            self.optimizer = optim.RMSprop(actor_critic.parameters(),
                                           lr,
                                           eps=eps,
                                           alpha=alpha)

    def update(self, rollouts):
        obs_shape = rollouts.obs.size()[2:]
        action_shape = rollouts.actions.size()[-1]
        num_steps, num_processes, _ = rollouts.rewards.size()

        values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions(
            rollouts.obs[:-1].view(-1, *obs_shape),
            rollouts.recurrent_hidden_states[0].view(
                -1, self.actor_critic.recurrent_hidden_state_size),
            rollouts.masks[:-1].view(-1, 1),
            rollouts.actions.view(-1, action_shape))

        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(advantages.detach() * action_log_probs).mean()

        ### ARCH LOSS

        sampled, pruned = self.path_recorder.get_architectures(
            self.actor_critic.base.base.out_nodes)
        costs_s = self.cost_evaluator.get_costs(sampled)  # Sampled cost
        costs_p = self.cost_evaluator.get_costs(pruned)  # Pruned cost
        stacked_log_probas = torch.stack(
            self.actor_critic.base.base.log_probas)
        arch_reward = (value_loss * self.value_loss_coef +
                       action_loss) - self.arch_loss_coef * costs_p.mean()
        arch_loss = -(arch_reward * stacked_log_probas).mean()
        # print('Sampled={}, pruned={}'.format(costs_s, costs_p))
        ###

        if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
            # Sampled fisher, see Martens 2014
            self.actor_critic.zero_grad()
            pg_fisher_loss = -action_log_probs.mean()

            value_noise = torch.randn(values.size())
            if values.is_cuda:
                value_noise = value_noise.cuda()

            sample_values = values + value_noise
            vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()

            fisher_loss = pg_fisher_loss + vf_fisher_loss
            self.optimizer.acc_stats = True
            fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False

        self.optimizer.zero_grad()
        # print('Params: {}'.format(self.actor_critic.base.probas))
        # print('Params: {}'.format(self.actor_critic.base.base.probas))

        (value_loss * self.value_loss_coef + action_loss -
         dist_entropy * self.entropy_coef + arch_loss).backward()

        if not self.acktr:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)

        self.optimizer.step()

        return value_loss.item(), action_loss.item(), dist_entropy.item()
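The last example adds a REINFORCE-style architecture loss on top of the A2C objective: a scalar "reward" built from the task losses minus a compute-cost penalty weights the log-probabilities of the sampled architecture decisions. A self-contained sketch of that term with dummy values (all names and numbers are placeholders, not the repo's actual path_recorder / cost_evaluator outputs):

import torch

task_loss = torch.tensor(0.8)                    # stands in for value_loss * coef + action_loss
arch_loss_coef = 0.1
costs_pruned = torch.tensor([1.2, 0.9, 1.1])     # per-decision cost of the pruned architecture
log_probas = torch.randn(3, requires_grad=True)  # log-probs of the sampled architecture decisions

# REINFORCE-style surrogate: the scalar reward weights the log-probabilities, so
# gradients flow into the parameters of the architecture distribution.
arch_reward = task_loss - arch_loss_coef * costs_pruned.mean()
arch_loss = -(arch_reward * log_probas).mean()
arch_loss.backward()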