Example 1
    def learn(self):
        # All four optimizers must already exist before a learning step runs.
        assert hasattr(self, 'q1_optim') and hasattr(self, 'q2_optim') and \
            hasattr(self, 'policy_optim') and hasattr(self, 'alpha_optim')

        self.learning_steps += 1

        # Sampling goes through prep_minibatch(), which does not return PER
        # importance-sampling weights, so the weights are fixed at 1.
        batch = self.prep_minibatch()
        weights = 1.

        q1_loss, q2_loss, errors, mean_q1, mean_q2 = \
            self.calc_critic_loss(batch, weights)
        policy_loss, entropies = self.calc_policy_loss(batch, weights)
        entropy_loss = self.calc_entropy_loss(entropies, weights)

        update_params(self.q1_optim, q1_loss)
        update_params(self.q2_optim, q2_loss)
        update_params(self.policy_optim, policy_loss)
        update_params(self.alpha_optim, entropy_loss)

        # Refresh the temperature from its updated log value.
        self.alpha = self.log_alpha.exp()

        if self.use_per:
            self.memory.update_priority(errors)

        if self.learning_steps % self.log_interval == 0:
            self.writer.add_scalar('loss/Q1',
                                   q1_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('loss/Q2',
                                   q2_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('loss/policy',
                                   policy_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('loss/alpha',
                                   entropy_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('stats/alpha',
                                   self.alpha.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('stats/mean_Q1', mean_q1,
                                   self.learning_steps)
            self.writer.add_scalar('stats/mean_Q2', mean_q2,
                                   self.learning_steps)
            self.writer.add_scalar('stats/entropy',
                                   entropies.detach().mean().item(),
                                   self.learning_steps)
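
Neither example shows update_params, which is called once per loss. The sketch below is a hedged guess at what such a helper typically does in SAC-style code; its signature and the retain_graph flag are assumptions, not taken from the source:

def update_params(optim, loss, retain_graph=False):
    # Assumed helper: clear stale gradients, backpropagate this loss, and take
    # a single optimizer step. retain_graph is hypothetical and only matters
    # if a later backward pass reuses the same graph.
    optim.zero_grad()
    loss.backward(retain_graph=retain_graph)
    optim.step()

With that shape, the four calls above (update_params(self.q1_optim, q1_loss), and so on) each run one gradient step for the corresponding network.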
Example 2
    def rad_learn(self):
        assert hasattr(self, 'q1_optim') and hasattr(self, 'q2_optim') and \
            hasattr(self, 'policy_optim') and hasattr(self, 'alpha_optim')

        # Table of the available RAD augmentations; only random crop is
        # selected below.
        augs_funcs = {}
        aug_to_func = {
            'crop': rad.random_crop,
            'grayscale': rad.random_grayscale,
            'cutout': rad.random_cutout,
            'cutout_color': rad.random_cutout_color,
            'flip': rad.random_flip,
            'rotate': rad.random_rotation,
            'rand_conv': rad.random_convolution,
            'color_jitter': rad.random_color_jitter,
            'translate': rad.random_translate,
            'no_aug': rad.no_aug,
        }
        aug_name = 'crop'
        augs_funcs[aug_name] = aug_to_func[aug_name]

        self.learning_steps += 1

        if self.use_per:
            batch, weights = self.memory.sample_rad(augs_funcs)
        else:
            batch = self.memory.sample_rad(augs_funcs)
            # Set priority weights to 1 when we don't use PER.
            weights = 1.

        q1_loss, q2_loss, errors, mean_q1, mean_q2 = \
            self.calc_critic_loss(batch, weights)
        policy_loss, entropies = self.calc_policy_loss(batch, weights)
        entropy_loss = self.calc_entropy_loss(entropies, weights)

        update_params(self.q1_optim, q1_loss)
        update_params(self.q2_optim, q2_loss)
        update_params(self.policy_optim, policy_loss)
        update_params(self.alpha_optim, entropy_loss)

        self.alpha = self.log_alpha.exp()

        if self.use_per:
            self.memory.update_priority(errors)

        if self.learning_steps % self.log_interval == 0:
            self.writer.add_scalar('loss/Q1',
                                   q1_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('loss/Q2',
                                   q2_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('loss/policy',
                                   policy_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('loss/alpha',
                                   entropy_loss.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('stats/alpha',
                                   self.alpha.detach().item(),
                                   self.learning_steps)
            self.writer.add_scalar('stats/mean_Q1', mean_q1,
                                   self.learning_steps)
            self.writer.add_scalar('stats/mean_Q2', mean_q2,
                                   self.learning_steps)
            self.writer.add_scalar('stats/entropy',
                                   entropies.detach().mean().item(),
                                   self.learning_steps)
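
sample_rad is likewise not shown. The helper below sketches the augmentation step it presumably performs, assuming each function in augs_funcs maps a batch of image observations (e.g. a (B, C, H, W) array) to an augmented batch; the name apply_rad_augs and the batch layout are illustrative assumptions, not the source's API:

def apply_rad_augs(obs_batch, aug_funcs):
    # Hypothetical helper: run every selected RAD augmentation over a batch of
    # observations. With augs_funcs = {'crop': rad.random_crop} this reduces to
    # a single random crop per call.
    for _name, func in aug_funcs.items():
        obs_batch = func(obs_batch)
    return obs_batch

A replay buffer's sample_rad would typically apply such a step to both the current and next observations before assembling the batch, so the critic and policy losses in rad_learn are computed on augmented views.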