Example #1
    def __init__(self,
                 envspec,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 use_target_action_noise=False,
                 gaussian_noise_sigma=0.2,
                 gaussian_noise_bound=0.2,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(envspec=envspec, **kwargs)
        self.discrete_tau = discrete_tau
        self.use_target_action_noise = use_target_action_noise
        self.gaussian_noise_sigma = gaussian_noise_sigma
        self.gaussian_noise_bound = gaussian_noise_bound

        if self.is_continuous:
            self.target_noised_action = ClippedNormalNoisedAction(
                sigma=self.gaussian_noise_sigma,
                noise_bound=self.gaussian_noise_bound)
            self.noised_action = OrnsteinUhlenbeckNoisedAction(sigma=0.2)
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DPG,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_continuous']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      network_settings=network_settings['q']))
        else:
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_discrete']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      network_settings=network_settings['q']))

        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self._worker_params_dict.update(self.net._policy_models)

        self._all_params_dict.update(self.net._all_models)
        self._all_params_dict.update(optimizer_actor=self.optimizer_actor,
                                     optimizer_critic=self.optimizer_critic)
        self._model_post_process()
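
A note on the two noise helpers constructed above: their exact behaviour is assumed here rather than taken from the project, but ClippedNormalNoisedAction presumably applies TD3-style clipped Gaussian noise to the target action and OrnsteinUhlenbeckNoisedAction adds temporally correlated exploration noise to the online action. A minimal NumPy sketch under those assumptions:

import numpy as np

def clipped_normal_noise(mu, sigma=0.2, noise_bound=0.2):
    # TD3-style target policy smoothing: Gaussian noise clipped to
    # [-noise_bound, noise_bound], then added to the deterministic action mu.
    noise = np.clip(np.random.normal(0.0, sigma, size=np.shape(mu)),
                    -noise_bound, noise_bound)
    return mu + noise

class OUNoise:
    # Ornstein-Uhlenbeck process: mean-reverting, temporally correlated
    # exploration noise added to the online policy's action.
    def __init__(self, size, sigma=0.2, theta=0.15, dt=1e-2):
        self.size, self.sigma, self.theta, self.dt = size, sigma, theta, dt
        self.x = np.zeros(size)

    def reset(self):
        self.x = np.zeros(self.size)

    def __call__(self, mu):
        self.x = self.x + self.theta * (0.0 - self.x) * self.dt \
                 + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        return mu + self.x
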
Example #2
File: a2c.py Project: zhijie-ai/RLs
    def __init__(
            self,
            envspec,
            epoch=5,
            beta=1.0e-3,
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            condition_sigma: bool = False,
            network_settings={
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            },
            **kwargs):
        super().__init__(envspec=envspec, **kwargs)
        self.beta = beta
        self.epoch = epoch

        if self.is_continuous:
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_MU_LOGSTD,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    condition_sigma=condition_sigma,
                    network_settings=network_settings['actor_continuous']),
                value_net_type=OutputNetworkType.CRITIC_VALUE,
                value_net_kwargs=dict(
                    network_settings=network_settings['critic']))
        else:
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_discrete']),
                value_net_type=OutputNetworkType.CRITIC_VALUE,
                value_net_kwargs=dict(
                    network_settings=network_settings['critic']))

        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self.initialize_data_buffer(
            sample_data_type=A2C_Train_BatchExperiences)

        self._worker_params_dict.update(self.net._policy_models)

        self._all_params_dict.update(self.net._all_models)
        self._all_params_dict.update(optimizer_actor=self.optimizer_actor,
                                     optimizer_critic=self.optimizer_critic)
        self._model_post_process()
Example #3
    def __init__(self,
                 envspec,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(envspec=envspec, **kwargs)
        self.discrete_tau = discrete_tau

        if self.is_continuous:
            # self.action_noise = NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
            self.action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DPG,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_continuous']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      network_settings=network_settings['q']))
        else:
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_discrete']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      network_settings=network_settings['q']))

        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self._worker_params_dict.update(self.net._policy_models)

        self._all_params_dict.update(self.net._all_models)
        self._all_params_dict.update(optimizer_actor=self.optimizer_actor,
                                     optimizer_critic=self.optimizer_critic)
        self._model_post_process()
Example #4
def _create_net(name, representation_net=None):
    return ACNetwork(
        name=name,
        representation_net=representation_net,
        policy_net_type=OutputNetworkType.ACTOR_DCT,
        policy_net_kwargs=dict(
            output_shape=self.a_dim,
            network_settings=network_settings['actor_discrete']),
        value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
        value_net_kwargs=dict(
            action_dim=self.a_dim,
            network_settings=network_settings['q']))
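
This factory closes over self.a_dim and network_settings, so it only makes sense nested inside an __init__. A hedged sketch of how such a factory is typically invoked; the attribute names below are illustrative, not taken from the project:

# hypothetical call sites inside __init__; both the attribute names and the
# target representation net are illustrative only
self.q_net = _create_net('q_net', self._representation_net)
self.q_target_net = _create_net('q_target_net', self._representation_target_net)
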
Example #5
File: ppo.py Project: zhijie-ai/RLs
    def __init__(
            self,
            envspec,
            policy_epoch: int = 4,
            value_epoch: int = 4,
            ent_coef: float = 1.0e-2,
            vf_coef: float = 0.5,
            lr: float = 5.0e-4,
            lambda_: float = 0.95,
            epsilon: float = 0.2,
            use_duel_clip: bool = False,
            duel_epsilon: float = 0.,
            use_vclip: bool = False,
            value_epsilon: float = 0.2,
            share_net: bool = True,
            actor_lr: float = 3e-4,
            critic_lr: float = 1e-3,
            max_grad_norm: float = 0.5,
            condition_sigma: bool = False,
            kl_reverse: bool = False,
            kl_target: float = 0.02,
            kl_target_cutoff: float = 2,
            kl_target_earlystop: float = 4,
            kl_beta: List[float] = [0.7, 1.3],
            kl_alpha: float = 1.5,
            kl_coef: float = 1.0,
            extra_coef: float = 1000.0,
            use_kl_loss: bool = False,
            use_extra_loss: bool = False,
            use_early_stop: bool = False,
            network_settings: Dict = {
                'share': {
                    'continuous': {
                        'share': [32, 32],
                        'mu': [32, 32],
                        'v': [32, 32]
                    },
                    'discrete': {
                        'share': [32, 32],
                        'logits': [32, 32],
                        'v': [32, 32]
                    }
                },
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            },
            **kwargs):
        super().__init__(envspec=envspec, **kwargs)
        self.ent_coef = ent_coef
        self.policy_epoch = policy_epoch
        self.value_epoch = value_epoch
        self.lambda_ = lambda_
        assert 0.0 <= lambda_ <= 1.0, "GAE lambda should be in [0, 1]."
        self.epsilon = epsilon
        self.use_vclip = use_vclip
        self.value_epsilon = value_epsilon
        self.share_net = share_net
        self.kl_reverse = kl_reverse
        self.kl_target = kl_target
        self.kl_alpha = kl_alpha
        self.kl_coef = tf.constant(kl_coef, dtype=tf.float32)
        self.extra_coef = extra_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm

        self.use_duel_clip = use_duel_clip
        self.duel_epsilon = duel_epsilon
        if self.use_duel_clip:
            assert -self.epsilon < self.duel_epsilon < self.epsilon, "duel_epsilon should lie in the open interval (-epsilon, epsilon)."

        self.kl_cutoff = kl_target * kl_target_cutoff
        self.kl_stop = kl_target * kl_target_earlystop
        self.kl_low = kl_target * kl_beta[0]
        self.kl_high = kl_target * kl_beta[-1]

        self.use_kl_loss = use_kl_loss
        self.use_extra_loss = use_extra_loss
        self.use_early_stop = use_early_stop

        if self.share_net:
            if self.is_continuous:
                self.net = ValueNetwork(
                    name='net',
                    representation_net=self._representation_net,
                    value_net_type=OutputNetworkType.ACTOR_CRITIC_VALUE_CTS,
                    value_net_kwargs=dict(
                        output_shape=self.a_dim,
                        condition_sigma=condition_sigma,
                        network_settings=network_settings['share']
                        ['continuous']))
            else:
                self.net = ValueNetwork(
                    name='net',
                    representation_net=self._representation_net,
                    value_net_type=OutputNetworkType.ACTOR_CRITIC_VALUE_DCT,
                    value_net_kwargs=dict(
                        output_shape=self.a_dim,
                        network_settings=network_settings['share']
                        ['discrete']))
            self.lr = self.init_lr(lr)
            if self.max_grad_norm is not None:
                self.optimizer = self.init_optimizer(
                    self.lr, clipnorm=self.max_grad_norm)
            else:
                self.optimizer = self.init_optimizer(self.lr)
            self._all_params_dict.update(optimizer=self.optimizer)
        else:
            if self.is_continuous:
                self.net = ACNetwork(
                    name='net',
                    representation_net=self._representation_net,
                    policy_net_type=OutputNetworkType.ACTOR_MU_LOGSTD,
                    policy_net_kwargs=dict(
                        output_shape=self.a_dim,
                        condition_sigma=condition_sigma,
                        network_settings=network_settings['actor_continuous']),
                    value_net_type=OutputNetworkType.CRITIC_VALUE,
                    value_net_kwargs=dict(
                        network_settings=network_settings['critic']))
            else:
                self.net = ACNetwork(
                    name='net',
                    representation_net=self._representation_net,
                    policy_net_type=OutputNetworkType.ACTOR_DCT,
                    policy_net_kwargs=dict(
                        output_shape=self.a_dim,
                        network_settings=network_settings['actor_discrete']),
                    value_net_type=OutputNetworkType.CRITIC_VALUE,
                    value_net_kwargs=dict(
                        network_settings=network_settings['critic']))
            self.actor_lr, self.critic_lr = map(self.init_lr,
                                                [actor_lr, critic_lr])
            if self.max_grad_norm is not None:
                self.optimizer_actor = self.init_optimizer(
                    self.actor_lr, clipnorm=self.max_grad_norm)
                self.optimizer_critic = self.init_optimizer(
                    self.critic_lr, clipnorm=self.max_grad_norm)
            else:
                self.optimizer_actor, self.optimizer_critic = map(
                    self.init_optimizer, [self.actor_lr, self.critic_lr])

            self._all_params_dict.update(
                optimizer_actor=self.optimizer_actor,
                optimizer_critic=self.optimizer_critic)

        self._worker_params_dict.update(self.net._policy_models)

        self._all_params_dict.update(self.net._all_models)

        self.initialize_data_buffer(
            store_data_type=PPO_Store_BatchExperiences,
            sample_data_type=PPO_Train_BatchExperiences)
        self._model_post_process()
Example #6
File: ppo.py Project: zhijie-ai/RLs
class PPO(On_Policy):
    '''
    Proximal Policy Optimization, https://arxiv.org/abs/1707.06347
    Emergence of Locomotion Behaviours in Rich Environments, http://arxiv.org/abs/1707.02286, DPPO
    '''
    def __init__(
            self,
            envspec,
            policy_epoch: int = 4,
            value_epoch: int = 4,
            ent_coef: float = 1.0e-2,
            vf_coef: float = 0.5,
            lr: float = 5.0e-4,
            lambda_: float = 0.95,
            epsilon: float = 0.2,
            use_duel_clip: bool = False,
            duel_epsilon: float = 0.,
            use_vclip: bool = False,
            value_epsilon: float = 0.2,
            share_net: bool = True,
            actor_lr: float = 3e-4,
            critic_lr: float = 1e-3,
            max_grad_norm: float = 0.5,
            condition_sigma: bool = False,
            kl_reverse: bool = False,
            kl_target: float = 0.02,
            kl_target_cutoff: float = 2,
            kl_target_earlystop: float = 4,
            kl_beta: List[float] = [0.7, 1.3],
            kl_alpha: float = 1.5,
            kl_coef: float = 1.0,
            extra_coef: float = 1000.0,
            use_kl_loss: bool = False,
            use_extra_loss: bool = False,
            use_early_stop: bool = False,
            network_settings: Dict = {
                'share': {
                    'continuous': {
                        'share': [32, 32],
                        'mu': [32, 32],
                        'v': [32, 32]
                    },
                    'discrete': {
                        'share': [32, 32],
                        'logits': [32, 32],
                        'v': [32, 32]
                    }
                },
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            },
            **kwargs):
        super().__init__(envspec=envspec, **kwargs)
        self.ent_coef = ent_coef
        self.policy_epoch = policy_epoch
        self.value_epoch = value_epoch
        self.lambda_ = lambda_
        assert 0.0 <= lambda_ <= 1.0, "GAE lambda should be in [0, 1]."
        self.epsilon = epsilon
        self.use_vclip = use_vclip
        self.value_epsilon = value_epsilon
        self.share_net = share_net
        self.kl_reverse = kl_reverse
        self.kl_target = kl_target
        self.kl_alpha = kl_alpha
        self.kl_coef = tf.constant(kl_coef, dtype=tf.float32)
        self.extra_coef = extra_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm

        self.use_duel_clip = use_duel_clip
        self.duel_epsilon = duel_epsilon
        if self.use_duel_clip:
            assert -self.epsilon < self.duel_epsilon < self.epsilon, "duel_epsilon should lie in the open interval (-epsilon, epsilon)."

        self.kl_cutoff = kl_target * kl_target_cutoff
        self.kl_stop = kl_target * kl_target_earlystop
        self.kl_low = kl_target * kl_beta[0]
        self.kl_high = kl_target * kl_beta[-1]

        self.use_kl_loss = use_kl_loss
        self.use_extra_loss = use_extra_loss
        self.use_early_stop = use_early_stop

        if self.share_net:
            if self.is_continuous:
                self.net = ValueNetwork(
                    name='net',
                    representation_net=self._representation_net,
                    value_net_type=OutputNetworkType.ACTOR_CRITIC_VALUE_CTS,
                    value_net_kwargs=dict(
                        output_shape=self.a_dim,
                        condition_sigma=condition_sigma,
                        network_settings=network_settings['share']
                        ['continuous']))
            else:
                self.net = ValueNetwork(
                    name='net',
                    representation_net=self._representation_net,
                    value_net_type=OutputNetworkType.ACTOR_CRITIC_VALUE_DCT,
                    value_net_kwargs=dict(
                        output_shape=self.a_dim,
                        network_settings=network_settings['share']
                        ['discrete']))
            self.lr = self.init_lr(lr)
            if self.max_grad_norm is not None:
                self.optimizer = self.init_optimizer(
                    self.lr, clipnorm=self.max_grad_norm)
            else:
                self.optimizer = self.init_optimizer(self.lr)
            self._all_params_dict.update(optimizer=self.optimizer)
        else:
            if self.is_continuous:
                self.net = ACNetwork(
                    name='net',
                    representation_net=self._representation_net,
                    policy_net_type=OutputNetworkType.ACTOR_MU_LOGSTD,
                    policy_net_kwargs=dict(
                        output_shape=self.a_dim,
                        condition_sigma=condition_sigma,
                        network_settings=network_settings['actor_continuous']),
                    value_net_type=OutputNetworkType.CRITIC_VALUE,
                    value_net_kwargs=dict(
                        network_settings=network_settings['critic']))
            else:
                self.net = ACNetwork(
                    name='net',
                    representation_net=self._representation_net,
                    policy_net_type=OutputNetworkType.ACTOR_DCT,
                    policy_net_kwargs=dict(
                        output_shape=self.a_dim,
                        network_settings=network_settings['actor_discrete']),
                    value_net_type=OutputNetworkType.CRITIC_VALUE,
                    value_net_kwargs=dict(
                        network_settings=network_settings['critic']))
            self.actor_lr, self.critic_lr = map(self.init_lr,
                                                [actor_lr, critic_lr])
            if self.max_grad_norm is not None:
                self.optimizer_actor = self.init_optimizer(
                    self.actor_lr, clipnorm=self.max_grad_norm)
                self.optimizer_critic = self.init_optimizer(
                    self.critic_lr, clipnorm=self.max_grad_norm)
            else:
                self.optimizer_actor, self.optimizer_critic = map(
                    self.init_optimizer, [self.actor_lr, self.critic_lr])

            self._all_params_dict.update(
                optimizer_actor=self.optimizer_actor,
                optimizer_critic=self.optimizer_critic)

        self._worker_params_dict.update(self.net._policy_models)

        self._all_params_dict.update(self.net._all_models)

        self.initialize_data_buffer(
            store_data_type=PPO_Store_BatchExperiences,
            sample_data_type=PPO_Train_BatchExperiences)
        self._model_post_process()

    def choose_action(self, obs, evaluation: bool = False) -> np.ndarray:
        a, value, log_prob, self.next_cell_state = self._get_action(
            obs, self.cell_state)
        a = a.numpy()
        self._value = value.numpy()
        self._log_prob = log_prob.numpy() + 1e-10
        return a

    @tf.function
    def _get_action(self, obs, cell_state):
        with tf.device(self.device):
            feat, cell_state = self._representation_net(obs,
                                                        cell_state=cell_state)
            if self.is_continuous:
                if self.share_net:
                    mu, log_std, value = self.net.value_net(feat)
                else:
                    mu, log_std = self.net.policy_net(feat)
                    value = self.net.value_net(feat)
                sample_op, _ = gaussian_clip_rsample(mu, log_std)
                log_prob = gaussian_likelihood_sum(sample_op, mu, log_std)
            else:
                if self.share_net:
                    logits, value = self.net.value_net(feat)
                else:
                    logits = self.net.policy_net(feat)
                    value = self.net.value_net(feat)
                norm_dist = tfp.distributions.Categorical(
                    logits=tf.nn.log_softmax(logits))
                sample_op = norm_dist.sample()
                log_prob = norm_dist.log_prob(sample_op)
        return sample_op, value, log_prob, cell_state

    def store_data(self, exps: BatchExperiences) -> NoReturn:
        # self._running_average()
        self.data.add(
            PPO_Store_BatchExperiences(*exps, self._value, self._log_prob))
        if self.use_rnn:
            self.data.add_cell_state(
                tuple(cs.numpy() for cs in self.cell_state))
        self.cell_state = self.next_cell_state

    @tf.function
    def _get_value(self, obs, cell_state):
        with tf.device(self.device):
            feat, cell_state = self._representation_net(obs,
                                                        cell_state=cell_state)
            output = self.net.value_net(feat)
            if self.is_continuous:
                if self.share_net:
                    _, _, value = output
                else:
                    value = output
            else:
                if self.share_net:
                    _, value = output
                else:
                    value = output
            return value, cell_state

    def calculate_statistics(self) -> NoReturn:
        init_value, self.cell_state = self._get_value(
            self.data.last_data('obs_'), cell_state=self.cell_state)
        init_value = init_value.numpy()
        self.data.cal_dc_r(self.gamma, init_value)
        self.data.cal_td_error(self.gamma, init_value)
        self.data.cal_gae_adv(self.lambda_, self.gamma, normalize=True)

    # @show_graph(name='ppo_net')
    def learn(self, **kwargs) -> NoReturn:
        self.train_step = kwargs.get('train_step')

        def _train(data, cell_state):
            early_step = 0
            if self.share_net:
                for i in range(self.policy_epoch):
                    actor_loss, critic_loss, entropy, kl = self.train_share(
                        data, cell_state, self.kl_coef)
                    if self.use_early_stop and kl > self.kl_stop:
                        early_step = i
                        break
            else:
                for i in range(self.policy_epoch):
                    actor_loss, entropy, kl = self.train_actor(
                        data, cell_state, self.kl_coef)
                    if self.use_early_stop and kl > self.kl_stop:
                        early_step = i
                        break

                for _ in range(self.value_epoch):
                    critic_loss = self.train_critic(data, cell_state)

            summaries = dict([['LOSS/actor_loss', actor_loss],
                              ['LOSS/critic_loss', critic_loss],
                              ['Statistics/kl', kl],
                              ['Statistics/entropy', entropy]])

            if self.use_early_stop:
                summaries.update(dict([['Statistics/early_step', early_step]]))

            if self.use_kl_loss:
                # ref: https://github.com/joschu/modular_rl/blob/6970cde3da265cf2a98537250fea5e0c0d9a7639/modular_rl/ppo.py#L93
                if kl > self.kl_high:
                    self.kl_coef *= self.kl_alpha
                elif kl < self.kl_low:
                    self.kl_coef /= self.kl_alpha

                summaries.update(dict([['Statistics/kl_coef', self.kl_coef]]))
            return summaries

        if self.share_net:
            summary_dict = dict(
                [['LEARNING_RATE/lr',
                  self.lr(self.train_step)]])
        else:
            summary_dict = dict(
                [['LEARNING_RATE/actor_lr',
                  self.actor_lr(self.train_step)],
                 ['LEARNING_RATE/critic_lr',
                  self.critic_lr(self.train_step)]])

        self._learn(
            function_dict={
                'calculate_statistics': self.calculate_statistics,
                'train_function': _train,
                'summary_dict': summary_dict,
                'train_data_type': PPO_Train_BatchExperiences
            })

    @tf.function
    def train_share(self, BATCH, cell_state, kl_coef):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                output, cell_state = self.net(BATCH.obs, cell_state=cell_state)
                if self.is_continuous:
                    mu, log_std, value = output
                    new_log_prob = gaussian_likelihood_sum(
                        BATCH.action, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits, value = output
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(BATCH.action * logp_all,
                                                 axis=1,
                                                 keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                ratio = tf.exp(new_log_prob - BATCH.log_prob)
                surrogate = ratio * BATCH.gae_adv
                clipped_surrogate = tf.minimum(
                    surrogate,
                    tf.clip_by_value(ratio, 1.0 - self.epsilon,
                                     1.0 + self.epsilon) * BATCH.gae_adv)
                # ref: https://github.com/thu-ml/tianshou/blob/c97aa4065ee8464bd5897bb86f1f81abd8e2cff9/tianshou/policy/modelfree/ppo.py#L159
                if self.use_duel_clip:
                    clipped_surrogate = tf.maximum(clipped_surrogate,
                                                   (1.0 + self.duel_epsilon) *
                                                   BATCH.gae_adv)
                actor_loss = -(tf.reduce_mean(clipped_surrogate) +
                               self.ent_coef * entropy)

                # ref: https://github.com/joschu/modular_rl/blob/6970cde3da265cf2a98537250fea5e0c0d9a7639/modular_rl/ppo.py#L40
                # ref: https://github.com/hill-a/stable-baselines/blob/b3f414f4f2900403107357a2206f80868af16da3/stable_baselines/ppo2/ppo2.py#L185
                if self.kl_reverse:
                    kl = .5 * tf.reduce_mean(
                        tf.square(new_log_prob - BATCH.log_prob))
                else:
                    kl = .5 * tf.reduce_mean(
                        tf.square(BATCH.log_prob - new_log_prob)
                    )  # a sample estimate for KL-divergence, easy to compute

                td_error = BATCH.discounted_reward - value
                if self.use_vclip:
                    # ref: https://github.com/llSourcell/OpenAI_Five_vs_Dota2_Explained/blob/c5def7e57aa70785c2394ea2eeb3e5f66ad59a53/train.py#L154
                    # ref: https://github.com/hill-a/stable-baselines/blob/b3f414f4f2900403107357a2206f80868af16da3/stable_baselines/ppo2/ppo2.py#L172
                    value_clip = BATCH.value + tf.clip_by_value(
                        value - BATCH.value, -self.value_epsilon,
                        self.value_epsilon)
                    td_error_clip = BATCH.discounted_reward - value_clip
                    td_square = tf.maximum(tf.square(td_error),
                                           tf.square(td_error_clip))
                else:
                    td_square = tf.square(td_error)

                if self.use_kl_loss:
                    kl_loss = kl_coef * kl
                    actor_loss += kl_loss

                if self.use_extra_loss:
                    extra_loss = self.extra_coef * tf.square(
                        tf.maximum(0., kl - self.kl_cutoff))
                    actor_loss += extra_loss
                value_loss = 0.5 * tf.reduce_mean(td_square)
                loss = actor_loss + self.vf_coef * value_loss
            loss_grads = tape.gradient(loss, self.net.trainable_variables)
            self.optimizer.apply_gradients(
                zip(loss_grads, self.net.trainable_variables))
            self.global_step.assign_add(1)
            return actor_loss, value_loss, entropy, kl

    @tf.function
    def train_actor(self, BATCH, cell_state, kl_coef):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                output, _ = self.net(BATCH.obs, cell_state=cell_state)
                if self.is_continuous:
                    mu, log_std = output
                    new_log_prob = gaussian_likelihood_sum(
                        BATCH.action, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = output
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(BATCH.action * logp_all,
                                                 axis=1,
                                                 keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                ratio = tf.exp(new_log_prob - BATCH.log_prob)
                kl = tf.reduce_mean(BATCH.log_prob - new_log_prob)
                surrogate = ratio * BATCH.gae_adv
                clipped_surrogate = tf.minimum(
                    surrogate,
                    tf.where(BATCH.gae_adv > 0,
                             (1 + self.epsilon) * BATCH.gae_adv,
                             (1 - self.epsilon) * BATCH.gae_adv))
                if self.use_duel_clip:
                    clipped_surrogate = tf.maximum(clipped_surrogate,
                                                   (1.0 + self.duel_epsilon) *
                                                   BATCH.gae_adv)

                actor_loss = -(tf.reduce_mean(clipped_surrogate) +
                               self.ent_coef * entropy)

                if self.use_kl_loss:
                    kl_loss = kl_coef * kl
                    actor_loss += kl_loss
                if self.use_extra_loss:
                    extra_loss = self.extra_coef * tf.square(
                        tf.maximum(0., kl - self.kl_cutoff))
                    actor_loss += extra_loss

            actor_grads = tape.gradient(actor_loss,
                                        self.net.actor_trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.net.actor_trainable_variables))
            self.global_step.assign_add(1)
            return actor_loss, entropy, kl

    @tf.function
    def train_critic(self, BATCH, cell_state):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, _ = self._representation_net(BATCH.obs,
                                                   cell_state=cell_state)
                value = self.net.value_net(feat)

                td_error = BATCH.discounted_reward - value
                if self.use_vclip:
                    value_clip = BATCH.value + tf.clip_by_value(
                        value - BATCH.value, -self.value_epsilon,
                        self.value_epsilon)
                    td_error_clip = BATCH.discounted_reward - value_clip
                    td_square = tf.maximum(tf.square(td_error),
                                           tf.square(td_error_clip))
                else:
                    td_square = tf.square(td_error)

                value_loss = 0.5 * tf.reduce_mean(td_square)
            critic_grads = tape.gradient(value_loss,
                                         self.net.critic_trainable_variables)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.net.critic_trainable_variables))
            return value_loss
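
As a compact reference for what train_share optimizes, the clipped-surrogate actor objective and (optionally clipped) value regression can be written out in plain NumPy. This is only a sketch of the math with hypothetical argument names; the entropy bonus, KL penalty and extra loss terms from the TensorFlow code are omitted here:

import numpy as np

def ppo_losses(new_logp, old_logp, adv, v, v_old, ret,
               epsilon=0.2, duel_epsilon=0.0, value_epsilon=0.2,
               use_duel_clip=False, use_vclip=False):
    ratio = np.exp(new_logp - old_logp)
    surrogate = ratio * adv
    clipped = np.minimum(surrogate,
                         np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv)
    if use_duel_clip:
        # dual clip: an extra lower bound on the surrogate, matching train_share
        clipped = np.maximum(clipped, (1.0 + duel_epsilon) * adv)
    actor_loss = -clipped.mean()

    td = ret - v
    if use_vclip:
        # clip the value update around the old value estimate
        v_clip = v_old + np.clip(v - v_old, -value_epsilon, value_epsilon)
        td_sq = np.maximum(td ** 2, (ret - v_clip) ** 2)
    else:
        td_sq = td ** 2
    value_loss = 0.5 * td_sq.mean()
    return actor_loss, value_loss
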
Example #7
class AC(Off_Policy):
    # off-policy actor-critic
    def __init__(
            self,
            envspec,
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            condition_sigma: bool = False,
            network_settings={
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            },
            **kwargs):
        super().__init__(envspec=envspec, **kwargs)

        if self.is_continuous:
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_MU_LOGSTD,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    condition_sigma=condition_sigma,
                    network_settings=network_settings['actor_continuous']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(
                    action_dim=self.a_dim,
                    network_settings=network_settings['critic']))
        else:
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_discrete']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(
                    action_dim=self.a_dim,
                    network_settings=network_settings['critic']))
        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self._worker_params_dict.update(self.net._policy_models)

        self._all_params_dict.update(self.net._all_models)
        self._all_params_dict.update(optimizer_actor=self.optimizer_actor,
                                     optimizer_critic=self.optimizer_critic)
        self._model_post_process()

    def choose_action(self, s, visual_s, evaluation=False):
        a, _lp, self.cell_state = self._get_action(s, visual_s,
                                                   self.cell_state)
        a = a.numpy()
        self._log_prob = _lp.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            output, cell_state = self.net(s, visual_s, cell_state=cell_state)
            if self.is_continuous:
                mu, log_std = output
                sample_op, _ = gaussian_clip_rsample(mu, log_std)
                log_prob = gaussian_likelihood_sum(sample_op, mu, log_std)
            else:
                logits = output
                norm_dist = tfp.distributions.Categorical(
                    logits=tf.nn.log_softmax(logits))
                sample_op = norm_dist.sample()
                log_prob = norm_dist.log_prob(sample_op)
        return sample_op, log_prob, cell_state

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(
            a, np.ndarray), "store_data requires action to be an np.ndarray"
        assert isinstance(
            r, np.ndarray), "store_data requires reward to be an np.ndarray"
        assert isinstance(
            done, np.ndarray), "store_data requires done to be an np.ndarray"
        self._running_average(s)
        old_log_prob = self._log_prob
        self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_,
                      done[:, np.newaxis], old_log_prob)

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(
            a, np.ndarray), "no_op_store requires action to be an np.ndarray"
        assert isinstance(
            r, np.ndarray), "no_op_store requires reward to be an np.ndarray"
        assert isinstance(
            done, np.ndarray), "no_op_store requires done to be an np.ndarray"
        self._running_average(s)
        old_log_prob = np.ones_like(r)
        self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_,
                      done[:, np.newaxis], old_log_prob[:, np.newaxis])

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'summary_dict':
                    dict([[
                        'LEARNING_RATE/actor_lr',
                        self.actor_lr(self.train_step)
                    ],
                          [
                              'LEARNING_RATE/critic_lr',
                              self.critic_lr(self.train_step)
                          ]]),
                    'sample_data_list': [
                        's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done',
                        'old_log_prob'
                    ],
                    'train_data_list':
                    ['ss', 'vvss', 'a', 'r', 'done', 'old_log_prob']
                })

    @tf.function(experimental_relax_shapes=True)
    def _train(self, memories, isw, cell_state):
        ss, vvss, a, r, done, old_log_prob = memories
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                (feat,
                 feat_), _ = self._representation_net(ss,
                                                      vvss,
                                                      cell_state=cell_state,
                                                      need_split=True)
                if self.is_continuous:
                    mu, log_std = self.net.policy_net(feat)
                    log_prob = gaussian_likelihood_sum(a, mu, log_std)
                    entropy = gaussian_entropy(log_std)

                    next_mu, _ = self.net.policy_net(feat_)
                    max_q_next = tf.stop_gradient(
                        self.net.value_net(feat_, next_mu))
                else:
                    logits = self.net.policy_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    log_prob = tf.reduce_sum(tf.multiply(logp_all, a),
                                             axis=1,
                                             keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))

                    logits = self.net.policy_net(feat_)
                    max_a = tf.argmax(logits, axis=1)
                    max_a_one_hot = tf.one_hot(max_a, self.a_dim)
                    max_q_next = tf.stop_gradient(
                        self.net.value_net(feat_, max_a_one_hot))
                q = self.net.value_net(feat, a)
                ratio = tf.stop_gradient(tf.exp(log_prob - old_log_prob))
                q_value = tf.stop_gradient(q)
                td_error = q - (r + self.gamma * (1 - done) * max_q_next)
                critic_loss = tf.reduce_mean(tf.square(td_error) * isw)
                actor_loss = -tf.reduce_mean(ratio * log_prob * q_value)
            critic_grads = tape.gradient(critic_loss,
                                         self.net.critic_trainable_variables)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.net.critic_trainable_variables))
            actor_grads = tape.gradient(actor_loss,
                                        self.net.actor_trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.net.actor_trainable_variables))
            self.global_step.assign_add(1)
            return td_error, dict([['LOSS/actor_loss', actor_loss],
                                   ['LOSS/critic_loss', critic_loss],
                                   ['Statistics/q_max',
                                    tf.reduce_max(q)],
                                   ['Statistics/q_min',
                                    tf.reduce_min(q)],
                                   ['Statistics/q_mean',
                                    tf.reduce_mean(q)],
                                   ['Statistics/ratio',
                                    tf.reduce_mean(ratio)],
                                   ['Statistics/entropy', entropy]])
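
The actor update in _train above weights the policy gradient by an off-policy importance ratio and the detached Q-value. A minimal NumPy restatement of that loss (a sketch of the math only; in the TensorFlow code both ratio and q are wrapped in stop_gradient, so only log_prob carries gradient back into the policy):

import numpy as np

def off_policy_ac_actor_loss(log_prob, old_log_prob, q):
    # ratio and q act as constants; the gradient flows through log_prob only
    ratio = np.exp(log_prob - old_log_prob)
    return -np.mean(ratio * log_prob * q)
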
Example #8
File: a2c.py Project: zhijie-ai/RLs
class A2C(On_Policy):
    def __init__(
            self,
            envspec,
            epoch=5,
            beta=1.0e-3,
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            condition_sigma: bool = False,
            network_settings={
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            },
            **kwargs):
        super().__init__(envspec=envspec, **kwargs)
        self.beta = beta
        self.epoch = epoch

        if self.is_continuous:
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_MU_LOGSTD,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    condition_sigma=condition_sigma,
                    network_settings=network_settings['actor_continuous']),
                value_net_type=OutputNetworkType.CRITIC_VALUE,
                value_net_kwargs=dict(
                    network_settings=network_settings['critic']))
        else:
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_discrete']),
                value_net_type=OutputNetworkType.CRITIC_VALUE,
                value_net_kwargs=dict(
                    network_settings=network_settings['critic']))

        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self.initialize_data_buffer(
            sample_data_type=A2C_Train_BatchExperiences)

        self._worker_params_dict.update(self.net._policy_models)

        self._all_params_dict.update(self.net._all_models)
        self._all_params_dict.update(optimizer_actor=self.optimizer_actor,
                                     optimizer_critic=self.optimizer_critic)
        self._model_post_process()

    def choose_action(self, obs, evaluation=False):
        a, self.next_cell_state = self._get_action(obs, self.cell_state)
        a = a.numpy()
        return a

    @tf.function
    def _get_action(self, obs, cell_state):
        with tf.device(self.device):
            output, cell_state = self.net(obs, cell_state=cell_state)
            if self.is_continuous:
                mu, log_std = output
                sample_op, _ = gaussian_clip_rsample(mu, log_std)
            else:
                logits = output
                norm_dist = tfp.distributions.Categorical(
                    logits=tf.nn.log_softmax(logits))
                sample_op = norm_dist.sample()
        return sample_op, cell_state

    @tf.function
    def _get_value(self, obs, cell_state):
        with tf.device(self.device):
            feat, cell_state = self._representation_net(obs,
                                                        cell_state=cell_state)
            value = self.net.value_net(feat)
            return value, cell_state

    def calculate_statistics(self):
        init_value, self.cell_state = self._get_value(
            self.data.last_data('obs_'), cell_state=self.cell_state)
        self.data.cal_dc_r(self.gamma, init_value.numpy())

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(data, cell_state):
            for _ in range(self.epoch):
                actor_loss, critic_loss, entropy = self.train(data, cell_state)

            summaries = dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/entropy', entropy],
            ])
            return summaries

        self._learn(
            function_dict={
                'calculate_statistics':
                self.calculate_statistics,
                'train_function':
                _train,
                'summary_dict':
                dict([[
                    'LEARNING_RATE/actor_lr',
                    self.actor_lr(self.train_step)
                ], [
                    'LEARNING_RATE/critic_lr',
                    self.critic_lr(self.train_step)
                ]])
            })

    @tf.function
    def train(self, BATCH, cell_state):
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat, _ = self._representation_net(BATCH.obs,
                                                   cell_state=cell_state)
                if self.is_continuous:
                    mu, log_std = self.net.policy_net(feat)
                    log_act_prob = gaussian_likelihood_sum(
                        BATCH.action, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.net.policy_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    log_act_prob = tf.reduce_sum(BATCH.action * logp_all,
                                                 axis=1,
                                                 keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                v = self.net.value_net(feat)
                advantage = tf.stop_gradient(BATCH.discounted_reward - v)
                td_error = BATCH.discounted_reward - v
                critic_loss = tf.reduce_mean(tf.square(td_error))
                actor_loss = -(tf.reduce_mean(log_act_prob * advantage) +
                               self.beta * entropy)
            critic_grads = tape.gradient(critic_loss,
                                         self.net.critic_trainable_variables)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.net.critic_trainable_variables))
            actor_grads = tape.gradient(actor_loss,
                                        self.net.actor_trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.net.actor_trainable_variables))
            self.global_step.assign_add(1)
            return actor_loss, critic_loss, entropy
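
For reference, the losses computed in train reduce to the standard advantage policy gradient with an entropy bonus plus a value regression term. A minimal NumPy restatement (a sketch of the math with hypothetical argument names; the advantage is detached from the actor gradient in the TensorFlow code):

import numpy as np

def a2c_losses(log_act_prob, v, discounted_reward, entropy, beta=1.0e-3):
    advantage = discounted_reward - v      # treated as a constant for the actor
    critic_loss = np.mean((discounted_reward - v) ** 2)
    actor_loss = -(np.mean(log_act_prob * advantage) + beta * entropy)
    return actor_loss, critic_loss
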
Example #9
class DPG(Off_Policy):
    '''
    Deterministic Policy Gradient, https://hal.inria.fr/file/index/docid/938992/filename/dpg-icml2014.pdf
    '''

    # off-policy DPG

    def __init__(self,
                 envspec,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 use_target_action_noise=False,
                 gaussian_noise_sigma=0.2,
                 gaussian_noise_bound=0.2,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(envspec=envspec, **kwargs)
        self.discrete_tau = discrete_tau
        self.use_target_action_noise = use_target_action_noise
        self.gaussian_noise_sigma = gaussian_noise_sigma
        self.gaussian_noise_bound = gaussian_noise_bound

        if self.is_continuous:
            self.target_noised_action = ClippedNormalNoisedAction(
                sigma=self.gaussian_noise_sigma,
                noise_bound=self.gaussian_noise_bound)
            self.noised_action = OrnsteinUhlenbeckNoisedAction(sigma=0.2)
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DPG,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_continuous']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      network_settings=network_settings['q']))
        else:
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_discrete']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      network_settings=network_settings['q']))

        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self._worker_params_dict.update(self.net._policy_models)

        self._all_params_dict.update(self.net._all_models)
        self._all_params_dict.update(optimizer_actor=self.optimizer_actor,
                                     optimizer_critic=self.optimizer_critic)
        self._model_post_process()

    def reset(self):
        super().reset()
        if self.is_continuous:
            self.noised_action.reset()

    def choose_action(self, obs, evaluation=False):
        mu, pi, self.cell_state = self._get_action(obs, self.cell_state)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, obs, cell_state):
        with tf.device(self.device):
            output, cell_state = self.net(obs, cell_state=cell_state)
            if self.is_continuous:
                mu = output
                pi = self.noised_action(mu)
            else:
                logits = output
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(
                    logits=tf.nn.log_softmax(logits))
                pi = cate_dist.sample()
            return mu, pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'summary_dict':
                    dict([[
                        'LEARNING_RATE/actor_lr',
                        self.actor_lr(self.train_step)
                    ],
                          [
                              'LEARNING_RATE/critic_lr',
                              self.critic_lr(self.train_step)
                          ]]),
                    'use_stack':
                    True
                })

    @tf.function
    def _train(self, BATCH, isw, cell_state):
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                (feat,
                 feat_), _ = self._representation_net(BATCH.obs,
                                                      cell_state=cell_state,
                                                      need_split=True)
                if self.is_continuous:
                    # only self.net is defined in this class (no separate
                    # target network), so the target action comes from the
                    # online policy net
                    action_target = self.net.policy_net(feat_)
                    if self.use_target_action_noise:
                        action_target = self.target_noised_action(
                            action_target)
                    mu = self.net.policy_net(feat)
                else:
                    target_logits = self.net.policy_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(
                        logits=tf.nn.log_softmax(target_logits))
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)
                    action_target = tf.one_hot(target_pi,
                                               self.a_dim,
                                               dtype=tf.float32)

                    logits = self.net.policy_net(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1),
                                                  self.a_dim,
                                                  dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q_target = self.net.value_net(feat_, action_target)
                dc_r = tf.stop_gradient(BATCH.reward + self.gamma * q_target *
                                        (1 - BATCH.done))
                q = self.net.value_net(feat, BATCH.action)
                td_error = dc_r - q
                q_loss = 0.5 * tf.reduce_mean(tf.square(td_error) * isw)
                q_actor = self.net.value_net(feat, mu)
                actor_loss = -tf.reduce_mean(q_actor)
            q_grads = tape.gradient(q_loss,
                                    self.net.critic_trainable_variables)
            self.optimizer_critic.apply_gradients(
                zip(q_grads, self.net.critic_trainable_variables))
            actor_grads = tape.gradient(actor_loss,
                                        self.net.actor_trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.net.actor_trainable_variables))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', q_loss],
                ['Statistics/q_min', tf.reduce_min(q)],
                ['Statistics/q_mean', tf.reduce_mean(q)],
                ['Statistics/q_max', tf.reduce_max(q)]
            ])
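The discrete branch above relies on the straight-through one-hot trick, which is easy to verify in isolation. A minimal sketch, assuming only TensorFlow 2.x and none of the classes from the example:

import tensorflow as tf

# Minimal sketch (not part of the example above): the straight-through one-hot
# trick. The forward value equals the greedy one-hot action, while gradients
# flow through the softmax probabilities.
logits = tf.Variable([[1.0, 2.0, 0.5]])
with tf.GradientTape() as tape:
    _pi = tf.nn.softmax(logits)
    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1), 3, dtype=tf.float32)
    mu = tf.stop_gradient(_pi_true_one_hot - _pi) + _pi
    loss = tf.reduce_sum(mu * tf.constant([[0.0, 1.0, 0.0]]))
print(mu.numpy())                    # ~[[0., 1., 0.]]: hard one-hot in the forward pass
print(tape.gradient(loss, logits))   # non-zero: gradient of the softmax, not of argmax
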
Example #10
    def __init__(
            self,
            envspec,
            beta=1.0e-3,
            lr=5.0e-4,
            delta=0.01,
            lambda_=0.95,
            cg_iters=10,
            train_v_iters=10,
            damping_coeff=0.1,
            backtrack_iters=10,
            backtrack_coeff=0.8,
            epsilon=0.2,
            critic_lr=1e-3,
            condition_sigma: bool = False,
            network_settings={
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            },
            **kwargs):
        super().__init__(envspec=envspec, **kwargs)
        self.beta = beta
        self.delta = delta  # trust-region size: maximum KL divergence per policy update
        self.lambda_ = lambda_  # GAE(lambda) coefficient; see the sketch after this example
        self.epsilon = epsilon
        self.cg_iters = cg_iters
        self.damping_coeff = damping_coeff
        self.backtrack_iters = backtrack_iters
        self.backtrack_coeff = backtrack_coeff
        self.train_v_iters = train_v_iters

        if self.is_continuous:
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_MU_LOGSTD,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    condition_sigma=condition_sigma,
                    network_settings=network_settings['actor_continuous']),
                value_net_type=OutputNetworkType.CRITIC_VALUE,
                value_net_kwargs=dict(
                    network_settings=network_settings['critic']))
        else:
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_discrete']),
                value_net_type=OutputNetworkType.CRITIC_VALUE,
                value_net_kwargs=dict(
                    network_settings=network_settings['critic']))

        self.critic_lr = self.init_lr(critic_lr)
        self.optimizer_critic = self.init_optimizer(self.critic_lr)

        if self.is_continuous:
            data_name_list = [
                's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value',
                'log_prob', 'old_mu', 'old_log_std'
            ]
        else:
            data_name_list = [
                's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value',
                'log_prob', 'old_logp_all'
            ]
        self.initialize_data_buffer(data_name_list=data_name_list)

        self._worker_params_dict.update(self.net._policy_models)

        self._all_params_dict.update(self.net._all_models)
        self._all_params_dict.update(optimizer_critic=self.optimizer_critic)
        self._model_post_process()
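The lambda_ stored above, together with the inherited gamma, parameterizes generalized advantage estimation; the rollout buffer computes it later via cal_dc_r / cal_gae_adv (see calculate_statistics in the next example). A minimal NumPy sketch of the GAE(lambda) recursion those two hyperparameters feed; the function below is illustrative, not the buffer's API:

import numpy as np

# Illustrative sketch (not the RLs buffer): the GAE(lambda) recursion.
#   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
#   A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
def gae_advantages(rewards, values, next_value, dones, gamma=0.99, lambda_=0.95):
    advantages = np.zeros_like(rewards)
    last_adv = 0.0
    values = np.append(values, next_value)
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] * (1 - dones[t]) - values[t]
        last_adv = delta + gamma * lambda_ * (1 - dones[t]) * last_adv
        advantages[t] = last_adv
    return advantages

print(gae_advantages(rewards=np.ones(3), values=np.full(3, 0.5),
                     next_value=0.5, dones=np.zeros(3)))
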
Example #11
class TRPO(On_Policy):
    '''
    Trust Region Policy Optimization, https://arxiv.org/abs/1502.05477
    '''
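    # In brief (following the paper above): maximize the surrogate objective
    #     E[ pi_theta(a|s) / pi_old(a|s) * A(s, a) ]
    # subject to E[ KL(pi_old, pi_theta) ] <= delta. The methods below obtain the
    # update direction x ~= H^{-1} g with conjugate gradient (`cg`, driven by the
    # Hessian-vector product `Hx`), scale it by sqrt(2 * delta / x^T H x), and
    # shrink the step by `backtrack_coeff` over `backtrack_iters` backtracking
    # iterations (`learn`); the critic is fit separately in `train_critic`.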
    def __init__(
            self,
            envspec,
            beta=1.0e-3,
            lr=5.0e-4,
            delta=0.01,
            lambda_=0.95,
            cg_iters=10,
            train_v_iters=10,
            damping_coeff=0.1,
            backtrack_iters=10,
            backtrack_coeff=0.8,
            epsilon=0.2,
            critic_lr=1e-3,
            condition_sigma: bool = False,
            network_settings={
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            },
            **kwargs):
        super().__init__(envspec=envspec, **kwargs)
        self.beta = beta
        self.delta = delta
        self.lambda_ = lambda_
        self.epsilon = epsilon
        self.cg_iters = cg_iters
        self.damping_coeff = damping_coeff
        self.backtrack_iters = backtrack_iters
        self.backtrack_coeff = backtrack_coeff
        self.train_v_iters = train_v_iters

        if self.is_continuous:
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_MU_LOGSTD,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    condition_sigma=condition_sigma,
                    network_settings=network_settings['actor_continuous']),
                value_net_type=OutputNetworkType.CRITIC_VALUE,
                value_net_kwargs=dict(
                    network_settings=network_settings['critic']))
        else:
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_discrete']),
                value_net_type=OutputNetworkType.CRITIC_VALUE,
                value_net_kwargs=dict(
                    network_settings=network_settings['critic']))

        self.critic_lr = self.init_lr(critic_lr)
        self.optimizer_critic = self.init_optimizer(self.critic_lr)

        if self.is_continuous:
            data_name_list = [
                's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value',
                'log_prob', 'old_mu', 'old_log_std'
            ]
        else:
            data_name_list = [
                's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value',
                'log_prob', 'old_logp_all'
            ]
        self.initialize_data_buffer(data_name_list=data_name_list)

        self._worker_params_dict.update(self.net._policy_models)

        self._all_params_dict.update(self.net._all_models)
        self._all_params_dict.update(optimizer_critic=self.optimizer_critic)
        self._model_post_process()

    def choose_action(self, s, visual_s, evaluation=False):
        a, _v, _lp, _morlpa, self.next_cell_state = self._get_action(
            s, visual_s, self.cell_state)
        a = a.numpy()
        # Cache the value estimate, log-prob and old policy parameters so that
        # store_data() can write them into the buffer with the transition.
        self._value = np.squeeze(_v.numpy())
        self._log_prob = np.squeeze(_lp.numpy()) + 1e-10
        if self.is_continuous:
            self._mu = _morlpa[0].numpy()
            self._log_std = _morlpa[1].numpy()
        else:
            self._logp_all = _morlpa.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self._representation_net(s,
                                                        visual_s,
                                                        cell_state=cell_state)
            value = self.net.value_net(feat)
            output = self.net.policy_net(feat)
            if self.is_continuous:
                mu, log_std = output
                sample_op, _ = gaussian_clip_rsample(mu, log_std)
                log_prob = gaussian_likelihood_sum(sample_op, mu, log_std)
                return sample_op, value, log_prob, (mu, log_std), cell_state
            else:
                logits = output
                logp_all = tf.nn.log_softmax(logits)
                norm_dist = tfp.distributions.Categorical(logits=logp_all)
                sample_op = norm_dist.sample()
                log_prob = norm_dist.log_prob(sample_op)
                return sample_op, value, log_prob, logp_all, cell_state

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(
            a, np.ndarray), "store_data requires action to be an np.ndarray"
        assert isinstance(
            r, np.ndarray), "store_data requires reward to be an np.ndarray"
        assert isinstance(
            done, np.ndarray), "store_data requires done to be an np.ndarray"
        self._running_average(s)
        if self.is_continuous:
            data = (s, visual_s, a, r, s_, visual_s_, done, self._value,
                    self._log_prob, self._mu, self._log_std)
        else:
            data = (s, visual_s, a, r, s_, visual_s_, done, self._value,
                    self._log_prob, self._logp_all)
        if self.use_rnn:
            data += tuple(cs.numpy() for cs in self.cell_state)
        self.data.add(*data)
        self.cell_state = self.next_cell_state

    @tf.function
    def _get_value(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self._representation_net(s,
                                                        visual_s,
                                                        cell_state=cell_state)
            value = self.net.value_net(feat)
            return value, cell_state

    def calculate_statistics(self):
        # Bootstrap from the value of the last state, then compute discounted
        # returns, TD errors and GAE(lambda) advantages for the stored rollout.
        init_value, self.cell_state = self._get_value(
            self.data.last_s(),
            self.data.last_visual_s(),
            cell_state=self.cell_state)
        init_value = np.squeeze(init_value.numpy())
        self.data.cal_dc_r(self.gamma, init_value)
        self.data.cal_td_error(self.gamma, init_value)
        self.data.cal_gae_adv(self.lambda_, self.gamma)

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(data):
            if self.is_continuous:
                s, visual_s, a, dc_r, old_log_prob, advantage, old_mu, old_log_std, cell_state = data
                Hx_args = (s, visual_s, old_mu, old_log_std, cell_state)
            else:
                s, visual_s, a, dc_r, old_log_prob, advantage, old_logp_all, cell_state = data
                Hx_args = (s, visual_s, old_logp_all, cell_state)
            actor_loss, entropy, gradients = self.train_actor(
                (s, visual_s, a, old_log_prob, advantage, cell_state))

            # Solve H x = g with conjugate gradient, take a step of size
            # sqrt(2 * delta / x^T H x), and scale it down by backtrack_coeff
            # on each backtracking iteration.
            x = self.cg(self.Hx, gradients.numpy(), Hx_args)
            x = tf.convert_to_tensor(x)
            alpha = np.sqrt(2 * self.delta /
                            (np.dot(x, self.Hx(x, *Hx_args)) + 1e-8))
            for i in range(self.backtrack_iters):
                assign_params_from_flat(alpha * x * (self.backtrack_coeff**i),
                                        self.net.actor_trainable_variables)

            # Fit the value function for train_v_iters epochs on the discounted returns.
            for _ in range(self.train_v_iters):
                critic_loss = self.train_critic(
                    (s, visual_s, dc_r, cell_state))

            summaries = dict([['LOSS/actor_loss', actor_loss],
                              ['LOSS/critic_loss', critic_loss],
                              ['Statistics/entropy', entropy]])
            return summaries

        if self.is_continuous:
            train_data_list = [
                's', 'visual_s', 'a', 'discounted_reward', 'log_prob',
                'gae_adv', 'old_mu', 'old_log_std'
            ]
        else:
            train_data_list = [
                's', 'visual_s', 'a', 'discounted_reward', 'log_prob',
                'gae_adv', 'old_logp_all'
            ]

        self._learn(
            function_dict={
                'calculate_statistics': self.calculate_statistics,
                'train_function': _train,
                'train_data_list': train_data_list,
                'summary_dict': dict([
                    ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)]
                ])
            })

    @tf.function(experimental_relax_shapes=True)
    def train_actor(self, memories):
        s, visual_s, a, old_log_prob, advantage, cell_state = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                output, _ = self.net(s, visual_s, cell_state=cell_state)
                if self.is_continuous:
                    mu, log_std = output
                    new_log_prob = gaussian_likelihood_sum(a, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = output
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(a * logp_all,
                                                 axis=1,
                                                 keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                # Surrogate objective E[ratio * advantage], negated so that
                # descending on the loss ascends the objective.
                ratio = tf.exp(new_log_prob - old_log_prob)
                actor_loss = -tf.reduce_mean(ratio * advantage)
            actor_grads = tape.gradient(actor_loss,
                                        self.net.actor_trainable_variables)
            # Flatten the per-variable gradients into a single vector for cg()/Hx().
            gradients = flat_concat(actor_grads)
            self.global_step.assign_add(1)
            return actor_loss, entropy, gradients

    @tf.function(experimental_relax_shapes=True)
    def Hx(self, x, *args):
        # Hessian-vector product H x, where H is the Hessian (w.r.t. the actor
        # parameters) of the mean KL between the old, fixed policy and the
        # current policy, obtained by double backpropagation:
        #   H x = d/d theta [ (d KL / d theta) . x ].
        if self.is_continuous:
            s, visual_s, old_mu, old_log_std, cell_state = args
        else:
            s, visual_s, old_logp_all, cell_state = args
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                output, _ = self.net(s, visual_s, cell_state=cell_state)
                if self.is_continuous:
                    mu, log_std = output
                    # Mean KL divergence between the current Gaussian policy and
                    # the fixed old policy (old_mu, old_log_std).
                    var0, var1 = tf.exp(2 * log_std), tf.exp(2 * old_log_std)
                    pre_sum = 0.5 * (((old_mu - mu)**2 + var0) /
                                     (var1 + 1e-8) - 1) + old_log_std - log_std
                    all_kls = tf.reduce_sum(pre_sum, axis=1)
                    kl = tf.reduce_mean(all_kls)
                else:
                    logits = output
                    logp_all = tf.nn.log_softmax(logits)
                    all_kls = tf.reduce_sum(tf.exp(old_logp_all) *
                                            (old_logp_all - logp_all),
                                            axis=1)
                    kl = tf.reduce_mean(all_kls)

                g = flat_concat(
                    tape.gradient(kl, self.net.actor_trainable_variables))
                _g = tf.reduce_sum(g * x)
            hvp = flat_concat(
                tape.gradient(_g, self.net.actor_trainable_variables))
            if self.damping_coeff > 0:
                hvp += self.damping_coeff * x
            return hvp

    @tf.function(experimental_relax_shapes=True)
    def train_critic(self, memories):
        s, visual_s, dc_r, cell_state = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, _ = self._representation_net(s,
                                                   visual_s,
                                                   cell_state=cell_state)
                value = self.net.value_net(feat)
                td_error = dc_r - value
                value_loss = tf.reduce_mean(tf.square(td_error))
            critic_grads = tape.gradient(value_loss,
                                         self.net.critic_trainable_variables)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.net.critic_trainable_variables))
            return value_loss

    def cg(self, Ax, b, args):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """
        x = np.zeros_like(b)
        r = b.copy()  # Note: should be 'b - Ax(x)', but for x = 0, Ax(x) = 0. Change if doing warm start.
        p = r.copy()
        r_dot_old = np.dot(r, r)
        for _ in range(self.cg_iters):
            z = Ax(tf.convert_to_tensor(p), *args)
            alpha = r_dot_old / (np.dot(p, z) + 1e-8)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x
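
The cg method above works with any symmetric positive-definite matrix-vector product, not just Hx. A standalone sketch, assuming only NumPy, that runs the same loop against a small explicit system as a sanity check:

import numpy as np

# Standalone sketch (not part of the class above): the same CG loop as TRPO.cg,
# checked against a small symmetric positive-definite system A x = b.
def conjugate_gradient(Ax, b, iters=10):
    x = np.zeros_like(b)
    r = b.copy()          # residual b - Ax(x), with x = 0
    p = r.copy()
    r_dot_old = np.dot(r, r)
    for _ in range(iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + 1e-8)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
x = conjugate_gradient(lambda v: A @ v, b)
print(np.allclose(A @ x, b))   # True: matches np.linalg.solve(A, b)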