Example 1
class MATD3(Policy):
    def __init__(self,
                 s_dim,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 n=1,
                 i=0,
                 hidden_units={
                     'actor': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        assert is_continuous, 'MATD3 only supports continuous action spaces'
        raise Exception('The MA-series algorithms have known issues and have not been fixed yet')
        super().__init__(s_dim=s_dim,
                         visual_sources=0,
                         visual_resolution=0,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.n = n
        self.i = i
        self.ployak = ployak

        # self.action_noise = rls.NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
        self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))

        def _actor_net():
            return rls.actor_dpg(self.s_dim, 0, self.a_dim,
                                 hidden_units['actor'])

        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()

        def _q_net():
            return rls.critic_q_one((self.s_dim) * self.n, 0,
                                    (self.a_dim) * self.n, hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.update_target_net_weights(
            self.actor_target_net.weights + self.critic_target_net.weights,
            self.actor_net.weights + self.critic_net.weights)
        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self.model_recorder(
            dict(actor=self.actor_net,
                 critic_net=self.critic_net,
                 optimizer_critic=self.optimizer_critic,
                 optimizer_actor=self.optimizer_actor))

        self.recorder.logger.info(self.action_noise)

    def show_logo(self):
        self.recorder.logger.info('''
  xxxx    xxx         xx         xxxxxxxxx      xxxxxxx          xxxxx     
   xxx    xx         xxx         xx  x  xx        x  xxx         xx xx     
    xxx  xxx         xxx         xx  x  xx        x   xx         xx xx     
    xxx  xxx         x xx            x            x   xx           xxx     
    xxxx x x        xx xx            x            x   xxx         xxxx     
    x xxxx x        xxxxxx           x            x   xx            xxx    
    x xxx  x       xx   xx           x            x   xx         xx  xx    
    x  xx  x       xx   xx           x            x  xxx         xx xxx    
  xxxx xxxxxx     xxx  xxxxx       xxxxx        xxxxxxx          xxxxx 
        ''')

    def choose_action(self, s, evaluation=False):
        return self._get_action(s, evaluation).numpy()

    def get_target_action(self, s):
        return self._get_target_action(s).numpy()

    @tf.function
    def _get_action(self, vector_input, evaluation):
        vector_input = self.cast(vector_input)
        with tf.device(self.device):
            mu = self.actor_net(vector_input)
            if evaluation:
                return mu
            else:
                return tf.clip_by_value(mu + self.action_noise(), -1, 1)

    @tf.function
    def _get_target_action(self, vector_input):
        vector_input = self.cast(vector_input)
        with tf.device(self.device):
            target_mu = self.actor_target_net(vector_input)
        return tf.clip_by_value(target_mu + self.action_noise(), -1, 1)

    def learn(self, episode, ap, al, ss, ss_, aa, aa_, s, r):
        ap, al, ss, ss_, aa, aa_, s, r = map(self.data_convert,
                                             (ap, al, ss, ss_, aa, aa_, s, r))
        summaries = self.train(ap, al, ss, ss_, aa, aa_, s, r)
        self.update_target_net_weights(
            self.actor_target_net.weights + self.critic_target_net.weights,
            self.actor_net.weights + self.critic_net.weights, self.ployak)
        summaries.update(
            dict([['LEARNING_RATE/actor_lr',
                   self.actor_lr(self.train_step)],
                  ['LEARNING_RATE/critic_lr',
                   self.critic_lr(self.train_step)]]))
        self.write_training_summaries(self.global_step, summaries)

    @tf.function(experimental_relax_shapes=True)
    def train(self, q_actor_a_previous, q_actor_a_later, ss, ss_, aa, aa_, s,
              r):
        with tf.device(self.device):
            for _ in range(2):
                with tf.GradientTape() as tape:
                    q1, q2 = self.critic_net(ss, aa)
                    q_target = self.critic_target_net.get_min(ss_, aa_)
                    dc_r = tf.stop_gradient(r + self.gamma * q_target)
                    td_error1 = q1 - dc_r
                    td_error2 = q2 - dc_r
                    q1_loss = tf.reduce_mean(tf.square(td_error1))
                    q2_loss = tf.reduce_mean(tf.square(td_error2))
                    critic_loss = 0.5 * (q1_loss + q2_loss)
                critic_grads = tape.gradient(
                    critic_loss, self.critic_net.trainable_variables)
                self.optimizer_critic.apply_gradients(
                    zip(critic_grads, self.critic_net.trainable_variables))
            with tf.GradientTape() as tape:
                mu = self.actor_net(s)
                mumu = tf.concat((q_actor_a_previous, mu, q_actor_a_later),
                                 axis=1)
                q1_actor = self.critic_net.Q1(ss, mumu)
                actor_loss = -tf.reduce_mean(q1_actor)
            actor_grads = tape.gradient(actor_loss,
                                        self.actor_net.trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_net.trainable_variables))
            self.global_step.assign_add(1)
            return dict([['LOSS/actor_loss', actor_loss],
                         ['LOSS/q1_loss', q1_loss], ['LOSS/q2_loss', q2_loss],
                         ['LOSS/critic_loss', critic_loss]])

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, q_actor_a_previous, q_actor_a_later, ss, ss_,
                         aa, aa_, s, r):
        with tf.device(self.device):
            for _ in range(2):
                with tf.GradientTape(persistent=True) as tape:
                    mu = self.actor_net(s)
                    mumu = tf.concat((q_actor_a_previous, mu, q_actor_a_later),
                                     axis=1)
                    q1, q2 = self.critic_net(ss, aa)
                    q_target = self.critic_target_net.get_min(ss_, aa_)
                    q1_actor = self.critic_net.Q1(ss, mumu)
                    dc_r = tf.stop_gradient(r + self.gamma * q_target)
                    td_error1 = q1 - dc_r
                    td_error2 = q2 - dc_r
                    q1_loss = tf.reduce_mean(tf.square(td_error1))
                    q2_loss = tf.reduce_mean(tf.square(td_error2))
                    critic_loss = 0.5 * (q1_loss + q2_loss)
                    actor_loss = -tf.reduce_mean(q1_actor)
                critic_grads = tape.gradient(
                    critic_loss, self.critic_net.trainable_variables)
                self.optimizer_critic.apply_gradients(
                    zip(critic_grads, self.critic_net.trainable_variables))
            actor_grads = tape.gradient(actor_loss,
                                        self.actor_net.trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_net.trainable_variables))
            self.global_step.assign_add(1)
            return dict([['LOSS/actor_loss', actor_loss],
                         ['LOSS/q1_loss', q1_loss], ['LOSS/q2_loss', q2_loss],
                         ['LOSS/critic_loss', critic_loss]])
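
Note: update_target_net_weights comes from the base Policy class and is not shown in this example. As a minimal sketch, assuming it implements the usual Polyak averaging rule (the helper name below is hypothetical), the hard copy at construction time and the soft update in learn would look roughly like this:

def soft_update(target_weights, online_weights, ployak=0.0):
    # target_weights / online_weights are lists of tf.Variable (e.g. net.weights).
    # Polyak averaging: target <- ployak * target + (1 - ployak) * online.
    # With ployak=0 (the constructor call) this is a hard copy; with
    # ployak=0.995 (the call in learn) the targets track the online nets slowly.
    for t, o in zip(target_weights, online_weights):
        t.assign(ployak * t + (1.0 - ployak) * o)
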
Example 2
class TD3(make_off_policy_class(mode='share')):
    '''
    Twin Delayed Deep Deterministic Policy Gradient, https://arxiv.org/abs/1802.09477
    '''
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 delay_num=2,
                 noise_type='gaussian',
                 gaussian_noise_sigma=0.2,
                 gaussian_noise_bound=0.2,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 hidden_units={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.ployak = ployak
        self.delay_num = delay_num
        self.discrete_tau = discrete_tau
        self.gaussian_noise_sigma = gaussian_noise_sigma
        self.gaussian_noise_bound = gaussian_noise_bound

        if self.is_continuous:

            def _actor_net():
                return rls.actor_dpg(self.feat_dim, self.a_dim,
                                     hidden_units['actor_continuous'])

            if noise_type == 'gaussian':
                self.action_noise = rls.ClippedNormalActionNoise(
                    mu=np.zeros(self.a_dim),
                    sigma=self.gaussian_noise_sigma * np.ones(self.a_dim),
                    bound=self.gaussian_noise_bound)
            elif noise_type == 'ou':
                self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))
        else:

            def _actor_net():
                return rls.actor_discrete(self.feat_dim, self.a_dim,
                                          hidden_units['actor_discrete'])

            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()
        self.actor_tv = self.actor_net.trainable_variables

        def _q_net():
            return rls.critic_q_one(self.feat_dim, self.a_dim,
                                    hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv

        self.update_target_net_weights(
            self.actor_target_net.weights + self.critic_target_net.weights,
            self.actor_net.weights + self.critic_net.weights)
        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])

        self.model_recorder(
            dict(actor=self.actor_net,
                 critic_net=self.critic_net,
                 optimizer_actor=self.optimizer_actor,
                 optimizer_critic=self.optimizer_critic))

    def show_logo(self):
        self.recorder.logger.info('''
   xxxxxxxxx      xxxxxxx          xxxxx     
   xx  x  xx        x  xxx         xx xx     
   xx  x  xx        x   xx         xx xx     
       x            x   xx           xxx     
       x            x   xxx         xxxx     
       x            x   xx            xxx    
       x            x   xx         xx  xx    
       x            x  xxx         xx xxx    
     xxxxx        xxxxxxx          xxxxx 
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        mu, pi, self.cell_state = self._get_action(s, visual_s,
                                                   self.cell_state)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            if self.is_continuous:
                mu = self.actor_net(feat)
                pi = tf.clip_by_value(mu + self.action_noise(), -1, 1)
            else:
                logits = self.actor_net(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(kwargs['step']):
            self._learn(
                function_dict={
                    'train_function':
                    self.train,
                    'update_function':
                    lambda: self.update_target_net_weights(
                        self.actor_target_net.weights + self.critic_target_net.
                        weights, self.actor_net.weights + self.critic_net.
                        weights, self.ployak),
                    'summary_dict':
                    dict([[
                        'LEARNING_RATE/actor_lr',
                        self.actor_lr(self.train_step)
                    ],
                          [
                              'LEARNING_RATE/critic_lr',
                              self.critic_lr(self.train_step)
                          ]])
                })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            for _ in range(self.delay_num):
                with tf.GradientTape() as tape:
                    feat, feat_ = self.get_feature(ss,
                                                   vvss,
                                                   cell_state=cell_state,
                                                   s_and_s_=True)
                    if self.is_continuous:
                        target_mu = self.actor_target_net(feat_)
                        action_target = tf.clip_by_value(
                            target_mu + self.action_noise(), -1, 1)
                    else:
                        target_logits = self.actor_target_net(feat_)
                        logp_all = tf.nn.log_softmax(target_logits)
                        gumbel_noise = tf.cast(self.gumbel_dist.sample(
                            [batch_size, self.a_dim]),
                                               dtype=tf.float32)
                        _pi = tf.nn.softmax(
                            (logp_all + gumbel_noise) / self.discrete_tau)
                        _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                      self.a_dim)
                        _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                        action_target = _pi_diff + _pi
                    q1, q2 = self.critic_net(feat, a)
                    q_target = self.critic_target_net.get_min(
                        feat_, action_target)
                    dc_r = tf.stop_gradient(r + self.gamma * q_target *
                                            (1 - done))
                    td_error1 = q1 - dc_r
                    td_error2 = q2 - dc_r
                    q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                    q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                    critic_loss = 0.5 * (q1_loss + q2_loss) + crsty_loss
                critic_grads = tape.gradient(critic_loss, self.critic_tv)
                self.optimizer_critic.apply_gradients(
                    zip(critic_grads, self.critic_tv))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                else:
                    logits = self.actor_net(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1),
                                                  self.a_dim,
                                                  dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q1_actor = self.critic_net.Q1(feat, mu)
                actor_loss = -tf.reduce_mean(q1_actor)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return (td_error1 + td_error2) / 2, dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/q_min',
                 tf.reduce_min(tf.minimum(q1, q2))],
                ['Statistics/q_mean',
                 tf.reduce_mean(tf.minimum(q1, q2))],
                ['Statistics/q_max',
                 tf.reduce_max(tf.maximum(q1, q2))],
            ])

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            for _ in range(self.delay_num):
                with tf.GradientTape(persistent=True) as tape:
                    feat, feat_ = self.get_feature(ss,
                                                   vvss,
                                                   cell_state=cell_state,
                                                   s_and_s_=True)
                    if self.is_continuous:
                        target_mu = self.actor_target_net(feat_)
                        action_target = tf.clip_by_value(
                            target_mu + self.action_noise(), -1, 1)
                        mu = self.actor_net(feat)
                    else:
                        target_logits = self.actor_target_net(feat_)
                        logp_all = tf.nn.log_softmax(target_logits)
                        gumbel_noise = tf.cast(self.gumbel_dist.sample(
                            [batch_size, self.a_dim]),
                                               dtype=tf.float32)
                        _pi = tf.nn.softmax(
                            (logp_all + gumbel_noise) / self.discrete_tau)
                        _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                      self.a_dim)
                        _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                        action_target = _pi_diff + _pi
                        logits = self.actor_net(feat)
                        _pi = tf.nn.softmax(logits)
                        _pi_true_one_hot = tf.one_hot(tf.argmax(logits,
                                                                axis=-1),
                                                      self.a_dim,
                                                      dtype=tf.float32)
                        _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                        mu = _pi_diff + _pi
                    q1, q2 = self.critic_net(feat, a)
                    q_target = self.critic_target_net.get_min(
                        feat_, action_target)
                    q1_actor = self.critic_net.Q1(feat, mu)
                    dc_r = tf.stop_gradient(r + self.gamma * q_target *
                                            (1 - done))
                    td_error1 = q1 - dc_r
                    td_error2 = q2 - dc_r
                    q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                    q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                    critic_loss = 0.5 * (q1_loss + q2_loss) + crsty_loss
                    actor_loss = -tf.reduce_mean(q1_actor)
                critic_grads = tape.gradient(critic_loss, self.critic_tv)
                self.optimizer_critic.apply_gradients(
                    zip(critic_grads, self.critic_tv))
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return (td_error1 + td_error2) / 2, dict(
                [['LOSS/actor_loss', actor_loss],
                 ['LOSS/critic_loss', critic_loss],
                 ['Statistics/q_min',
                  tf.reduce_min(tf.minimum(q1, q2))],
                 ['Statistics/q_mean',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_max',
                  tf.reduce_max(tf.maximum(q1, q2))]])
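
Note: the discrete branches of train and train_persistent above build a differentiable one-hot action with a straight-through Gumbel-Softmax: Gumbel noise perturbs the log-probabilities, and tf.stop_gradient makes the forward pass use the one-hot argmax while gradients flow through the soft sample. A self-contained sketch of that trick (the function name is illustrative):

import tensorflow as tf
import tensorflow_probability as tfp

def gumbel_softmax_straight_through(logits, tau=1.0):
    # Soft sample: softmax of (log-probs + Gumbel noise) / temperature.
    gumbel_noise = tfp.distributions.Gumbel(0., 1.).sample(tf.shape(logits))
    soft = tf.nn.softmax((tf.nn.log_softmax(logits) + gumbel_noise) / tau)
    # Straight-through estimator: the forward pass returns the one-hot argmax,
    # the backward pass differentiates through the soft sample only.
    hard = tf.one_hot(tf.argmax(soft, axis=-1), tf.shape(logits)[-1])
    return tf.stop_gradient(hard - soft) + soft
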
Example 3
class SAC_V(make_off_policy_class(mode='share')):
    """
        Soft Actor Critic with Value neural network. https://arxiv.org/abs/1812.05905
        Soft Actor-Critic for Discrete Action Settings. https://arxiv.org/abs/1910.07207
    """
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            alpha=0.2,
            annealing=True,
            last_alpha=0.01,
            ployak=0.995,
            use_gumbel=True,
            discrete_tau=1.0,
            log_std_bound=[-20, 2],
            hidden_units={
                'actor_continuous': {
                    'share': [128, 128],
                    'mu': [64],
                    'log_std': [64]
                },
                'actor_discrete': [64, 32],
                'q': [128, 128],
                'v': [128, 128]
            },
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            alpha_lr=5.0e-4,
            auto_adaption=True,
            **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.ployak = ployak
        self.use_gumbel = use_gumbel
        self.discrete_tau = discrete_tau
        self.log_std_min, self.log_std_max = log_std_bound[:]
        self.auto_adaption = auto_adaption
        self.annealing = annealing

        if self.auto_adaption:
            self.log_alpha = tf.Variable(initial_value=0.0,
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=True)
        else:
            self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha),
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=False)
            if self.annealing:
                self.alpha_annealing = LinearAnnealing(alpha, last_alpha, 1e6)

        if self.is_continuous:
            self.actor_net = rls.actor_continuous(
                self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
        else:
            self.actor_net = rls.actor_discrete(self.feat_dim, self.a_dim,
                                                hidden_units['actor_discrete'])
            if self.use_gumbel:
                self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        self.actor_tv = self.actor_net.trainable_variables
        # target entropy: 0.98 * log|A| for the discrete case (since -log(1/|A|) = log|A|); 0.98 * a_dim when continuous
        self.target_entropy = 0.98 * (self.a_dim if self.is_continuous else
                                      np.log(self.a_dim))

        if self.is_continuous or self.use_gumbel:
            critic_net = rls.critic_q_one
        else:
            critic_net = rls.critic_q_all

        _q_net = lambda: critic_net(self.feat_dim, self.a_dim,
                                    hidden_units['q'])
        _v_net = lambda: rls.critic_v(self.feat_dim, hidden_units['v'])
        self.q_net = DoubleQ(_q_net)
        self.v_net = _v_net()
        self.v_target_net = _v_net()
        self.critic_tv = self.q_net.trainable_variables + self.v_net.trainable_variables + self.other_tv

        self.update_target_net_weights(self.v_target_net.weights,
                                       self.v_net.weights)
        self.actor_lr, self.critic_lr, self.alpha_lr = map(
            self.init_lr, [actor_lr, critic_lr, alpha_lr])
        self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha = map(
            self.init_optimizer,
            [self.actor_lr, self.critic_lr, self.alpha_lr])

        self.model_recorder(
            dict(
                actor=self.actor_net,
                q_net=self.q_net,
                v_net=self.v_net,
                optimizer_actor=self.optimizer_actor,
                optimizer_critic=self.optimizer_critic,
                optimizer_alpha=self.optimizer_alpha,
            ))

    def show_logo(self):
        self.recorder.logger.info('''
    xxxxxxx           xx           xxxxxx                      xxxx  xxx   
    xx   xx          xxx          xxx  xx                      xxx    x    
    xx    x          xxx          xx    xx                      xx   xx    
    xxxx             x xx         xx           xx  xx  xx       xxx  xx    
     xxxxxx         xx xx        xxx           xx  xx  xx        xx xx     
        xxx         xxxxxx       xxx           xx  xx  xx        xxxxx     
    x    xx        xx   xx        xx    xx                        xxx      
    xx   xx        xx   xx        xxx  xxx                        xxx      
    xxxxxxx       xxx  xxxxx       xxxxxx                          x       
                                                                   x                                                               
        ''')

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def choose_action(self, s, visual_s, evaluation=False):
        mu, pi, self.cell_state = self._get_action(s, visual_s,
                                                   self.cell_state)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            if self.is_continuous:
                mu, log_std = self.actor_net(feat)
                log_std = clip_nn_log_std(log_std, self.log_std_min,
                                          self.log_std_max)
                pi, _ = squash_rsample(mu, log_std)
                mu = tf.tanh(mu)  # squash mu
            else:
                logits = self.actor_net(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi, cell_state

    def learn(self, **kwargs):
        self.episode = kwargs['episode']

        def _train(memories, isw, crsty_loss, cell_state):
            if self.is_continuous or self.use_gumbel:
                td_error, summaries = self.train(memories, isw, crsty_loss,
                                                 cell_state)
            else:
                td_error, summaries = self.train_discrete(
                    memories, isw, crsty_loss, cell_state)
            if self.annealing and not self.auto_adaption:
                self.log_alpha.assign(
                    tf.math.log(
                        tf.cast(self.alpha_annealing(self.global_step.numpy()),
                                tf.float32)))
            return td_error, summaries

        for i in range(kwargs['step']):
            self._learn(
                function_dict={
                    'train_function':
                    _train,
                    'update_function':
                    lambda: self.update_target_net_weights(
                        self.v_target_net.weights, self.v_net.weights, self.
                        ployak),
                    'summary_dict':
                    dict([[
                        'LEARNING_RATE/actor_lr',
                        self.actor_lr(self.episode)
                    ], [
                        'LEARNING_RATE/critic_lr',
                        self.critic_lr(self.episode)
                    ], ['LEARNING_RATE/alpha_lr',
                        self.alpha_lr(self.episode)]])
                })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min,
                                              self.log_std_max)
                    pi, log_pi = squash_rsample(mu, log_std)
                else:
                    logits = self.actor_net(feat)
                    cate_dist = tfp.distributions.Categorical(logits)
                    pi = cate_dist.sample()
                    log_pi = cate_dist.log_prob(pi)
                    pi = tf.one_hot(pi, self.a_dim, dtype=tf.float32)
                q1, q2 = self.q_net(feat, a)
                q_pi = self.q_net.get_min(feat, pi)
                v = self.v_net(feat)
                v_target = self.v_target_net(feat_)
                dc_r = tf.stop_gradient(r + self.gamma * v_target * (1 - done))
                v_from_q_stop = tf.stop_gradient(q_pi - self.alpha * log_pi)
                td_v = v - v_from_q_stop
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                v_loss_stop = tf.reduce_mean(tf.square(td_v) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + 0.5 * v_loss_stop + crsty_loss
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min,
                                              self.log_std_max)
                    pi, log_pi = squash_rsample(mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [batch_size, self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi),
                                           axis=1,
                                           keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                q1_pi = self.q_net.Q1(feat, pi)
                actor_loss = -tf.reduce_mean(q1_pi - self.alpha * log_pi)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))
            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    if self.is_continuous:
                        mu, log_std = self.actor_net(feat)
                        log_std = clip_nn_log_std(log_std, self.log_std_min,
                                                  self.log_std_max)
                        # pi, log_pi = squash_rsample(mu, log_std)
                        norm_dist = tfp.distributions.Normal(
                            loc=mu, scale=tf.exp(log_std))
                        log_pi = tf.reduce_sum(norm_dist.log_prob(
                            norm_dist.sample()),
                                               axis=-1)
                    else:
                        logits = self.actor_net(feat)
                        cate_dist = tfp.distributions.Categorical(logits)
                        log_pi = cate_dist.log_prob(cate_dist.sample())
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(log_pi - self.target_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict(
                [['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss],
                 ['LOSS/q2_loss', q2_loss], ['LOSS/v_loss', v_loss_stop],
                 ['LOSS/critic_loss', critic_loss],
                 ['Statistics/log_alpha', self.log_alpha],
                 ['Statistics/alpha', self.alpha],
                 ['Statistics/entropy', entropy],
                 ['Statistics/q_min',
                  tf.reduce_min(tf.minimum(q1, q2))],
                 ['Statistics/q_mean',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_max',
                  tf.reduce_max(tf.maximum(q1, q2))],
                 ['Statistics/v_mean', tf.reduce_mean(v)]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2, summaries

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min,
                                              self.log_std_max)
                    pi, log_pi = squash_rsample(mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [batch_size, self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax(
                        (logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi),
                                           axis=1,
                                           keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                      axis=1,
                                      keepdims=True))
                q1, q2 = self.q_net(feat, a)
                v = self.v_net(feat)
                q1_pi, q2_pi = self.q_net(feat, pi)
                v_target = self.v_target_net(feat_)
                dc_r = tf.stop_gradient(r + self.gamma * v_target * (1 - done))
                v_from_q_stop = tf.stop_gradient(
                    tf.minimum(q1_pi, q2_pi) - self.alpha * log_pi)
                td_v = v - v_from_q_stop
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                v_loss_stop = tf.reduce_mean(tf.square(td_v) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + 0.5 * v_loss_stop + crsty_loss
                actor_loss = -tf.reduce_mean(q1_pi - self.alpha * log_pi)
                if self.auto_adaption:
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(log_pi - self.target_entropy))
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))
            if self.auto_adaption:
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict(
                [['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss],
                 ['LOSS/q2_loss', q2_loss], ['LOSS/v_loss', v_loss_stop],
                 ['LOSS/critic_loss', critic_loss],
                 ['Statistics/log_alpha', self.log_alpha],
                 ['Statistics/alpha', self.alpha],
                 ['Statistics/entropy', entropy],
                 ['Statistics/q_min',
                  tf.reduce_min(tf.minimum(q1, q2))],
                 ['Statistics/q_mean',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_max',
                  tf.reduce_max(tf.maximum(q1, q2))],
                 ['Statistics/v_mean', tf.reduce_mean(v)]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2, summaries

    @tf.function(experimental_relax_shapes=True)
    def train_discrete(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                q1_all, q2_all = self.q_net(feat)  # [B, A]
                q_function = lambda x: tf.reduce_sum(
                    x * a, axis=-1, keepdims=True)  #[B, 1]
                q1 = q_function(q1_all)
                q2 = q_function(q2_all)
                logits = self.actor_net(feat)  #[B, A]
                logp_all = tf.nn.log_softmax(logits)  #[B, A]
                v = self.v_net(feat)  # [B, 1]
                v_target = self.v_target_net(feat_)  # [B, 1]
                dc_r = tf.stop_gradient(r + self.gamma * v_target * (1 - done))
                td_v = v - tf.stop_gradient(
                    tf.minimum(
                        tf.reduce_sum(tf.exp(logp_all) * q1_all, axis=-1),
                        tf.reduce_sum(tf.exp(logp_all) * q2_all, axis=-1)))
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                v_loss_stop = tf.reduce_mean(tf.square(td_v) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + 0.5 * v_loss_stop + crsty_loss
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(critic_grads, self.critic_tv))
            with tf.GradientTape() as tape:
                logits = self.actor_net(feat)
                logp_all = tf.nn.log_softmax(logits)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(
                        tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                q_all = self.q_net.get_min(feat)  # [B, A]
                actor_loss = -tf.reduce_mean(
                    tf.reduce_sum((q_all - self.alpha * logp_all) *
                                  tf.exp(logp_all), axis=-1)  # [B, A] => [B,]
                )
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_tv))
            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    entropy = -tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                             axis=1,
                                             keepdims=True)  # [B, 1]
                    corr = tf.stop_gradient(-entropy - self.target_entropy)
                    # corr = tf.stop_gradient(tf.reduce_sum((logp_all - self.a_dim) * tf.exp(logp_all), axis=-1))    #[B, A] => [B,]
                    alpha_loss = -tf.reduce_mean(self.alpha * corr)
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict([['LOSS/actor_loss', actor_loss],
                              ['LOSS/q1_loss', q1_loss],
                              ['LOSS/q2_loss', q2_loss],
                              ['LOSS/v_loss', v_loss_stop],
                              ['LOSS/critic_loss', critic_loss],
                              ['Statistics/log_alpha', self.log_alpha],
                              ['Statistics/alpha', self.alpha],
                              ['Statistics/entropy',
                               tf.reduce_mean(entropy)],
                              ['Statistics/v_mean',
                               tf.reduce_mean(v)]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2, summaries
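
Note: squash_rsample, clip_nn_log_std and gaussian_entropy are helpers from the surrounding library and are not shown in this example. Assuming the standard SAC construction, squash_rsample draws a reparameterized Gaussian sample, squashes it with tanh, and corrects the log-probability for the squashing; a minimal sketch under that assumption (name and epsilon are illustrative):

import tensorflow as tf
import tensorflow_probability as tfp

def squash_rsample_sketch(mu, log_std, eps=1e-6):
    # Reparameterized sample from N(mu, exp(log_std)), squashed into (-1, 1).
    dist = tfp.distributions.Normal(loc=mu, scale=tf.exp(log_std))
    raw_action = dist.sample()  # differentiable w.r.t. mu and log_std
    pi = tf.tanh(raw_action)
    # log pi(a|s) = log N(raw) - sum_i log(1 - tanh(raw_i)^2), eps for numerical stability.
    log_pi = tf.reduce_sum(
        dist.log_prob(raw_action) - tf.math.log(1.0 - tf.square(pi) + eps),
        axis=-1, keepdims=True)
    return pi, log_pi
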
Example 4
class HIRO(make_off_policy_class(mode='no_share')):
    '''
    Data-Efficient Hierarchical Reinforcement Learning, http://arxiv.org/abs/1805.08296
    '''
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            ployak=0.995,
            high_scale=1.0,
            reward_scale=1.0,
            sample_g_nums=100,
            sub_goal_steps=10,
            fn_goal_dim=0,
            intrinsic_reward_mode='os',
            high_batch_size=256,
            high_buffer_size=100000,
            low_batch_size=8,
            low_buffer_size=10000,
            high_actor_lr=1.0e-4,
            high_critic_lr=1.0e-3,
            low_actor_lr=1.0e-4,
            low_critic_lr=1.0e-3,
            hidden_units={
                'high_actor': [64, 64],
                'high_critic': [64, 64],
                'low_actor': [64, 64],
                'low_critic': [64, 64]
            },
            **kwargs):
        assert visual_sources == 0, 'HIRO doesn\'t support visual inputs.'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.data_high = ExperienceReplay(high_batch_size, high_buffer_size)
        self.data_low = ExperienceReplay(low_batch_size, low_buffer_size)

        self.ployak = ployak
        self.high_scale = np.array(
            high_scale if isinstance(high_scale, list) else [high_scale] *
            self.s_dim,
            dtype=np.float32)
        self.reward_scale = reward_scale
        self.fn_goal_dim = fn_goal_dim
        self.sample_g_nums = sample_g_nums
        self.sub_goal_steps = sub_goal_steps
        self.sub_goal_dim = self.s_dim - self.fn_goal_dim

        self.high_noise = rls.ClippedNormalActionNoise(
            mu=np.zeros(self.sub_goal_dim),
            sigma=self.high_scale * np.ones(self.sub_goal_dim),
            bound=self.high_scale / 2)
        self.low_noise = rls.ClippedNormalActionNoise(mu=np.zeros(self.a_dim),
                                                      sigma=1.0 *
                                                      np.ones(self.a_dim),
                                                      bound=0.5)

        _high_actor_net = lambda: rls.actor_dpg(self.s_dim, self.sub_goal_dim,
                                                hidden_units['high_actor'])
        if self.is_continuous:
            _low_actor_net = lambda: rls.actor_dpg(
                self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                    'low_actor'])
        else:
            _low_actor_net = lambda: rls.actor_discrete(
                self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                    'low_actor'])
            self.gumbel_dist = tfd.Gumbel(0, 1)

        self.high_actor = _high_actor_net()
        self.high_actor_target = _high_actor_net()
        self.low_actor = _low_actor_net()
        self.low_actor_target = _low_actor_net()

        _high_critic_net = lambda: rls.critic_q_one(
            self.s_dim, self.sub_goal_dim, hidden_units['high_critic'])
        _low_critic_net = lambda: rls.critic_q_one(
            self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                'low_critic'])

        self.high_critic = DoubleQ(_high_critic_net)
        self.high_critic_target = DoubleQ(_high_critic_net)
        self.low_critic = DoubleQ(_low_critic_net)
        self.low_critic_target = DoubleQ(_low_critic_net)

        self.update_target_net_weights(
            self.low_actor_target.weights + self.low_critic_target.weights +
            self.high_actor_target.weights + self.high_critic_target.weights,
            self.low_actor.weights + self.low_critic.weights +
            self.high_actor.weights + self.high_critic.weights)

        self.low_actor_lr, self.low_critic_lr = map(
            self.init_lr, [low_actor_lr, low_critic_lr])
        self.high_actor_lr, self.high_critic_lr = map(
            self.init_lr, [high_actor_lr, high_critic_lr])
        self.low_actor_optimizer, self.low_critic_optimizer = map(
            self.init_optimizer, [self.low_actor_lr, self.low_critic_lr])
        self.high_actor_optimizer, self.high_critic_optimizer = map(
            self.init_optimizer, [self.high_actor_lr, self.high_critic_lr])

        self.model_recorder(
            dict(high_actor=self.high_actor,
                 high_critic=self.high_critic,
                 low_actor=self.low_actor,
                 low_critic=self.low_critic,
                 low_actor_optimizer=self.low_actor_optimizer,
                 low_critic_optimizer=self.low_critic_optimizer,
                 high_actor_optimizer=self.high_actor_optimizer,
                 high_critic_optimizer=self.high_critic_optimizer))

        self.counts = 0
        self._high_s = [[] for _ in range(self.n_agents)]
        self._noop_subgoal = np.random.uniform(-self.high_scale,
                                               self.high_scale,
                                               size=(self.n_agents,
                                                     self.sub_goal_dim))
        self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode)

    def generate_ir_func(self, mode='os'):
        if mode == 'os':
            return lambda last_feat, subgoal, feat: -tf.norm(
                last_feat + subgoal - feat, ord=2, axis=-1, keepdims=True)
        elif mode == 'cos':
            return lambda last_feat, subgoal, feat: tf.expand_dims(
                -tf.keras.losses.cosine_similarity(
                    tf.cast(feat - last_feat, tf.float32),
                    tf.cast(subgoal, tf.float32),
                    axis=-1),
                axis=-1)

    def show_logo(self):
        self.recorder.logger.info('''
  xxxxx xxxxx        xxxx        xxxxxxx          xxxxxx    
    xx   xx           xx          xxxxxxx        xxx xxxx   
    xx   xx           xx          xx  xxx       xxx   xxx   
    xx   xx           xx          xx  xxx       xx     xxx  
    xxxxxxx           xx          xxxxxx        xx     xxx  
    xx   xx           xx          xxxxxx        xx     xxx  
    xx   xx           xx          xx xxxx       xx     xxx  
    xx   xx           xx          xx  xxx       xxx   xxx   
  xxxxx xxxxx        xxxx        xxxxx xxxx      xxxxxxx   
        ''')

    def store_high_buffer(self, i):
        eps_len = len(self._high_s[i])
        intervals = list(range(0, eps_len, self.sub_goal_steps))
        if len(intervals) < 1:
            return
        left = intervals[:-1]
        right = intervals[1:]
        s, r, a, g, d, s_ = [], [], [], [], [], []
        for _l, _r in zip(left, right):
            s.append(self._high_s[i][_l:_r])
            r.append(sum(self._high_r[i][_l:_r]) * self.reward_scale)
            a.append(self._high_a[i][_l:_r])
            g.append(self._subgoals[i][_l])
            d.append(self._done[i][_r - 1])
            s_.append(self._high_s_[i][_r - 1])

        right = intervals[-1]
        s.append(self._high_s[i][right:eps_len] + [self._high_s[i][-1]] *
                 (self.sub_goal_steps + right - eps_len))
        r.append(sum(self._high_r[i][right:eps_len]))
        a.append(self._high_a[i][right:eps_len] + [self._high_a[i][-1]] *
                 (self.sub_goal_steps + right - eps_len))
        g.append(self._subgoals[i][right])
        d.append(self._done[i][-1])
        s_.append(self._high_s_[i][-1])
        self.data_high.add(np.array(s),
                           np.array(r)[:, np.newaxis], np.array(a),
                           np.array(g),
                           np.array(d)[:, np.newaxis], np.array(s_))

    def reset(self):
        self._c = np.full((self.n_agents, 1), self.sub_goal_steps, np.int32)

        for i in range(self.n_agents):
            self.store_high_buffer(i)
        self._high_r = [[] for _ in range(self.n_agents)]
        self._high_a = [[] for _ in range(self.n_agents)]
        self._high_s = [[] for _ in range(self.n_agents)]
        self._subgoals = [[] for _ in range(self.n_agents)]
        self._done = [[] for _ in range(self.n_agents)]
        self._high_s_ = [[] for _ in range(self.n_agents)]

        self._new_subgoal = np.zeros((self.n_agents, self.sub_goal_dim),
                                     dtype=np.float32)

    def partial_reset(self, done):
        self._c = np.where(
            done[:, np.newaxis],
            np.full((self.n_agents, 1), self.sub_goal_steps, np.int32),
            self._c)
        idx = np.where(done)[0]
        for i in idx:
            self.store_high_buffer(i)
            self._high_s[i] = []
            self._high_a[i] = []
            self._high_s_[i] = []
            self._high_r[i] = []
            self._done[i] = []
            self._subgoals[i] = []

    @tf.function
    def _get_action(self, s, visual_s, subgoal):
        with tf.device(self.device):
            feat = tf.concat([s, subgoal], axis=-1)
            if self.is_continuous:
                mu = self.low_actor(feat)
                pi = tf.clip_by_value(mu + self.low_noise(), -1, 1)
            else:
                logits = self.low_actor(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfd.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi

    def choose_action(self, s, visual_s, evaluation=False):
        self._subgoal = np.where(self._c == self.sub_goal_steps,
                                 self.get_subgoal(s).numpy(),
                                 self._new_subgoal)
        mu, pi = self._get_action(s, visual_s, self._subgoal)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def get_subgoal(self, s):
        '''
        last_s: previous hidden state
        subgoal: previous subgoal
        s: current hidden state
        '''
        new_subgoal = self.high_scale * self.high_actor(s)
        new_subgoal = tf.clip_by_value(new_subgoal + self.high_noise(),
                                       -self.high_scale, self.high_scale)
        return new_subgoal

    def learn(self, **kwargs):
        self.episode = kwargs['episode']
        for i in range(kwargs['step']):
            if self.data_low.is_lg_batch_size and self.data_high.is_lg_batch_size:
                self.intermediate_variable_reset()
                low_data = self.get_transitions(
                    self.data_low,
                    data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'])
                high_data = self.get_transitions(
                    self.data_high,
                    data_name_list=['s', 'r', 'a', 'g', 'done', 's_'])

                # -------------------------------------- gather the arguments to pass to the train functions
                _low_training_data = self.get_value_from_dict(
                    data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'],
                    data_dict=low_data)
                _high_training_data = self.get_value_from_dict(
                    data_name_list=['s', 'r', 'a', 'g', 'done', 's_'],
                    data_dict=high_data)
                summaries = self.train_low(_low_training_data)

                self.summaries.update(summaries)
                self.update_target_net_weights(
                    self.low_actor_target.weights +
                    self.low_critic_target.weights,
                    self.low_actor.weights + self.low_critic.weights,
                    self.ployak)
                if self.counts % self.sub_goal_steps == 0:
                    self.counts = 0
                    high_summaries = self.train_high(_high_training_data)
                    self.summaries.update(high_summaries)
                    self.update_target_net_weights(
                        self.high_actor_target.weights +
                        self.high_critic_target.weights,
                        self.high_actor.weights + self.high_critic.weights,
                        self.ployak)
                self.counts += 1
                self.summaries.update(
                    dict([[
                        'LEARNING_RATE/low_actor_lr',
                        self.low_actor_lr(self.episode)
                    ],
                          [
                              'LEARNING_RATE/low_critic_lr',
                              self.low_critic_lr(self.episode)
                          ],
                          [
                              'LEARNING_RATE/high_actor_lr',
                              self.high_actor_lr(self.episode)
                          ],
                          [
                              'LEARNING_RATE/high_critic_lr',
                              self.high_critic_lr(self.episode)
                          ]]))
                self.write_training_summaries(self.global_step, self.summaries)

    @tf.function(experimental_relax_shapes=True)
    def train_low(self, memories):
        s, a, r, s_, done, g, g_ = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = tf.concat([s, g], axis=-1)
                feat_ = tf.concat([s_, g_], axis=-1)

                if self.is_continuous:
                    target_mu = self.low_actor_target(feat_)
                    action_target = tf.clip_by_value(
                        target_mu + self.low_noise(), -1, 1)
                else:
                    target_logits = self.low_actor_target(feat_)
                    logp_all = tf.nn.log_softmax(target_logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample(
                        [tf.shape(feat_)[0], self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / 1.)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1),
                                                  self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    action_target = _pi_diff + _pi
                q1, q2 = self.low_critic(feat, a)
                q = tf.minimum(q1, q2)
                q_target = self.low_critic_target.get_min(feat_, action_target)
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1))
                q2_loss = tf.reduce_mean(tf.square(td_error2))
                low_critic_loss = q1_loss + q2_loss
            low_critic_grads = tape.gradient(low_critic_loss,
                                             self.low_critic.weights)
            self.low_critic_optimizer.apply_gradients(
                zip(low_critic_grads, self.low_critic.weights))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.low_actor(feat)
                else:
                    logits = self.low_actor(feat)
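                    # Straight-through estimator: the forward pass uses the
                    # hard one-hot action, while the gradient flows through
                    # the softmax probabilities via the stop_gradient trick.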
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1),
                                                  self.a_dim,
                                                  dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q_actor = self.low_critic.Q1(feat, mu)
                low_actor_loss = -tf.reduce_mean(q_actor)
            low_actor_grads = tape.gradient(low_actor_loss,
                                            self.low_actor.trainable_variables)
            self.low_actor_optimizer.apply_gradients(
                zip(low_actor_grads, self.low_actor.trainable_variables))

            self.global_step.assign_add(1)
            return dict([['LOSS/low_actor_loss', low_actor_loss],
                         ['LOSS/low_critic_loss', low_critic_loss],
                         ['Statistics/low_q_min',
                          tf.reduce_min(q)],
                         ['Statistics/low_q_mean',
                          tf.reduce_mean(q)],
                         ['Statistics/low_q_max',
                          tf.reduce_max(q)]])

    @tf.function(experimental_relax_shapes=True)
    def train_high(self, memories):
        # s_ : [B, N]
        ss, r, aa, g, done, s_ = memories

        batchs = tf.shape(ss)[0]
        # ss, aa [B, T, *]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                s = ss[:, 0]  # [B, N]
                true_end = (s_ - s)[:, self.fn_goal_dim:]
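                # displacement of the goal-relevant state dimensions achieved
                # over the whole segment; candidate goals for the off-policy
                # correction are sampled around it below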
                g_dist = tfd.Normal(loc=true_end,
                                    scale=0.5 * self.high_scale[None, :])
                ss = tf.expand_dims(ss, 0)  # [1, B, T, *]
                ss = tf.tile(ss,
                             [self.sample_g_nums, 1, 1, 1])  # [10, B, T, *]
                ss = tf.reshape(ss, [-1, tf.shape(ss)[-1]])  # [10*B*T, *]
                aa = tf.expand_dims(aa, 0)  # [1, B, T, *]
                aa = tf.tile(aa,
                             [self.sample_g_nums, 1, 1, 1])  # [10, B, T, *]
                aa = tf.reshape(aa, [-1, tf.shape(aa)[-1]])  # [10*B*T, *]
                gs = tf.concat([
                    tf.expand_dims(g, 0),
                    tf.expand_dims(true_end, 0),
                    tf.clip_by_value(g_dist.sample(self.sample_g_nums - 2),
                                     -self.high_scale, self.high_scale)
                ],
                               axis=0)  # [10, B, N]

                all_g = gs + s[:, self.fn_goal_dim:]
                all_g = tf.expand_dims(all_g, 2)  # [10, B, 1, N]
                all_g = tf.tile(
                    all_g, [1, 1, self.sub_goal_steps, 1])  # [10, B, T, N]
                all_g = tf.reshape(all_g,
                                   [-1, tf.shape(all_g)[-1]])  # [10*B*T, N]
                all_g = all_g - ss[:, self.fn_goal_dim:]  # [10*B*T, N]
                feat = tf.concat([ss, all_g], axis=-1)  # [10*B*T, *]
                _aa = self.low_actor(feat)  # [10*B*T, A]
                if not self.is_continuous:
                    _aa = tf.one_hot(tf.argmax(_aa, axis=-1),
                                     self.a_dim,
                                     dtype=tf.float32)
                diff = _aa - aa
                diff = tf.reshape(
                    diff,
                    [self.sample_g_nums, batchs, self.sub_goal_steps, -1
                     ])  # [10, B, T, A]
                diff = tf.transpose(diff, [1, 0, 2, 3])  # [B, 10, T, A]
                logps = -0.5 * tf.reduce_sum(tf.norm(diff, ord=2, axis=-1)**2,
                                             axis=-1)  # [B, 10]
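                # up to an additive constant, the Gaussian log-likelihood of
                # the stored low-level actions under each candidate goal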
                idx = tf.argmax(logps, axis=-1, output_type=tf.int32)
                idx = tf.stack([tf.range(batchs), idx], axis=1)  # [B, 2]
                g = tf.gather_nd(tf.transpose(gs, [1, 0, 2]), idx)  # [B, N]

                q1, q2 = self.high_critic(s, g)
                q = tf.minimum(q1, q2)

                target_sub_goal = self.high_actor_target(s_) * self.high_scale
                target_sub_goal = tf.clip_by_value(
                    target_sub_goal + self.high_noise(), -self.high_scale,
                    self.high_scale)
                q_target = self.high_critic_target.get_min(s_, target_sub_goal)

                dc_r = tf.stop_gradient(r + self.gamma * (1 - done) * q_target)
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1))
                q2_loss = tf.reduce_mean(tf.square(td_error2))
                high_critic_loss = q1_loss + q2_loss

            high_critic_grads = tape.gradient(high_critic_loss,
                                              self.high_critic.weights)
            self.high_critic_optimizer.apply_gradients(
                zip(high_critic_grads, self.high_critic.weights))
            with tf.GradientTape() as tape:
                mu = self.high_actor(s) * self.high_scale
                q_actor = self.high_critic.Q1(s, mu)
                high_actor_loss = -tf.reduce_mean(q_actor)
            high_actor_grads = tape.gradient(
                high_actor_loss, self.high_actor.trainable_variables)
            self.high_actor_optimizer.apply_gradients(
                zip(high_actor_grads, self.high_actor.trainable_variables))
            return dict([['LOSS/high_actor_loss', high_actor_loss],
                         ['LOSS/high_critic_loss', high_critic_loss],
                         ['Statistics/high_q_min',
                          tf.reduce_min(q)],
                         ['Statistics/high_q_mean',
                          tf.reduce_mean(q)],
                         ['Statistics/high_q_max',
                          tf.reduce_max(q)]])

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a,
                          np.ndarray), "store requires action to be np.ndarray"
        assert isinstance(r,
                          np.ndarray), "store requires reward to be np.ndarray"
        assert isinstance(done,
                          np.ndarray), "store requires done to be np.ndarray"
        [o.append(_s) for o, _s in zip(self._high_s, s)]
        [o.append(_a) for o, _a in zip(self._high_a, a)]
        [o.append(_r) for o, _r in zip(self._high_r, r)]
        [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)]
        [o.append(_d) for o, _d in zip(self._done, done)]
        [
            o.append(_subgoal)
            for o, _subgoal in zip(self._subgoals, self._noop_subgoal)
        ]

        ir = self.get_ir(s[:, self.fn_goal_dim:], self._noop_subgoal,
                         s_[:, self.fn_goal_dim:])
        # subgoal = s[:, self.fn_goal_dim:] + self._noop_subgoal - s_[:, self.fn_goal_dim:]
        subgoal = np.random.uniform(-self.high_scale,
                                    self.high_scale,
                                    size=(self.n_agents, self.sub_goal_dim))
        self.data_low.add(
            s,
            a,
            ir,
            s_,
            done[:, np.newaxis],  # add a trailing dimension
            self._noop_subgoal,
            subgoal)
        self._noop_subgoal = subgoal

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer.
        """
        assert isinstance(a,
                          np.ndarray), "store requires action to be np.ndarray"
        assert isinstance(r,
                          np.ndarray), "store requires reward to be np.ndarray"
        assert isinstance(done,
                          np.ndarray), "store requires done to be np.ndarray"
        [o.append(_s) for o, _s in zip(self._high_s, s)]
        [o.append(_a) for o, _a in zip(self._high_a, a)]
        [o.append(_r) for o, _r in zip(self._high_r, r)]
        [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)]
        [o.append(_d) for o, _d in zip(self._done, done)]
        [
            o.append(_subgoal)
            for o, _subgoal in zip(self._subgoals, self._subgoal)
        ]

        ir = self.get_ir(s[:, self.fn_goal_dim:], self._subgoal,
                         s_[:, self.fn_goal_dim:])
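        # Next-subgoal bookkeeping: when the segment counter expires, the
        # high-level actor proposes a fresh subgoal for the next state;
        # otherwise the current subgoal is relabelled as
        # h(s, g, s_) = s + g - s_ (on the goal dimensions), so it keeps
        # pointing at the same absolute target from the new state.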
        self._new_subgoal = np.where(
            self._c == 1,
            self.get_subgoal(s_).numpy(),
            s[:, self.fn_goal_dim:] + self._subgoal - s_[:, self.fn_goal_dim:])

        self.data_low.add(
            s,
            a,
            ir,
            s_,
            done[:, np.newaxis],  # add a trailing dimension
            self._subgoal,
            self._new_subgoal)
        self._c = np.where(
            self._c == 1,
            np.full((self.n_agents, 1), self.sub_goal_steps, np.int32),
            self._c - 1)

    def get_transitions(self,
                        databuffer,
                        data_name_list=['s', 'a', 'r', 's_', 'done']):
        '''
        Sample a batch of transitions from `databuffer`; discrete actions are
        converted to one-hot before the batch is returned as a name->data dict.
        '''
        data = databuffer.sample()  # draw a batch from the replay buffer
        if not self.is_continuous and 'a' in data_name_list:
            a_idx = data_name_list.index('a')
            a = data[a_idx].astype(np.int32)
            pre_shape = a.shape
            a = a.reshape(-1)
            a = sth.int2one_hot(a, self.a_dim)
            a = a.reshape(pre_shape + (-1, ))
            data[a_idx] = a
        return dict([[
            n, d
        ] for n, d in zip(data_name_list, list(map(self.data_convert, data)))])
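
The candidate-goal correction carried out in `train_high` above is easier to follow outside of the TensorFlow graph. Below is a minimal NumPy sketch of the same idea; the name `relabel_goal`, the `goal_of` slice, and the use of the last stored state as the segment end are illustrative assumptions, not part of the class above.

import numpy as np

def relabel_goal(ss, aa, g, low_actor, high_scale, sample_g_nums=10):
    """Pick, for each trajectory, the candidate subgoal that best explains
    the low-level actions that were actually stored.

    ss: [B, T, S] states of one high-level segment
    aa: [B, T, A] low-level actions taken in that segment
    g : [B, N]    subgoal pursued at the start of the segment
    low_actor: callable mapping [M, S + N] features to [M, A] actions
    """
    B, T, _ = ss.shape
    N = g.shape[-1]
    goal_of = lambda x: x[..., -N:]                # goal-relevant slice of the state
    # displacement actually achieved over the segment (last stored state as the end)
    true_end = goal_of(ss[:, -1]) - goal_of(ss[:, 0])
    noise = np.random.normal(true_end, 0.5 * np.asarray(high_scale),
                             size=(sample_g_nums - 2, B, N))
    # candidates: the original goal, the achieved displacement, noisy samples around it
    cands = np.concatenate([g[None], true_end[None],
                            np.clip(noise, -high_scale, high_scale)], axis=0)  # [K, B, N]
    logps = np.empty((B, sample_g_nums))
    for k in range(sample_g_nums):
        abs_goal = cands[k] + goal_of(ss[:, 0])    # absolute target implied by the candidate
        rel = abs_goal[:, None, :] - goal_of(ss)   # goal re-expressed relative to each step
        feat = np.concatenate([ss, rel], axis=-1).reshape(B * T, -1)
        pred = np.asarray(low_actor(feat)).reshape(B, T, -1)
        # Gaussian log-likelihood (up to a constant) of the stored actions
        logps[:, k] = -0.5 * np.sum(np.linalg.norm(pred - aa, axis=-1) ** 2, axis=-1)
    best = np.argmax(logps, axis=1)
    return cands[best, np.arange(B)]               # [B, N] relabelled subgoals

Each candidate is scored by how well the current low-level actor reproduces the stored actions when conditioned on it, which is what the batched `logps` / `tf.gather_nd` block in `train_high` computes inside the graph.
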
Esempio n. 5
0
class MAXSQN(make_off_policy_class(mode='share')):
    '''
    https://github.com/createamind/DRL/blob/master/spinup/algos/maxsqn/maxsqn.py
    '''
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 alpha=0.2,
                 beta=0.1,
                 ployak=0.995,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 use_epsilon=False,
                 q_lr=5.0e-4,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 hidden_units=[32, 32],
                 **kwargs):
        assert not is_continuous, 'maxsqn only support discrete action space'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.use_epsilon = use_epsilon
        self.ployak = ployak
        self.log_alpha = alpha if not auto_adaption else tf.Variable(
            initial_value=0.0,
            name='log_alpha',
            dtype=tf.float32,
            trainable=True)
        self.auto_adaption = auto_adaption
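        # log(a_dim) is the maximum entropy of a discrete policy over a_dim
        # actions; beta sets the fraction of it used as the entropy target.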
        self.target_entropy = beta * np.log(self.a_dim)

        def _q_net():
            return rls.critic_q_all(self.feat_dim, self.a_dim, hidden_units)

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        self.update_target_net_weights(self.critic_target_net.weights,
                                       self.critic_net.weights)
        self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
        self.optimizer_critic, self.optimizer_alpha = map(
            self.init_optimizer, [self.q_lr, self.alpha_lr])

        self.model_recorder(
            dict(critic_net=self.critic_net,
                 optimizer_critic=self.optimizer_critic,
                 optimizer_alpha=self.optimizer_alpha))

    def show_logo(self):
        self.recorder.logger.info('''
   xx     xx                                      xxxxxx         xxxxxx       xxxx   xx   
   xxx   xxx                                     xxx xxx        xxxx xxx      xxxx   xx   
   xxx   xxx        xxxxx          x   xx        xx             xx    xx      xxxxx  xx   
   xxxx  xxx       xxxxxx          xx xxx        xxxxxx         xx    xxx     xx xxx xx   
   xxxx xx x        x  xxx         xxxxx          xxxxxx       xx      xx     xx  xxxxx   
   xxxx xx x        xxxxxx          xxx               xxx      xxx  x xxx     xx   xxxx   
   xx xxx  x       xxx  xx          xxx          xx    xx       xx xxxxx      xx   xxxx   
   xx xxx  x       xx  xxx         xxxxx        xxxxxxxxx       xxx xxxx      xx    xxx   
   xx xxx  x       xxxxxxxx       xxx xxx        xxxxxxx         xxxxxxx      xx     xx   
                                                                  xxxxxxx                       
        ''')

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def choose_action(self, s, visual_s, evaluation=False):
        if self.use_epsilon and np.random.uniform(
        ) < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            mu, pi, self.cell_state = self._get_action(s, visual_s,
                                                       self.cell_state)
            a = pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s,
                                                visual_s,
                                                cell_state=cell_state,
                                                record_cs=True)
            q = self.critic_net.Q1(feat)
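            # Boltzmann exploration: actions are sampled with probability
            # proportional to exp(Q / alpha), so a larger alpha explores more.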
            cate_dist = tfp.distributions.Categorical(logits=q / self.alpha)
            pi = cate_dist.sample()
        return tf.argmax(q, axis=1), pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'train_function':
                    self.train,
                    'update_function':
                    lambda: self.update_target_net_weights(
                        self.critic_target_net.weights, self.critic_net.
                        weights, self.ployak),
                    'summary_dict':
                    dict([['LEARNING_RATE/q_lr',
                           self.q_lr(self.train_step)],
                          [
                              'LEARNING_RATE/alpha_lr',
                              self.alpha_lr(self.train_step)
                          ]])
                })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss,
                                               vvss,
                                               cell_state=cell_state,
                                               s_and_s_=True)
                q1, q2 = self.critic_net(feat)
                q1_eval = tf.reduce_sum(tf.multiply(q1, a),
                                        axis=1,
                                        keepdims=True)
                q2_eval = tf.reduce_sum(tf.multiply(q2, a),
                                        axis=1,
                                        keepdims=True)

                q1_target, q2_target = self.critic_target_net(feat_)
                q1_target_max = tf.reduce_max(q1_target, axis=1, keepdims=True)
                q1_target_log_probs = tf.nn.log_softmax(q1_target / self.alpha,
                                                        axis=1) + 1e-8
                q1_target_entropy = -tf.reduce_mean(
                    tf.reduce_sum(
                        tf.exp(q1_target_log_probs) * q1_target_log_probs,
                        axis=1,
                        keepdims=True))

                q2_target_max = tf.reduce_max(q2_target, axis=1, keepdims=True)
                # q2_target_log_probs = tf.nn.log_softmax(q2_target, axis=1)
                # q2_target_log_max = tf.reduce_max(q2_target_log_probs, axis=1, keepdims=True)

                q_target = tf.minimum(
                    q1_target_max,
                    q2_target_max) + self.alpha * q1_target_entropy
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error1 = q1_eval - dc_r
                td_error2 = q2_eval - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                loss = 0.5 * (q1_loss + q2_loss) + crsty_loss
            loss_grads = tape.gradient(loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(
                zip(loss_grads, self.critic_tv))
            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    q1 = self.critic_net.Q1(feat)
                    q1_log_probs = tf.nn.log_softmax(q1 / self.alpha,
                                                     axis=1) + 1e-8
                    q1_entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(q1_log_probs) * q1_log_probs,
                                      axis=1,
                                      keepdims=True))
                    alpha_loss = -tf.reduce_mean(
                        self.alpha *
                        tf.stop_gradient(self.target_entropy - q1_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad,
                                                       self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict(
                [['LOSS/loss', loss], ['Statistics/log_alpha', self.log_alpha],
                 ['Statistics/alpha', self.alpha],
                 ['Statistics/q_min',
                  tf.reduce_mean(tf.minimum(q1, q2))],
                 ['Statistics/q_mean', tf.reduce_mean(q1)],
                 ['Statistics/q_max',
                  tf.reduce_mean(tf.maximum(q1, q2))]])
            if self.auto_adaption:
                # q1_entropy and alpha_loss are only defined when alpha is
                # auto-tuned, so log them inside this branch.
                summaries.update({
                    'LOSS/alpha_loss': alpha_loss,
                    'Statistics/q1_entropy': q1_entropy
                })
            return (td_error1 + td_error2) / 2, summaries
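
For reference, the TD target built in `train` above combines a clipped double-Q maximum with an entropy bonus of the Boltzmann policy induced by Q1. The following is a minimal NumPy sketch of that computation; the function name and the omission of the `1e-8` offset are choices made here for illustration, not part of the class.

import numpy as np

def soft_q_target(q1_next, q2_next, r, done, alpha, gamma=0.99):
    """Entropy-regularised TD target as built in the `train` method above.

    q1_next, q2_next: [B, A] target-network Q values at the next state
    r, done:          [B, 1] reward and terminal flag
    """
    z = q1_next / alpha
    z = z - z.max(axis=1, keepdims=True)           # stabilised log-softmax
    logp = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    # batch-averaged entropy of the Boltzmann policy softmax(Q1 / alpha)
    entropy = -np.mean(np.sum(np.exp(logp) * logp, axis=1, keepdims=True))
    v_next = np.minimum(q1_next.max(axis=1, keepdims=True),
                        q2_next.max(axis=1, keepdims=True)) + alpha * entropy
    return r + gamma * (1.0 - done) * v_next       # [B, 1]

A larger alpha both flattens the sampling distribution in `_get_action` and increases the entropy bonus added to the target, which is why the auto-tuned `log_alpha` is driven toward `target_entropy` in the training step above.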