class MATD3(Policy):
    def __init__(self,
                 s_dim,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 n=1,
                 i=0,
                 hidden_units={
                     'actor': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        assert is_continuous, 'matd3 only supports continuous action spaces'
        raise Exception('The MA series of algorithms is broken and has not been fixed yet.')
        super().__init__(s_dim=s_dim,
                         visual_sources=0,
                         visual_resolution=0,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.n = n
        self.i = i
        self.ployak = ployak
        # self.action_noise = rls.NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
        self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.a_dim),
            sigma=0.2 * np.ones(self.a_dim))

        def _actor_net():
            return rls.actor_dpg(self.s_dim, 0, self.a_dim, hidden_units['actor'])

        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()

        def _q_net():
            return rls.critic_q_one(self.s_dim * self.n, 0, self.a_dim * self.n, hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.update_target_net_weights(
            self.actor_target_net.weights + self.critic_target_net.weights,
            self.actor_net.weights + self.critic_net.weights)
        self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])
        self.model_recorder(
            dict(actor=self.actor_net,
                 critic_net=self.critic_net,
                 optimizer_critic=self.optimizer_critic,
                 optimizer_actor=self.optimizer_actor))
        self.recorder.logger.info(self.action_noise)

    def show_logo(self):
        # Plain-text banner; the decorative ASCII-art logo could not be
        # recovered from the flattened source.
        self.recorder.logger.info('''
    MATD3
        ''')

    def choose_action(self, s, evaluation=False):
        return self._get_action(s, evaluation).numpy()

    def get_target_action(self, s):
        return self._get_target_action(s).numpy()

    @tf.function
    def _get_action(self, vector_input, evaluation):
        vector_input = self.cast(vector_input)
        with tf.device(self.device):
            mu = self.actor_net(vector_input)
            if evaluation:
                return mu
            else:
                return tf.clip_by_value(mu + self.action_noise(), -1, 1)

    @tf.function
    def _get_target_action(self, vector_input):
        vector_input = self.cast(vector_input)
        with tf.device(self.device):
            target_mu = self.actor_target_net(vector_input)
        return tf.clip_by_value(target_mu + self.action_noise(), -1, 1)

    def learn(self, episode, ap, al, ss, ss_, aa, aa_, s, r):
        ap, al, ss, ss_, aa, aa_, s, r = map(self.data_convert,
                                             (ap, al, ss, ss_, aa, aa_, s, r))
        summaries = self.train(ap, al, ss, ss_, aa, aa_, s, r)
        self.update_target_net_weights(
            self.actor_target_net.weights + self.critic_target_net.weights,
            self.actor_net.weights + self.critic_net.weights,
            self.ployak)
        summaries.update(
            dict([['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                  ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)]]))
        self.write_training_summaries(self.global_step, summaries)

    @tf.function(experimental_relax_shapes=True)
    def train(self, q_actor_a_previous, q_actor_a_later, ss, ss_, aa, aa_, s, r):
        with tf.device(self.device):
            for _ in range(2):  # delayed policy update: two critic steps per actor step
                with tf.GradientTape() as tape:
                    q1, q2 = self.critic_net(ss, aa)
                    q_target = self.critic_target_net.get_min(ss_, aa_)
                    dc_r = tf.stop_gradient(r + self.gamma * q_target)
                    td_error1 = q1 - dc_r
                    td_error2 = q2 - dc_r
                    q1_loss = tf.reduce_mean(tf.square(td_error1))
                    q2_loss = tf.reduce_mean(tf.square(td_error2))
                    critic_loss = 0.5 * (q1_loss + q2_loss)
                critic_grads = tape.gradient(critic_loss, self.critic_net.trainable_variables)
                self.optimizer_critic.apply_gradients(
                    zip(critic_grads, self.critic_net.trainable_variables))
            with tf.GradientTape() as tape:
                mu = self.actor_net(s)
                mumu = tf.concat((q_actor_a_previous, mu, q_actor_a_later), axis=1)
                q1_actor = self.critic_net.Q1(ss, mumu)
                actor_loss = -tf.reduce_mean(q1_actor)
            actor_grads = tape.gradient(actor_loss, self.actor_net.trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_net.trainable_variables))
            self.global_step.assign_add(1)
            return dict([['LOSS/actor_loss', actor_loss],
                         ['LOSS/q1_loss', q1_loss],
                         ['LOSS/q2_loss', q2_loss],
                         ['LOSS/critic_loss', critic_loss]])

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, q_actor_a_previous, q_actor_a_later, ss, ss_, aa, aa_, s, r):
        with tf.device(self.device):
            for _ in range(2):
                with tf.GradientTape(persistent=True) as tape:
                    mu = self.actor_net(s)
                    mumu = tf.concat((q_actor_a_previous, mu, q_actor_a_later), axis=1)
                    q1, q2 = self.critic_net(ss, aa)
                    q_target = self.critic_target_net.get_min(ss_, aa_)
                    q1_actor = self.critic_net.Q1(ss, mumu)
                    dc_r = tf.stop_gradient(r + self.gamma * q_target)
                    td_error1 = q1 - dc_r
                    td_error2 = q2 - dc_r
                    q1_loss = tf.reduce_mean(tf.square(td_error1))
                    q2_loss = tf.reduce_mean(tf.square(td_error2))
                    critic_loss = 0.5 * (q1_loss + q2_loss)
                    actor_loss = -tf.reduce_mean(q1_actor)
                critic_grads = tape.gradient(critic_loss, self.critic_net.trainable_variables)
                self.optimizer_critic.apply_gradients(
                    zip(critic_grads, self.critic_net.trainable_variables))
                actor_grads = tape.gradient(actor_loss, self.actor_net.trainable_variables)
                self.optimizer_actor.apply_gradients(
                    zip(actor_grads, self.actor_net.trainable_variables))
            self.global_step.assign_add(1)
            return dict([['LOSS/actor_loss', actor_loss],
                         ['LOSS/q1_loss', q1_loss],
                         ['LOSS/q2_loss', q2_loss],
                         ['LOSS/critic_loss', critic_loss]])
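
# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the soft "polyak"
# target-network update that `update_target_net_weights` is assumed to perform
# throughout this module, inferred from how `ployak` is passed above. The
# helper name is hypothetical.
def _polyak_update_sketch(target_weights, online_weights, ployak=0.995):
    # target <- ployak * target + (1 - ployak) * online, element-wise.
    # Called without a ployak coefficient (as in the constructors above),
    # the real helper presumably does a hard copy instead.
    for t, o in zip(target_weights, online_weights):
        t.assign(ployak * t + (1 - ployak) * o)
# ----------------------------------------------------------------------------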
class TD3(make_off_policy_class(mode='share')):
    '''
    Twin Delayed Deep Deterministic Policy Gradient, https://arxiv.org/abs/1802.09477
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 delay_num=2,
                 noise_type='gaussian',
                 gaussian_noise_sigma=0.2,
                 gaussian_noise_bound=0.2,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 hidden_units={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.ployak = ployak
        self.delay_num = delay_num
        self.discrete_tau = discrete_tau
        self.gaussian_noise_sigma = gaussian_noise_sigma
        self.gaussian_noise_bound = gaussian_noise_bound
        if self.is_continuous:
            def _actor_net():
                return rls.actor_dpg(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            if noise_type == 'gaussian':
                self.action_noise = rls.ClippedNormalActionNoise(
                    mu=np.zeros(self.a_dim),
                    sigma=self.gaussian_noise_sigma * np.ones(self.a_dim),
                    bound=self.gaussian_noise_bound)
            elif noise_type == 'ou':
                self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(self.a_dim),
                    sigma=0.2 * np.ones(self.a_dim))
        else:
            def _actor_net():
                return rls.actor_discrete(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()
        self.actor_tv = self.actor_net.trainable_variables

        def _q_net():
            return rls.critic_q_one(self.feat_dim, self.a_dim, hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        self.update_target_net_weights(
            self.actor_target_net.weights + self.critic_target_net.weights,
            self.actor_net.weights + self.critic_net.weights)
        self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])
        self.model_recorder(
            dict(actor=self.actor_net,
                 critic_net=self.critic_net,
                 optimizer_actor=self.optimizer_actor,
                 optimizer_critic=self.optimizer_critic))

    def show_logo(self):
        # Plain-text banner; the decorative ASCII-art logo could not be
        # recovered from the flattened source.
        self.recorder.logger.info('''
    TD3
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        mu, pi, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            if self.is_continuous:
                mu = self.actor_net(feat)
                pi = tf.clip_by_value(mu + self.action_noise(), -1, 1)
            else:
                logits = self.actor_net(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(kwargs['step']):
            self._learn(
                function_dict={
                    'train_function': self.train,
                    'update_function': lambda: self.update_target_net_weights(
                        self.actor_target_net.weights + self.critic_target_net.weights,
                        self.actor_net.weights + self.critic_net.weights,
                        self.ployak),
                    'summary_dict': dict([
                        ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                        ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)]])
                })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            for _ in range(self.delay_num):
                with tf.GradientTape() as tape:
                    feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                    if self.is_continuous:
                        target_mu = self.actor_target_net(feat_)
                        action_target = tf.clip_by_value(target_mu + self.action_noise(), -1, 1)
                    else:
                        target_logits = self.actor_target_net(feat_)
                        logp_all = tf.nn.log_softmax(target_logits)
                        gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]),
                                               dtype=tf.float32)
                        _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau)
                        _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                        _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                        action_target = _pi_diff + _pi
                    q1, q2 = self.critic_net(feat, a)
                    q_target = self.critic_target_net.get_min(feat_, action_target)
                    dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                    td_error1 = q1 - dc_r
                    td_error2 = q2 - dc_r
                    q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                    q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                    critic_loss = 0.5 * (q1_loss + q2_loss) + crsty_loss
                critic_grads = tape.gradient(critic_loss, self.critic_tv)
                self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                else:
                    logits = self.actor_net(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1),
                                                  self.a_dim, dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q1_actor = self.critic_net.Q1(feat, mu)
                actor_loss = -tf.reduce_mean(q1_actor)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return (td_error1 + td_error2) / 2, dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))],
                ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))],
                ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))],
            ])

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            for _ in range(2):
                with tf.GradientTape(persistent=True) as tape:
                    feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                    if self.is_continuous:
                        target_mu = self.actor_target_net(feat_)
                        action_target = tf.clip_by_value(target_mu + self.action_noise(), -1, 1)
                        mu = self.actor_net(feat)
                    else:
                        target_logits = self.actor_target_net(feat_)
                        logp_all = tf.nn.log_softmax(target_logits)
                        gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]),
                                               dtype=tf.float32)
                        _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau)
                        _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                        _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                        action_target = _pi_diff + _pi
                        logits = self.actor_net(feat)
                        _pi = tf.nn.softmax(logits)
                        _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1),
                                                      self.a_dim, dtype=tf.float32)
                        _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                        mu = _pi_diff + _pi
                    q1, q2 = self.critic_net(feat, a)
                    q_target = self.critic_target_net.get_min(feat_, action_target)
                    q1_actor = self.critic_net.Q1(feat, mu)
                    dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                    td_error1 = q1 - dc_r
                    td_error2 = q2 - dc_r
                    q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                    q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                    critic_loss = 0.5 * (q1_loss + q2_loss) + crsty_loss
                    actor_loss = -tf.reduce_mean(q1_actor)
                critic_grads = tape.gradient(critic_loss, self.critic_tv)
                self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
                actor_grads = tape.gradient(actor_loss, self.actor_tv)
                self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return (td_error1 + td_error2) / 2, dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))],
                ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))],
                ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))]])
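
# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the straight-through
# Gumbel-Softmax trick that TD3's discrete branch uses above (`_pi_diff + _pi`)
# to back-propagate through a discrete action choice. Standalone under the
# same tf/tfp imports this module already relies on; the helper name is
# hypothetical.
def _st_gumbel_softmax_sketch(logits, tau=1.0):
    # Perturb log-probabilities with Gumbel(0, 1) noise, relax the argmax into
    # a temperature-tau softmax, then snap to a one-hot in the forward pass
    # while keeping the relaxed gradient (straight-through estimator).
    gumbel = tfp.distributions.Gumbel(0., 1.).sample(tf.shape(logits))
    logp_all = tf.nn.log_softmax(logits)
    soft = tf.nn.softmax((logp_all + tf.cast(gumbel, tf.float32)) / tau)
    hard = tf.one_hot(tf.argmax(soft, axis=-1), tf.shape(logits)[-1])
    return tf.stop_gradient(hard - soft) + soft
# ----------------------------------------------------------------------------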
class SAC_V(make_off_policy_class(mode='share')):
    """
    Soft Actor Critic with Value neural network. https://arxiv.org/abs/1812.05905
    Soft Actor-Critic for Discrete Action Settings. https://arxiv.org/abs/1910.07207
    """

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 alpha=0.2,
                 annealing=True,
                 last_alpha=0.01,
                 ployak=0.995,
                 use_gumbel=True,
                 discrete_tau=1.0,
                 log_std_bound=[-20, 2],
                 hidden_units={
                     'actor_continuous': {
                         'share': [128, 128],
                         'mu': [64],
                         'log_std': [64]
                     },
                     'actor_discrete': [64, 32],
                     'q': [128, 128],
                     'v': [128, 128]
                 },
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.ployak = ployak
        self.use_gumbel = use_gumbel
        self.discrete_tau = discrete_tau
        self.log_std_min, self.log_std_max = log_std_bound[:]
        self.auto_adaption = auto_adaption
        self.annealing = annealing
        if self.auto_adaption:
            self.log_alpha = tf.Variable(initial_value=0.0,
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=True)
        else:
            self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha),
                                         name='log_alpha',
                                         dtype=tf.float32,
                                         trainable=False)
            if self.annealing:
                self.alpha_annealing = LinearAnnealing(alpha, last_alpha, 1e6)
        if self.is_continuous:
            self.actor_net = rls.actor_continuous(self.feat_dim, self.a_dim,
                                                  hidden_units['actor_continuous'])
        else:
            self.actor_net = rls.actor_discrete(self.feat_dim, self.a_dim,
                                                hidden_units['actor_discrete'])
            if self.use_gumbel:
                self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
        self.actor_tv = self.actor_net.trainable_variables
        # entropy = -log(1/|A|) = log |A|
        self.target_entropy = 0.98 * (self.a_dim if self.is_continuous else np.log(self.a_dim))
        if self.is_continuous or self.use_gumbel:
            critic_net = rls.critic_q_one
        else:
            critic_net = rls.critic_q_all
        _q_net = lambda: critic_net(self.feat_dim, self.a_dim, hidden_units['q'])
        _v_net = lambda: rls.critic_v(self.feat_dim, hidden_units['v'])
        self.q_net = DoubleQ(_q_net)
        self.v_net = _v_net()
        self.v_target_net = _v_net()
        self.critic_tv = self.q_net.trainable_variables + self.v_net.trainable_variables + self.other_tv
        self.update_target_net_weights(self.v_target_net.weights, self.v_net.weights)
        self.actor_lr, self.critic_lr, self.alpha_lr = map(
            self.init_lr, [actor_lr, critic_lr, alpha_lr])
        self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr, self.alpha_lr])
        self.model_recorder(
            dict(actor=self.actor_net,
                 q_net=self.q_net,
                 v_net=self.v_net,
                 optimizer_actor=self.optimizer_actor,
                 optimizer_critic=self.optimizer_critic,
                 optimizer_alpha=self.optimizer_alpha))

    def show_logo(self):
        # Plain-text banner; the decorative ASCII-art logo could not be
        # recovered from the flattened source.
        self.recorder.logger.info('''
    SAC_V
        ''')

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def choose_action(self, s, visual_s, evaluation=False):
        mu, pi, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            if self.is_continuous:
                mu, log_std = self.actor_net(feat)
                log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                pi, _ = squash_rsample(mu, log_std)
                mu = tf.tanh(mu)  # squash mu
            else:
                logits = self.actor_net(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi, cell_state

    def learn(self, **kwargs):
        self.episode = kwargs['episode']

        def _train(memories, isw, crsty_loss, cell_state):
            if self.is_continuous or self.use_gumbel:
                td_error, summaries = self.train(memories, isw, crsty_loss, cell_state)
            else:
                td_error, summaries = self.train_discrete(memories, isw, crsty_loss, cell_state)
            if self.annealing and not self.auto_adaption:
                self.log_alpha.assign(
                    tf.math.log(
                        tf.cast(self.alpha_annealing(self.global_step.numpy()), tf.float32)))
            return td_error, summaries

        for i in range(kwargs['step']):
            self._learn(
                function_dict={
                    'train_function': _train,
                    'update_function': lambda: self.update_target_net_weights(
                        self.v_target_net.weights, self.v_net.weights, self.ployak),
                    'summary_dict': dict([
                        ['LEARNING_RATE/actor_lr', self.actor_lr(self.episode)],
                        ['LEARNING_RATE/critic_lr', self.critic_lr(self.episode)],
                        ['LEARNING_RATE/alpha_lr', self.alpha_lr(self.episode)]])
                })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                    pi, log_pi = squash_rsample(mu, log_std)
                else:
                    logits = self.actor_net(feat)
                    cate_dist = tfp.distributions.Categorical(logits)
                    pi = cate_dist.sample()
                    log_pi = cate_dist.log_prob(pi)
                    pi = tf.one_hot(pi, self.a_dim, dtype=tf.float32)
                q1, q2 = self.q_net(feat, a)
                q_pi = self.q_net.get_min(feat, pi)
                v = self.v_net(feat)
                v_target = self.v_target_net(feat_)
                dc_r = tf.stop_gradient(r + self.gamma * v_target * (1 - done))
                v_from_q_stop = tf.stop_gradient(q_pi - self.alpha * log_pi)
                td_v = v - v_from_q_stop
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                v_loss_stop = tf.reduce_mean(tf.square(td_v) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + 0.5 * v_loss_stop + crsty_loss
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                    pi, log_pi = squash_rsample(mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi), axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                q1_pi = self.q_net.Q1(feat, pi)
                actor_loss = -tf.reduce_mean(q1_pi - self.alpha * log_pi)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    if self.is_continuous:
                        mu, log_std = self.actor_net(feat)
                        log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                        # pi, log_pi = squash_rsample(mu, log_std)
                        norm_dist = tfp.distributions.Normal(loc=mu, scale=tf.exp(log_std))
                        log_pi = tf.reduce_sum(norm_dist.log_prob(norm_dist.sample()), axis=-1)
                    else:
                        logits = self.actor_net(feat)
                        cate_dist = tfp.distributions.Categorical(logits)
                        log_pi = cate_dist.log_prob(cate_dist.sample())
                    alpha_loss = -tf.reduce_mean(
                        self.alpha * tf.stop_gradient(log_pi - self.target_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/q1_loss', q1_loss],
                ['LOSS/q2_loss', q2_loss],
                ['LOSS/v_loss', v_loss_stop],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/log_alpha', self.log_alpha],
                ['Statistics/alpha', self.alpha],
                ['Statistics/entropy', entropy],
                ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))],
                ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))],
                ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))],
                ['Statistics/v_mean', tf.reduce_mean(v)]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2, summaries

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                    pi, log_pi = squash_rsample(mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi), axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                q1, q2 = self.q_net(feat, a)
                v = self.v_net(feat)
                q1_pi, q2_pi = self.q_net(feat, pi)
                v_target = self.v_target_net(feat_)
                dc_r = tf.stop_gradient(r + self.gamma * v_target * (1 - done))
                v_from_q_stop = tf.stop_gradient(tf.minimum(q1_pi, q2_pi) - self.alpha * log_pi)
                td_v = v - v_from_q_stop
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                v_loss_stop = tf.reduce_mean(tf.square(td_v) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + 0.5 * v_loss_stop + crsty_loss
                actor_loss = -tf.reduce_mean(q1_pi - self.alpha * log_pi)
                if self.auto_adaption:
                    alpha_loss = -tf.reduce_mean(
                        self.alpha * tf.stop_gradient(log_pi - self.target_entropy))
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            if self.auto_adaption:
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/q1_loss', q1_loss],
                ['LOSS/q2_loss', q2_loss],
                ['LOSS/v_loss', v_loss_stop],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/log_alpha', self.log_alpha],
                ['Statistics/alpha', self.alpha],
                ['Statistics/entropy', entropy],
                ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))],
                ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))],
                ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))],
                ['Statistics/v_mean', tf.reduce_mean(v)]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2, summaries

    @tf.function(experimental_relax_shapes=True)
    def train_discrete(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                q1_all, q2_all = self.q_net(feat)  # [B, A]
                q_function = lambda x: tf.reduce_sum(x * a, axis=-1, keepdims=True)  # [B, 1]
                q1 = q_function(q1_all)
                q2 = q_function(q2_all)
                logits = self.actor_net(feat)  # [B, A]
                logp_all = tf.nn.log_softmax(logits)  # [B, A]
                v = self.v_net(feat)  # [B, 1]
                v_target = self.v_target_net(feat_)  # [B, 1]
                dc_r = tf.stop_gradient(r + self.gamma * v_target * (1 - done))
                # keepdims=True keeps the expected Q at [B, 1] so it broadcasts
                # against v; without it the subtraction would blow up to [B, B].
                td_v = v - tf.stop_gradient(
                    tf.minimum(
                        tf.reduce_sum(tf.exp(logp_all) * q1_all, axis=-1, keepdims=True),
                        tf.reduce_sum(tf.exp(logp_all) * q2_all, axis=-1, keepdims=True)))
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                v_loss_stop = tf.reduce_mean(tf.square(td_v) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + 0.5 * v_loss_stop + crsty_loss
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            with tf.GradientTape() as tape:
                logits = self.actor_net(feat)
                logp_all = tf.nn.log_softmax(logits)
                entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                q_all = self.q_net.get_min(feat)  # [B, A]
                actor_loss = -tf.reduce_mean(
                    tf.reduce_sum((q_all - self.alpha * logp_all) * tf.exp(logp_all),
                                  axis=-1))  # [B, A] => [B,]
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    entropy = -tf.reduce_sum(tf.exp(logp_all) * logp_all,
                                             axis=1, keepdims=True)  # [B, 1]
                    corr = tf.stop_gradient(-entropy - self.target_entropy)
                    # corr = tf.stop_gradient(tf.reduce_sum((logp_all - self.a_dim) * tf.exp(logp_all), axis=-1))  # [B, A] => [B,]
                    alpha_loss = -tf.reduce_mean(self.alpha * corr)
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/q1_loss', q1_loss],
                ['LOSS/q2_loss', q2_loss],
                ['LOSS/v_loss', v_loss_stop],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/log_alpha', self.log_alpha],
                ['Statistics/alpha', self.alpha],
                ['Statistics/entropy', tf.reduce_mean(entropy)],
                ['Statistics/v_mean', tf.reduce_mean(v)]])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2, summaries
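
# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the soft value target
# that SAC_V's critic update regresses V(s) onto, i.e.
#     V(s) ~ E_{a~pi}[ min(Q1(s,a), Q2(s,a)) - alpha * log pi(a|s) ],
# matching `v_from_q_stop` in `train`/`train_persistent` above. The helper
# name and argument names are hypothetical.
def _soft_value_target_sketch(q1_pi, q2_pi, log_pi, alpha):
    # q1_pi, q2_pi: twin-critic values for actions sampled from the current policy
    # log_pi: log-probabilities of those sampled actions
    return tf.stop_gradient(tf.minimum(q1_pi, q2_pi) - alpha * log_pi)
# ----------------------------------------------------------------------------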
class HIRO(make_off_policy_class(mode='no_share')):
    '''
    Data-Efficient Hierarchical Reinforcement Learning, http://arxiv.org/abs/1805.08296
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 high_scale=1.0,
                 reward_scale=1.0,
                 sample_g_nums=100,
                 sub_goal_steps=10,
                 fn_goal_dim=0,
                 intrinsic_reward_mode='os',
                 high_batch_size=256,
                 high_buffer_size=100000,
                 low_batch_size=8,
                 low_buffer_size=10000,
                 high_actor_lr=1.0e-4,
                 high_critic_lr=1.0e-3,
                 low_actor_lr=1.0e-4,
                 low_critic_lr=1.0e-3,
                 hidden_units={
                     'high_actor': [64, 64],
                     'high_critic': [64, 64],
                     'low_actor': [64, 64],
                     'low_critic': [64, 64]
                 },
                 **kwargs):
        assert visual_sources == 0, 'HIRO doesn\'t support visual inputs.'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.data_high = ExperienceReplay(high_batch_size, high_buffer_size)
        self.data_low = ExperienceReplay(low_batch_size, low_buffer_size)
        self.ployak = ployak
        self.high_scale = np.array(
            high_scale if isinstance(high_scale, list) else [high_scale] * self.s_dim,
            dtype=np.float32)
        self.reward_scale = reward_scale
        self.fn_goal_dim = fn_goal_dim
        self.sample_g_nums = sample_g_nums
        self.sub_goal_steps = sub_goal_steps
        self.sub_goal_dim = self.s_dim - self.fn_goal_dim
        self.high_noise = rls.ClippedNormalActionNoise(
            mu=np.zeros(self.sub_goal_dim),
            sigma=self.high_scale * np.ones(self.sub_goal_dim),
            bound=self.high_scale / 2)
        self.low_noise = rls.ClippedNormalActionNoise(
            mu=np.zeros(self.a_dim),
            sigma=1.0 * np.ones(self.a_dim),
            bound=0.5)

        _high_actor_net = lambda: rls.actor_dpg(self.s_dim, self.sub_goal_dim,
                                                hidden_units['high_actor'])
        if self.is_continuous:
            _low_actor_net = lambda: rls.actor_dpg(self.s_dim + self.sub_goal_dim,
                                                   self.a_dim, hidden_units['low_actor'])
        else:
            _low_actor_net = lambda: rls.actor_discrete(self.s_dim + self.sub_goal_dim,
                                                        self.a_dim, hidden_units['low_actor'])
            self.gumbel_dist = tfd.Gumbel(0, 1)

        self.high_actor = _high_actor_net()
        self.high_actor_target = _high_actor_net()
        self.low_actor = _low_actor_net()
        self.low_actor_target = _low_actor_net()

        _high_critic_net = lambda: rls.critic_q_one(self.s_dim, self.sub_goal_dim,
                                                    hidden_units['high_critic'])
        _low_critic_net = lambda: rls.critic_q_one(self.s_dim + self.sub_goal_dim,
                                                   self.a_dim, hidden_units['low_critic'])

        self.high_critic = DoubleQ(_high_critic_net)
        self.high_critic_target = DoubleQ(_high_critic_net)
        self.low_critic = DoubleQ(_low_critic_net)
        self.low_critic_target = DoubleQ(_low_critic_net)

        self.update_target_net_weights(
            self.low_actor_target.weights + self.low_critic_target.weights +
            self.high_actor_target.weights + self.high_critic_target.weights,
            self.low_actor.weights + self.low_critic.weights +
            self.high_actor.weights + self.high_critic.weights)

        self.low_actor_lr, self.low_critic_lr = map(
            self.init_lr, [low_actor_lr, low_critic_lr])
        self.high_actor_lr, self.high_critic_lr = map(
            self.init_lr, [high_actor_lr, high_critic_lr])
        self.low_actor_optimizer, self.low_critic_optimizer = map(
            self.init_optimizer, [self.low_actor_lr, self.low_critic_lr])
        self.high_actor_optimizer, self.high_critic_optimizer = map(
            self.init_optimizer, [self.high_actor_lr, self.high_critic_lr])

        self.model_recorder(
            dict(high_actor=self.high_actor,
                 high_critic=self.high_critic,
                 low_actor=self.low_actor,
                 low_critic=self.low_critic,
                 low_actor_optimizer=self.low_actor_optimizer,
                 low_critic_optimizer=self.low_critic_optimizer,
                 high_actor_optimizer=self.high_actor_optimizer,
                 high_critic_optimizer=self.high_critic_optimizer))

        self.counts = 0
        self._high_s = [[] for _ in range(self.n_agents)]
        self._noop_subgoal = np.random.uniform(-self.high_scale, self.high_scale,
                                               size=(self.n_agents, self.sub_goal_dim))
        self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode)

    def generate_ir_func(self, mode='os'):
        if mode == 'os':
            return lambda last_feat, subgoal, feat: -tf.norm(
                last_feat + subgoal - feat, ord=2, axis=-1, keepdims=True)
        elif mode == 'cos':
            return lambda last_feat, subgoal, feat: tf.expand_dims(
                -tf.keras.losses.cosine_similarity(
                    tf.cast(feat - last_feat, tf.float32),
                    tf.cast(subgoal, tf.float32),
                    axis=-1),
                axis=-1)

    def show_logo(self):
        # Plain-text banner; the decorative ASCII-art logo could not be
        # recovered from the flattened source.
        self.recorder.logger.info('''
    HIRO
        ''')

    def store_high_buffer(self, i):
        eps_len = len(self._high_s[i])
        intervals = list(range(0, eps_len, self.sub_goal_steps))
        if len(intervals) < 1:
            return
        left = intervals[:-1]
        right = intervals[1:]
        s, r, a, g, d, s_ = [], [], [], [], [], []
        for _l, _r in zip(left, right):
            s.append(self._high_s[i][_l:_r])
            r.append(sum(self._high_r[i][_l:_r]) * self.reward_scale)
            a.append(self._high_a[i][_l:_r])
            g.append(self._subgoals[i][_l])
            d.append(self._done[i][_r - 1])
            s_.append(self._high_s_[i][_r - 1])
        right = intervals[-1]
        s.append(self._high_s[i][right:eps_len] +
                 [self._high_s[i][-1]] * (self.sub_goal_steps + right - eps_len))
        r.append(sum(self._high_r[i][right:eps_len]))
        a.append(self._high_a[i][right:eps_len] +
                 [self._high_a[i][-1]] * (self.sub_goal_steps + right - eps_len))
        g.append(self._subgoals[i][right])
        d.append(self._done[i][-1])
        s_.append(self._high_s_[i][-1])
        self.data_high.add(np.array(s),
                           np.array(r)[:, np.newaxis],
                           np.array(a),
                           np.array(g),
                           np.array(d)[:, np.newaxis],
                           np.array(s_))

    def reset(self):
        self._c = np.full((self.n_agents, 1), self.sub_goal_steps, np.int32)
        for i in range(self.n_agents):
            self.store_high_buffer(i)
        self._high_r = [[] for _ in range(self.n_agents)]
        self._high_a = [[] for _ in range(self.n_agents)]
        self._high_s = [[] for _ in range(self.n_agents)]
        self._subgoals = [[] for _ in range(self.n_agents)]
        self._done = [[] for _ in range(self.n_agents)]
        self._high_s_ = [[] for _ in range(self.n_agents)]
        self._new_subgoal = np.zeros((self.n_agents, self.sub_goal_dim), dtype=np.float32)

    def partial_reset(self, done):
        self._c = np.where(
            done[:, np.newaxis],
            np.full((self.n_agents, 1), self.sub_goal_steps, np.int32),
            self._c)
        idx = np.where(done)[0]
        for i in idx:
            self.store_high_buffer(i)
            self._high_s[i] = []
            self._high_a[i] = []
            self._high_s_[i] = []
            self._high_r[i] = []
            self._done[i] = []
            self._subgoals[i] = []

    @tf.function
    def _get_action(self, s, visual_s, subgoal):
        with tf.device(self.device):
            feat = tf.concat([s, subgoal], axis=-1)
            if self.is_continuous:
                mu = self.low_actor(feat)
                pi = tf.clip_by_value(mu + self.low_noise(), -1, 1)
            else:
                logits = self.low_actor(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfd.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi

    def choose_action(self, s, visual_s, evaluation=False):
        self._subgoal = np.where(self._c == self.sub_goal_steps,
                                 self.get_subgoal(s).numpy(),
                                 self._new_subgoal)
        mu, pi = self._get_action(s, visual_s, self._subgoal)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def get_subgoal(self, s):
        '''
        s: the current state; returns a freshly sampled subgoal from the
        high-level actor, perturbed by noise and clipped to the subgoal range.
        '''
        new_subgoal = self.high_scale * self.high_actor(s)
        new_subgoal = tf.clip_by_value(new_subgoal + self.high_noise(),
                                       -self.high_scale, self.high_scale)
        return new_subgoal

    def learn(self, **kwargs):
        self.episode = kwargs['episode']
        for i in range(kwargs['step']):
            if self.data_low.is_lg_batch_size and self.data_high.is_lg_batch_size:
                self.intermediate_variable_reset()
                low_data = self.get_transitions(
                    self.data_low,
                    data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'])
                high_data = self.get_transitions(
                    self.data_high,
                    data_name_list=['s', 'r', 'a', 'g', 'done', 's_'])

                # gather the arguments that will be passed to the train functions
                _low_training_data = self.get_value_from_dict(
                    data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'],
                    data_dict=low_data)
                _high_training_data = self.get_value_from_dict(
                    data_name_list=['s', 'r', 'a', 'g', 'done', 's_'],
                    data_dict=high_data)
                summaries = self.train_low(_low_training_data)

                self.summaries.update(summaries)
                self.update_target_net_weights(
                    self.low_actor_target.weights + self.low_critic_target.weights,
                    self.low_actor.weights + self.low_critic.weights,
                    self.ployak)
                if self.counts % self.sub_goal_steps == 0:
                    self.counts = 0
                    high_summaries = self.train_high(_high_training_data)
                    self.summaries.update(high_summaries)
                    self.update_target_net_weights(
                        self.high_actor_target.weights + self.high_critic_target.weights,
                        self.high_actor.weights + self.high_critic.weights,
                        self.ployak)
                self.counts += 1
                self.summaries.update(
                    dict([['LEARNING_RATE/low_actor_lr', self.low_actor_lr(self.episode)],
                          ['LEARNING_RATE/low_critic_lr', self.low_critic_lr(self.episode)],
                          ['LEARNING_RATE/high_actor_lr', self.high_actor_lr(self.episode)],
                          ['LEARNING_RATE/high_critic_lr', self.high_critic_lr(self.episode)]]))
                self.write_training_summaries(self.global_step, self.summaries)

    @tf.function(experimental_relax_shapes=True)
    def train_low(self, memories):
        s, a, r, s_, done, g, g_ = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = tf.concat([s, g], axis=-1)
                feat_ = tf.concat([s_, g_], axis=-1)
                if self.is_continuous:
                    target_mu = self.low_actor_target(feat_)
                    action_target = tf.clip_by_value(target_mu + self.low_noise(), -1, 1)
                else:
                    target_logits = self.low_actor_target(feat_)
                    logp_all = tf.nn.log_softmax(target_logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample([tf.shape(feat_)[0], self.a_dim]),
                                           dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / 1.)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    action_target = _pi_diff + _pi
                q1, q2 = self.low_critic(feat, a)
                q = tf.minimum(q1, q2)
                q_target = self.low_critic_target.get_min(feat_, action_target)
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1))
                q2_loss = tf.reduce_mean(tf.square(td_error2))
                low_critic_loss = q1_loss + q2_loss
            low_critic_grads = tape.gradient(low_critic_loss, self.low_critic.weights)
            self.low_critic_optimizer.apply_gradients(
                zip(low_critic_grads, self.low_critic.weights))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.low_actor(feat)
                else:
                    logits = self.low_actor(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1),
                                                  self.a_dim, dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q_actor = self.low_critic.Q1(feat, mu)
                low_actor_loss = -tf.reduce_mean(q_actor)
            low_actor_grads = tape.gradient(low_actor_loss, self.low_actor.trainable_variables)
            self.low_actor_optimizer.apply_gradients(
                zip(low_actor_grads, self.low_actor.trainable_variables))
            self.global_step.assign_add(1)
            return dict([['LOSS/low_actor_loss', low_actor_loss],
                         ['LOSS/low_critic_loss', low_critic_loss],
                         ['Statistics/low_q_min', tf.reduce_min(q)],
                         ['Statistics/low_q_mean', tf.reduce_mean(q)],
                         ['Statistics/low_q_max', tf.reduce_max(q)]])

    @tf.function(experimental_relax_shapes=True)
    def train_high(self, memories):
        # s_ : [B, N]
        ss, r, aa, g, done, s_ = memories
        batchs = tf.shape(ss)[0]
        # ss, aa [B, T, *]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                s = ss[:, 0]  # [B, N]
                true_end = (s_ - s)[:, self.fn_goal_dim:]
                g_dist = tfd.Normal(loc=true_end, scale=0.5 * self.high_scale[None, :])
                ss = tf.expand_dims(ss, 0)  # [1, B, T, *]
                ss = tf.tile(ss, [self.sample_g_nums, 1, 1, 1])  # [10, B, T, *]
                ss = tf.reshape(ss, [-1, tf.shape(ss)[-1]])  # [10*B*T, *]
                aa = tf.expand_dims(aa, 0)  # [1, B, T, *]
                aa = tf.tile(aa, [self.sample_g_nums, 1, 1, 1])  # [10, B, T, *]
                aa = tf.reshape(aa, [-1, tf.shape(aa)[-1]])  # [10*B*T, *]
                gs = tf.concat([
                    tf.expand_dims(g, 0),
                    tf.expand_dims(true_end, 0),
                    tf.clip_by_value(g_dist.sample(self.sample_g_nums - 2),
                                     -self.high_scale, self.high_scale)
                ], axis=0)  # [10, B, N]
                all_g = gs + s[:, self.fn_goal_dim:]
                all_g = tf.expand_dims(all_g, 2)  # [10, B, 1, N]
                all_g = tf.tile(all_g, [1, 1, self.sub_goal_steps, 1])  # [10, B, T, N]
                all_g = tf.reshape(all_g, [-1, tf.shape(all_g)[-1]])  # [10*B*T, N]
                all_g = all_g - ss[:, self.fn_goal_dim:]  # [10*B*T, N]
                feat = tf.concat([ss, all_g], axis=-1)  # [10*B*T, *]
                _aa = self.low_actor(feat)  # [10*B*T, A]
                if not self.is_continuous:
                    _aa = tf.one_hot(tf.argmax(_aa, axis=-1), self.a_dim, dtype=tf.float32)
                diff = _aa - aa
                diff = tf.reshape(diff, [self.sample_g_nums, batchs,
                                         self.sub_goal_steps, -1])  # [10, B, T, A]
                diff = tf.transpose(diff, [1, 0, 2, 3])  # [B, 10, T, A]
                logps = -0.5 * tf.reduce_sum(tf.norm(diff, ord=2, axis=-1)**2, axis=-1)  # [B, 10]
                idx = tf.argmax(logps, axis=-1, output_type=tf.int32)
                idx = tf.stack([tf.range(batchs), idx], axis=1)  # [B, 2]
                g = tf.gather_nd(tf.transpose(gs, [1, 0, 2]), idx)  # [B, N]
                q1, q2 = self.high_critic(s, g)
                q = tf.minimum(q1, q2)
                target_sub_goal = self.high_actor_target(s_) * self.high_scale
                target_sub_goal = tf.clip_by_value(target_sub_goal + self.high_noise(),
                                                   -self.high_scale, self.high_scale)
                q_target = self.high_critic_target.get_min(s_, target_sub_goal)
                dc_r = tf.stop_gradient(r + self.gamma * (1 - done) * q_target)
                td_error1 = q1 - dc_r
                td_error2 = q2 - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1))
                q2_loss = tf.reduce_mean(tf.square(td_error2))
                high_critic_loss = q1_loss + q2_loss
            high_critic_grads = tape.gradient(high_critic_loss, self.high_critic.weights)
            self.high_critic_optimizer.apply_gradients(
                zip(high_critic_grads, self.high_critic.weights))
            with tf.GradientTape() as tape:
                mu = self.high_actor(s) * self.high_scale
                q_actor = self.high_critic.Q1(s, mu)
                high_actor_loss = -tf.reduce_mean(q_actor)
            high_actor_grads = tape.gradient(high_actor_loss, self.high_actor.trainable_variables)
            self.high_actor_optimizer.apply_gradients(
                zip(high_actor_grads, self.high_actor.trainable_variables))
            return dict([['LOSS/high_actor_loss', high_actor_loss],
                         ['LOSS/high_critic_loss', high_critic_loss],
                         ['Statistics/high_q_min', tf.reduce_min(q)],
                         ['Statistics/high_q_mean', tf.reduce_mean(q)],
                         ['Statistics/high_q_max', tf.reduce_max(q)]])

    def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "store requires action to be np.ndarray"
        assert isinstance(r, np.ndarray), "store requires reward to be np.ndarray"
        assert isinstance(done, np.ndarray), "store requires done to be np.ndarray"
        [o.append(_s) for o, _s in zip(self._high_s, s)]
        [o.append(_a) for o, _a in zip(self._high_a, a)]
        [o.append(_r) for o, _r in zip(self._high_r, r)]
        [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)]
        [o.append(_d) for o, _d in zip(self._done, done)]
        [o.append(_subgoal) for o, _subgoal in zip(self._subgoals, self._noop_subgoal)]
        ir = self.get_ir(s[:, self.fn_goal_dim:], self._noop_subgoal, s_[:, self.fn_goal_dim:])
        # subgoal = s[:, self.fn_goal_dim:] + self._noop_subgoal - s_[:, self.fn_goal_dim:]
        subgoal = np.random.uniform(-self.high_scale, self.high_scale,
                                    size=(self.n_agents, self.sub_goal_dim))
        self.data_low.add(
            s, a, ir, s_,
            done[:, np.newaxis],  # add an axis
            self._noop_subgoal,
            subgoal)
        self._noop_subgoal = subgoal

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        """
        For off-policy training, use this function to store <s, a, r, s_, done>
        into the ReplayBuffer.
        """
        assert isinstance(a, np.ndarray), "store requires action to be np.ndarray"
        assert isinstance(r, np.ndarray), "store requires reward to be np.ndarray"
        assert isinstance(done, np.ndarray), "store requires done to be np.ndarray"
        [o.append(_s) for o, _s in zip(self._high_s, s)]
        [o.append(_a) for o, _a in zip(self._high_a, a)]
        [o.append(_r) for o, _r in zip(self._high_r, r)]
        [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)]
        [o.append(_d) for o, _d in zip(self._done, done)]
        [o.append(_subgoal) for o, _subgoal in zip(self._subgoals, self._subgoal)]
        ir = self.get_ir(s[:, self.fn_goal_dim:], self._subgoal, s_[:, self.fn_goal_dim:])
        self._new_subgoal = np.where(
            self._c == 1,
            self.get_subgoal(s_).numpy(),
            s[:, self.fn_goal_dim:] + self._subgoal - s_[:, self.fn_goal_dim:])
        self.data_low.add(
            s, a, ir, s_,
            done[:, np.newaxis],  # add an axis
            self._subgoal,
            self._new_subgoal)
        self._c = np.where(
            self._c == 1,
            np.full((self.n_agents, 1), self.sub_goal_steps, np.int32),
            self._c - 1)

    def get_transitions(self, databuffer, data_name_list=['s', 'a', 'r', 's_', 'done']):
        '''
        TODO: Annotation
        '''
        data = databuffer.sample()  # sample from the replay buffer
        if not self.is_continuous and 'a' in data_name_list:
            a_idx = data_name_list.index('a')
            a = data[a_idx].astype(np.int32)
            pre_shape = a.shape
            a = a.reshape(-1)
            a = sth.int2one_hot(a, self.a_dim)
            a = a.reshape(pre_shape + (-1, ))
            data[a_idx] = a
        return dict([[n, d] for n, d in zip(data_name_list,
                                            list(map(self.data_convert, data)))])
class MAXSQN(make_off_policy_class(mode='share')):
    '''
    https://github.com/createamind/DRL/blob/master/spinup/algos/maxsqn/maxsqn.py
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 alpha=0.2,
                 beta=0.1,
                 ployak=0.995,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 use_epsilon=False,
                 q_lr=5.0e-4,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 hidden_units=[32, 32],
                 **kwargs):
        assert not is_continuous, 'maxsqn only supports discrete action spaces'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.use_epsilon = use_epsilon
        self.ployak = ployak
        # store log(alpha) in both branches so that the `alpha` property,
        # tf.exp(log_alpha), yields the configured alpha.
        self.log_alpha = tf.math.log(alpha) if not auto_adaption else tf.Variable(
            initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True)
        self.auto_adaption = auto_adaption
        self.target_entropy = beta * np.log(self.a_dim)

        def _q_net():
            return rls.critic_q_all(self.feat_dim, self.a_dim, hidden_units)

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        self.update_target_net_weights(self.critic_target_net.weights,
                                       self.critic_net.weights)
        self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
        self.optimizer_critic, self.optimizer_alpha = map(
            self.init_optimizer, [self.q_lr, self.alpha_lr])
        self.model_recorder(
            dict(critic_net=self.critic_net,
                 optimizer_critic=self.optimizer_critic,
                 optimizer_alpha=self.optimizer_alpha))

    def show_logo(self):
        # Plain-text banner; the decorative ASCII-art logo could not be
        # recovered from the flattened source.
        self.recorder.logger.info('''
    MAXSQN
        ''')

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def choose_action(self, s, visual_s, evaluation=False):
        if self.use_epsilon and np.random.uniform() < self.expl_expt_mng.get_esp(
                self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            mu, pi, self.cell_state = self._get_action(s, visual_s, self.cell_state)
            a = pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            q = self.critic_net.Q1(feat)
            cate_dist = tfp.distributions.Categorical(logits=q / self.alpha)
            pi = cate_dist.sample()
            return tf.argmax(q, axis=1), pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'train_function': self.train,
                    'update_function': lambda: self.update_target_net_weights(
                        self.critic_target_net.weights, self.critic_net.weights, self.ployak),
                    'summary_dict': dict([
                        ['LEARNING_RATE/q_lr', self.q_lr(self.train_step)],
                        ['LEARNING_RATE/alpha_lr', self.alpha_lr(self.train_step)]])
                })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                q1, q2 = self.critic_net(feat)
                q1_eval = tf.reduce_sum(tf.multiply(q1, a), axis=1, keepdims=True)
                q2_eval = tf.reduce_sum(tf.multiply(q2, a), axis=1, keepdims=True)
                q1_target, q2_target = self.critic_target_net(feat_)
                q1_target_max = tf.reduce_max(q1_target, axis=1, keepdims=True)
                q1_target_log_probs = tf.nn.log_softmax(q1_target / self.alpha, axis=1) + 1e-8
                q1_target_entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(q1_target_log_probs) * q1_target_log_probs,
                                  axis=1, keepdims=True))
                q2_target_max = tf.reduce_max(q2_target, axis=1, keepdims=True)
                # q2_target_log_probs = tf.nn.log_softmax(q2_target, axis=1)
                # q2_target_log_max = tf.reduce_max(q2_target_log_probs, axis=1, keepdims=True)
                q_target = tf.minimum(q1_target_max, q2_target_max) + self.alpha * q1_target_entropy
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error1 = q1_eval - dc_r
                td_error2 = q2_eval - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                loss = 0.5 * (q1_loss + q2_loss) + crsty_loss
            loss_grads = tape.gradient(loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(loss_grads, self.critic_tv))
            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    q1 = self.critic_net.Q1(feat)
                    q1_log_probs = tf.nn.log_softmax(q1 / self.alpha, axis=1) + 1e-8
                    q1_entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(q1_log_probs) * q1_log_probs,
                                      axis=1, keepdims=True))
                    alpha_loss = -tf.reduce_mean(
                        self.alpha * tf.stop_gradient(self.target_entropy - q1_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict([
                ['LOSS/loss', loss],
                ['Statistics/log_alpha', self.log_alpha],
                ['Statistics/alpha', self.alpha],
                ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))],
                ['Statistics/q_mean', tf.reduce_mean(q1)],
                ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))]])
            if self.auto_adaption:
                # q1_entropy is only computed when alpha is auto-adapted,
                # so report it (and the alpha loss) only in that case.
                summaries.update({'LOSS/alpha_loss': alpha_loss,
                                  'Statistics/q1_entropy': q1_entropy})
            return (td_error1 + td_error2) / 2, summaries
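
# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the soft Q target that
# MAXSQN's `train` builds, i.e. the minimum over both target heads of the max
# Q-value, plus an alpha-weighted entropy of the Boltzmann policy
# softmax(Q1 / alpha). The helper name is hypothetical.
def _maxsqn_target_sketch(q1_target, q2_target, alpha):
    # Entropy of the Boltzmann (softmax) policy induced by the first head.
    q1_log_probs = tf.nn.log_softmax(q1_target / alpha, axis=1)
    q1_entropy = -tf.reduce_mean(
        tf.reduce_sum(tf.exp(q1_log_probs) * q1_log_probs, axis=1, keepdims=True))
    # Twin-head max-Q with an entropy bonus, as in the critic update above.
    return tf.minimum(
        tf.reduce_max(q1_target, axis=1, keepdims=True),
        tf.reduce_max(q2_target, axis=1, keepdims=True)) + alpha * q1_entropy
# ----------------------------------------------------------------------------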