def __init__(self,
             s_dim,
             visual_sources,
             visual_resolution,
             a_dim,
             is_continuous,
             alpha=0.2,
             beta=0.1,
             ployak=0.995,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             use_epsilon=False,
             q_lr=5.0e-4,
             alpha_lr=5.0e-4,
             auto_adaption=True,
             hidden_units=[32, 32],
             **kwargs):
    assert not is_continuous, 'maxsqn only supports discrete action spaces'
    super().__init__(s_dim=s_dim,
                     visual_sources=visual_sources,
                     visual_resolution=visual_resolution,
                     a_dim=a_dim,
                     is_continuous=is_continuous,
                     **kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                      eps_mid=eps_mid,
                                                      eps_final=eps_final,
                                                      init2mid_annealing_step=init2mid_annealing_step,
                                                      max_step=self.max_train_step)
    self.use_epsilon = use_epsilon
    self.ployak = ployak
    # Keep the stored value consistent with its name (and with SAC/TAC below):
    # when auto adaption is off, store log(alpha) rather than alpha itself.
    self.log_alpha = tf.math.log(alpha) if not auto_adaption else tf.Variable(
        initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True)
    self.auto_adaption = auto_adaption
    self.target_entropy = beta * np.log(self.a_dim)

    def _q_net():
        return rls.critic_q_all(self.feat_dim, self.a_dim, hidden_units)

    self.critic_net = DoubleQ(_q_net)
    self.critic_target_net = DoubleQ(_q_net)
    self.critic_tv = self.critic_net.trainable_variables + self.other_tv
    self.update_target_net_weights(self.critic_target_net.weights,
                                   self.critic_net.weights)
    self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
    self.optimizer_critic, self.optimizer_alpha = map(
        self.init_optimizer, [self.q_lr, self.alpha_lr])
    self.model_recorder(dict(critic_net=self.critic_net,
                             optimizer_critic=self.optimizer_critic,
                             optimizer_alpha=self.optimizer_alpha))
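# The rest of the MAXSQN class is not shown in this section, so the exact
# bootstrap target it builds from `self.target_entropy` and the temperature is
# not visible here. As a minimal, self-contained sketch (an assumption, not the
# repository's implementation), a discrete soft-Q learner typically uses the
# max-entropy state value V(s') = alpha * logsumexp(Q(s', .) / alpha) and the
# TD target r + gamma * (1 - done) * V(s'):
import numpy as np

def soft_q_target(q_next, r, done, gamma=0.99, alpha=0.2):
    """q_next: [B, A] target-network Q values at s'; r, done: [B, 1]."""
    v_next = alpha * np.log(np.sum(np.exp(q_next / alpha), axis=-1, keepdims=True))  # [B, 1]
    return r + gamma * (1.0 - done) * v_next

# Toy usage: one transition with two actions.
print(soft_q_target(np.array([[1.0, 2.0]]), r=np.array([[0.5]]), done=np.array([[0.0]])))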
class MATD3(Policy):
    def __init__(self,
                 s_dim,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 n=1,
                 i=0,
                 hidden_units={
                     'actor': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        assert is_continuous, 'matd3 only supports continuous action spaces'
        # NOTE: construction is intentionally disabled until the multi-agent pipeline is repaired.
        raise Exception('The MA (multi-agent) algorithms are currently broken and have not been fixed yet.')
        super().__init__(s_dim=s_dim,
                         visual_sources=0,
                         visual_resolution=0,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.n = n
        self.i = i
        self.ployak = ployak
        # self.action_noise = rls.NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
        self.action_noise = rls.OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))

        def _actor_net():
            return rls.actor_dpg(self.s_dim, 0, self.a_dim, hidden_units['actor'])

        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()

        def _q_net():
            return rls.critic_q_one(self.s_dim * self.n, 0, self.a_dim * self.n, hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.update_target_net_weights(
            self.actor_target_net.weights + self.critic_target_net.weights,
            self.actor_net.weights + self.critic_net.weights)
        self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])
        self.model_recorder(
            dict(actor=self.actor_net,
                 critic_net=self.critic_net,
                 optimizer_critic=self.optimizer_critic,
                 optimizer_actor=self.optimizer_actor))
        self.recorder.logger.info(self.action_noise)

    def show_logo(self):
        # ASCII-art banner spelling the algorithm name.
        self.recorder.logger.info('''
    MATD3
        ''')

    def choose_action(self, s, evaluation=False):
        return self._get_action(s, evaluation).numpy()

    def get_target_action(self, s):
        return self._get_target_action(s).numpy()

    @tf.function
    def _get_action(self, vector_input, evaluation):
        vector_input = self.cast(vector_input)
        with tf.device(self.device):
            mu = self.actor_net(vector_input)
            if evaluation:
                return mu
            else:
                return tf.clip_by_value(mu + self.action_noise(), -1, 1)

    @tf.function
    def _get_target_action(self, vector_input):
        vector_input = self.cast(vector_input)
        with tf.device(self.device):
            target_mu = self.actor_target_net(vector_input)
        return tf.clip_by_value(target_mu + self.action_noise(), -1, 1)

    def learn(self, episode, ap, al, ss, ss_, aa, aa_, s, r):
        ap, al, ss, ss_, aa, aa_, s, r = map(self.data_convert, (ap, al, ss, ss_, aa, aa_, s, r))
        summaries = self.train(ap, al, ss, ss_, aa, aa_, s, r)
        self.update_target_net_weights(
            self.actor_target_net.weights + self.critic_target_net.weights,
            self.actor_net.weights + self.critic_net.weights,
            self.ployak)
        summaries.update(
            dict([['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                  ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)]]))
        self.write_training_summaries(self.global_step, summaries)

    @tf.function(experimental_relax_shapes=True)
    def train(self, q_actor_a_previous, q_actor_a_later, ss, ss_, aa, aa_, s, r):
        with tf.device(self.device):
            # Twin-critic update, repeated twice before the (delayed) actor update.
            for _ in range(2):
                with tf.GradientTape() as tape:
                    q1, q2 = self.critic_net(ss, aa)
                    q_target = self.critic_target_net.get_min(ss_, aa_)
                    dc_r = tf.stop_gradient(r + self.gamma * q_target)
                    td_error1 = q1 - dc_r
                    td_error2 = q2 - dc_r
                    q1_loss = tf.reduce_mean(tf.square(td_error1))
                    q2_loss = tf.reduce_mean(tf.square(td_error2))
                    critic_loss = 0.5 * (q1_loss + q2_loss)
                critic_grads = tape.gradient(critic_loss, self.critic_net.trainable_variables)
                self.optimizer_critic.apply_gradients(
                    zip(critic_grads, self.critic_net.trainable_variables))
            with tf.GradientTape() as tape:
                mu = self.actor_net(s)
                # Joint action: other agents' actions before/after this agent's own action.
                mumu = tf.concat((q_actor_a_previous, mu, q_actor_a_later), axis=1)
                q1_actor = self.critic_net.Q1(ss, mumu)
                actor_loss = -tf.reduce_mean(q1_actor)
            actor_grads = tape.gradient(actor_loss, self.actor_net.trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.actor_net.trainable_variables))
            self.global_step.assign_add(1)
            return dict([['LOSS/actor_loss', actor_loss],
                         ['LOSS/q1_loss', q1_loss],
                         ['LOSS/q2_loss', q2_loss],
                         ['LOSS/critic_loss', critic_loss]])

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, q_actor_a_previous, q_actor_a_later, ss, ss_, aa, aa_, s, r):
        with tf.device(self.device):
            for _ in range(2):
                with tf.GradientTape(persistent=True) as tape:
                    mu = self.actor_net(s)
                    mumu = tf.concat((q_actor_a_previous, mu, q_actor_a_later), axis=1)
                    q1, q2 = self.critic_net(ss, aa)
                    q_target = self.critic_target_net.get_min(ss_, aa_)
                    q1_actor = self.critic_net.Q1(ss, mumu)
                    dc_r = tf.stop_gradient(r + self.gamma * q_target)
                    td_error1 = q1 - dc_r
                    td_error2 = q2 - dc_r
                    q1_loss = tf.reduce_mean(tf.square(td_error1))
                    q2_loss = tf.reduce_mean(tf.square(td_error2))
                    critic_loss = 0.5 * (q1_loss + q2_loss)
                    actor_loss = -tf.reduce_mean(q1_actor)
                critic_grads = tape.gradient(critic_loss, self.critic_net.trainable_variables)
                self.optimizer_critic.apply_gradients(
                    zip(critic_grads, self.critic_net.trainable_variables))
                actor_grads = tape.gradient(actor_loss, self.actor_net.trainable_variables)
                self.optimizer_actor.apply_gradients(
                    zip(actor_grads, self.actor_net.trainable_variables))
            self.global_step.assign_add(1)
            return dict([['LOSS/actor_loss', actor_loss],
                         ['LOSS/q1_loss', q1_loss],
                         ['LOSS/q2_loss', q2_loss],
                         ['LOSS/critic_loss', critic_loss]])
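# `update_target_net_weights(target_weights, online_weights, self.ployak)` is a
# base-class helper whose body is not shown in this section. A minimal sketch of
# the Polyak (soft) target update it presumably performs is given below; note
# that this repository spells the coefficient `ployak`, and with ployak=0.995
# the target network tracks the online network slowly. When the helper is called
# without the coefficient (as at construction time), it presumably hard-copies.
import tensorflow as tf

def polyak_update(target_vars, online_vars, ployak=0.995):
    """target <- ployak * target + (1 - ployak) * online, variable by variable."""
    for t, o in zip(target_vars, online_vars):
        t.assign(ployak * t + (1.0 - ployak) * o)

# Toy usage with hypothetical variables:
t_var = [tf.Variable([1.0, 1.0])]
o_var = [tf.Variable([0.0, 0.0])]
polyak_update(t_var, o_var)
print(t_var[0].numpy())  # -> [0.995 0.995]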
class TAC(make_off_policy_class(mode='share')): """Tsallis Actor Critic, TAC with V neural Network. https://arxiv.org/abs/1902.00137 """ def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, alpha=0.2, annealing=True, last_alpha=0.01, ployak=0.995, entropic_index=1.5, discrete_tau=1.0, log_std_bound=[-20, 2], hidden_units={ 'actor_continuous': { 'share': [128, 128], 'mu': [64], 'log_std': [64] }, 'actor_discrete': [64, 32], 'q': [128, 128] }, auto_adaption=True, actor_lr=5.0e-4, critic_lr=1.0e-3, alpha_lr=5.0e-4, **kwargs): super().__init__( s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.ployak = ployak self.discrete_tau = discrete_tau self.entropic_index = 2 - entropic_index self.log_std_min, self.log_std_max = log_std_bound[:] self.auto_adaption = auto_adaption self.annealing = annealing if self.auto_adaption: self.log_alpha = tf.Variable(initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True) else: self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha), name='log_alpha', dtype=tf.float32, trainable=False) if self.annealing: self.alpha_annealing = LinearAnnealing(alpha, last_alpha, 1e6) if self.is_continuous: self.actor_net = rls.actor_continuous(self.feat_dim, self.a_dim, hidden_units['actor_continuous']) else: self.actor_net = rls.actor_discrete(self.feat_dim, self.a_dim, hidden_units['actor_discrete']) self.gumbel_dist = tfp.distributions.Gumbel(0, 1) self.actor_tv = self.actor_net.trainable_variables # entropy = -log(1/|A|) = log |A| self.target_entropy = 0.98 * (self.a_dim if self.is_continuous else np.log(self.a_dim)) def _q_net(): return rls.critic_q_one(self.feat_dim, self.a_dim, hidden_units['q']) self.critic_net = DoubleQ(_q_net) self.critic_target_net = DoubleQ(_q_net) self.critic_tv = self.critic_net.trainable_variables + self.other_tv self.update_target_net_weights(self.critic_target_net.weights, self.critic_net.weights) self.actor_lr, self.critic_lr, self.alpha_lr = map(self.init_lr, [actor_lr, critic_lr, alpha_lr]) self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha = map(self.init_optimizer, [self.actor_lr, self.critic_lr, self.alpha_lr]) self.model_recorder(dict( actor=self.actor_net, critic_net=self.critic_net, log_alpha=self.log_alpha, optimizer_actor=self.optimizer_actor, optimizer_critic=self.optimizer_critic, optimizer_alpha=self.optimizer_alpha, )) def show_logo(self): self.recorder.logger.info(''' xxxxxxxxx xx xxxxxx xx x xx xxx xxx xx xx x xx xxx xx xx x x xx xx x xx xx xxx x xxxxxx xxx x xx xx xx xx x xx xx xxx xxx xxxxx xxx xxxxx xxxxxx ''') @property def alpha(self): return tf.exp(self.log_alpha) def choose_action(self, s, visual_s, evaluation=False): mu, pi, self.cell_state = self._get_action(s, visual_s, self.cell_state) a = mu.numpy() if evaluation else pi.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state): with tf.device(self.device): feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True) if self.is_continuous: mu, log_std = self.actor_net(feat) log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max) pi, _ = tsallis_squash_rsample(mu, log_std, self.entropic_index) mu = tf.tanh(mu) # squash mu else: logits = self.actor_net(feat) mu = tf.argmax(logits, axis=1) cate_dist = tfp.distributions.Categorical(logits) pi = cate_dist.sample() return mu, pi, cell_state def learn(self, **kwargs): self.train_step = kwargs.get('train_step') def 
_train(memories, isw, crsty_loss, cell_state): td_error, summaries = self.train(memories, isw, crsty_loss, cell_state) if self.annealing and not self.auto_adaption: self.log_alpha.assign(tf.math.log(tf.cast(self.alpha_annealing(self.global_step.numpy()), tf.float32))) return td_error, summaries for i in range(kwargs['step']): self._learn(function_dict={ 'train_function': self.train, 'update_function': lambda: self.update_target_net_weights( self.critic_target_net.weights, self.critic_net.weights, self.ployak), 'summary_dict': dict([ ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)], ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)], ['LEARNING_RATE/alpha_lr', self.alpha_lr(self.train_step)] ]) }) @tf.function(experimental_relax_shapes=True) def train(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done = memories batch_size = tf.shape(a)[0] with tf.device(self.device): with tf.GradientTape() as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) if self.is_continuous: target_mu, target_log_std = self.actor_net(feat_) target_log_std = clip_nn_log_std(target_log_std) target_pi, target_log_pi = tsallis_squash_rsample(target_mu, target_log_std, self.entropic_index) else: target_logits = self.actor_net(feat_) target_cate_dist = tfp.distributions.Categorical(target_logits) target_pi = target_cate_dist.sample() target_log_pi = target_cate_dist.log_prob(target_pi) target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32) q1, q2 = self.critic_net(feat, a) q1_target, q2_target = self.critic_target_net(feat_, target_pi) dc_r_q1 = tf.stop_gradient(r + self.gamma * (1 - done) * (q1_target - self.alpha * target_log_pi)) dc_r_q2 = tf.stop_gradient(r + self.gamma * (1 - done) * (q2_target - self.alpha * target_log_pi)) td_error1 = q1 - dc_r_q1 td_error2 = q2 - dc_r_q2 q1_loss = tf.reduce_mean(tf.square(td_error1) * isw) q2_loss = tf.reduce_mean(tf.square(td_error2) * isw) critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss critic_grads = tape.gradient(critic_loss, self.critic_tv) self.optimizer_critic.apply_gradients( zip(critic_grads, self.critic_tv) ) with tf.GradientTape() as tape: if self.is_continuous: mu, log_std = self.actor_net(feat) log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max) pi, log_pi = tsallis_squash_rsample(mu, log_std, self.entropic_index) entropy = gaussian_entropy(log_std) else: logits = self.actor_net(feat) logp_all = tf.nn.log_softmax(logits) gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32) _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau) _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) pi = _pi_diff + _pi log_pi = tf.reduce_sum(tf.multiply(logp_all, pi), axis=1, keepdims=True) entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True)) q_s_pi = self.critic_net.get_min(feat, pi) actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi) actor_grads = tape.gradient(actor_loss, self.actor_tv) self.optimizer_actor.apply_gradients( zip(actor_grads, self.actor_tv) ) if self.auto_adaption: with tf.GradientTape() as tape: if self.is_continuous: mu, log_std = self.actor_net(feat) log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max) pi, log_pi = tsallis_squash_rsample(mu, log_std, self.entropic_index) else: logits = self.actor_net(feat) cate_dist = tfp.distributions.Categorical(logits) log_pi = 
cate_dist.log_prob(cate_dist.sample()) alpha_loss = -tf.reduce_mean(self.alpha * tf.stop_gradient(log_pi - self.target_entropy)) alpha_grad = tape.gradient(alpha_loss, self.log_alpha) self.optimizer_alpha.apply_gradients( [(alpha_grad, self.log_alpha)] ) self.global_step.assign_add(1) summaries = dict([ ['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss], ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss], ['Statistics/log_alpha', self.log_alpha], ['Statistics/alpha', self.alpha], ['Statistics/entropy', entropy], ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))], ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))], ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))] ]) if self.auto_adaption: summaries.update({ 'LOSS/alpha_loss': alpha_loss }) return (td_error1 + td_error2) / 2, summaries @tf.function(experimental_relax_shapes=True) def train_persistent(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done = memories batch_size = tf.shape(a)[0] with tf.device(self.device): with tf.GradientTape(persistent=True) as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) if self.is_continuous: mu, log_std = self.actor_net(feat) log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max) pi, log_pi = tsallis_squash_rsample(mu, log_std, self.entropic_index) entropy = gaussian_entropy(log_std) target_mu, target_log_std = self.actor_net(feat_) target_log_std = clip_nn_log_std(target_log_std) target_pi, target_log_pi = tsallis_squash_rsample(target_mu, target_log_std, self.entropic_index) else: logits = self.actor_net(feat) logp_all = tf.nn.log_softmax(logits) gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32) _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau) _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) pi = _pi_diff + _pi log_pi = tf.reduce_sum(tf.multiply(logp_all, pi), axis=1, keepdims=True) entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True)) target_logits = self.actor_net(feat_) target_cate_dist = tfp.distributions.Categorical(target_logits) target_pi = target_cate_dist.sample() target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32) target_log_pi = target_cate_dist.log_prob(target_pi) q1, q2 = self.critic_net(feat, a) q1_target, q2_target = self.critic_target_net(feat_, target_pi) q_s_pi = self.critic_net.get_min(feat, pi) dc_r_q1 = tf.stop_gradient(r + self.gamma * (1 - done) * (q1_target - self.alpha * target_log_pi)) dc_r_q2 = tf.stop_gradient(r + self.gamma * (1 - done) * (q2_target - self.alpha * target_log_pi)) td_error1 = q1 - dc_r_q1 td_error2 = q2 - dc_r_q2 q1_loss = tf.reduce_mean(tf.square(td_error1) * isw) q2_loss = tf.reduce_mean(tf.square(td_error2) * isw) critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi) if self.auto_adaption: alpha_loss = -tf.reduce_mean(self.alpha * tf.stop_gradient(log_pi - self.target_entropy)) critic_grads = tape.gradient(critic_loss, self.critic_tv) self.optimizer_critic.apply_gradients( zip(critic_grads, self.critic_tv) ) actor_grads = tape.gradient(actor_loss, self.actor_tv) self.optimizer_actor.apply_gradients( zip(actor_grads, self.actor_tv) ) if self.auto_adaption: alpha_grad = tape.gradient(alpha_loss, self.log_alpha) self.optimizer_alpha.apply_gradients( [(alpha_grad, self.log_alpha)] ) self.global_step.assign_add(1) 
summaries = dict([ ['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss], ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss], ['Statistics/log_alpha', self.log_alpha], ['Statistics/alpha', self.alpha], ['Statistics/entropy', entropy], ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))], ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))], ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))] ]) if self.auto_adaption: summaries.update({ 'LOSS/alpha_loss': alpha_loss }) return (td_error1 + td_error2) / 2, summaries
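# `tsallis_squash_rsample(mu, log_std, self.entropic_index)` is a repository
# helper that is not shown in this section. The Tsallis-entropy machinery it
# relies on is built around the q-logarithm, sketched below for reference; for
# entropic index q != 1, log_q(x) = (x**(1 - q) - 1) / (1 - q), which recovers
# the ordinary log (the SAC case) as q -> 1. The constructor's
# `self.entropic_index = 2 - entropic_index` suggests the helper works with the
# conjugate index internally, but that is an assumption.
import numpy as np

def tsallis_log(x, q):
    """q-logarithm of x > 0."""
    if np.isclose(q, 1.0):
        return np.log(x)
    return (x ** (1.0 - q) - 1.0) / (1.0 - q)

# log_q(1) == 0 for every q; q=1 reproduces the natural log.
for q in (1.0, 1.5, 2.0):
    print(q, tsallis_log(np.array([0.5, 1.0, 2.0]), q))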
class TD3(make_off_policy_class(mode='share')): ''' Twin Delayed Deep Deterministic Policy Gradient, https://arxiv.org/abs/1802.09477 ''' def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, ployak=0.995, delay_num=2, noise_type='gaussian', gaussian_noise_sigma=0.2, gaussian_noise_bound=0.2, actor_lr=5.0e-4, critic_lr=1.0e-3, discrete_tau=1.0, hidden_units={ 'actor_continuous': [32, 32], 'actor_discrete': [32, 32], 'q': [32, 32] }, **kwargs): super().__init__(s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.ployak = ployak self.delay_num = delay_num self.discrete_tau = discrete_tau self.gaussian_noise_sigma = gaussian_noise_sigma self.gaussian_noise_bound = gaussian_noise_bound if self.is_continuous: def _actor_net(): return rls.actor_dpg(self.feat_dim, self.a_dim, hidden_units['actor_continuous']) if noise_type == 'gaussian': self.action_noise = rls.ClippedNormalActionNoise( mu=np.zeros(self.a_dim), sigma=self.gaussian_noise_sigma * np.ones(self.a_dim), bound=self.gaussian_noise_bound) elif noise_type == 'ou': self.action_noise = rls.OrnsteinUhlenbeckActionNoise( mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim)) else: def _actor_net(): return rls.actor_discrete(self.feat_dim, self.a_dim, hidden_units['actor_discrete']) self.gumbel_dist = tfp.distributions.Gumbel(0, 1) self.actor_net = _actor_net() self.actor_target_net = _actor_net() self.actor_tv = self.actor_net.trainable_variables def _q_net(): return rls.critic_q_one(self.feat_dim, self.a_dim, hidden_units['q']) self.critic_net = DoubleQ(_q_net) self.critic_target_net = DoubleQ(_q_net) self.critic_tv = self.critic_net.trainable_variables + self.other_tv self.update_target_net_weights( self.actor_target_net.weights + self.critic_target_net.weights, self.actor_net.weights + self.critic_net.weights) self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr]) self.optimizer_actor, self.optimizer_critic = map( self.init_optimizer, [self.actor_lr, self.critic_lr]) self.model_recorder( dict(actor=self.actor_net, critic_net=self.critic_net, optimizer_actor=self.optimizer_actor, optimizer_critic=self.optimizer_critic)) def show_logo(self): self.recorder.logger.info(''' xxxxxxxxx xxxxxxx xxxxx xx x xx x xxx xx xx xx x xx x xx xx xx x x xx xxx x x xxx xxxx x x xx xxx x x xx xx xx x x xxx xx xxx xxxxx xxxxxxx xxxxx ''') def choose_action(self, s, visual_s, evaluation=False): mu, pi, self.cell_state = self._get_action(s, visual_s, self.cell_state) a = mu.numpy() if evaluation else pi.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state): with tf.device(self.device): feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True) if self.is_continuous: mu = self.actor_net(feat) pi = tf.clip_by_value(mu + self.action_noise(), -1, 1) else: logits = self.actor_net(feat) mu = tf.argmax(logits, axis=1) cate_dist = tfp.distributions.Categorical(logits) pi = cate_dist.sample() return mu, pi, cell_state def learn(self, **kwargs): self.train_step = kwargs.get('train_step') for i in range(kwargs['step']): self._learn( function_dict={ 'train_function': self.train, 'update_function': lambda: self.update_target_net_weights( self.actor_target_net.weights + self.critic_target_net. weights, self.actor_net.weights + self.critic_net. 
weights, self.ployak), 'summary_dict': dict([[ 'LEARNING_RATE/actor_lr', self.actor_lr(self.train_step) ], [ 'LEARNING_RATE/critic_lr', self.critic_lr(self.train_step) ]]) }) @tf.function(experimental_relax_shapes=True) def train(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done = memories batch_size = tf.shape(a)[0] with tf.device(self.device): for _ in range(self.delay_num): with tf.GradientTape() as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) if self.is_continuous: target_mu = self.actor_target_net(feat_) action_target = tf.clip_by_value( target_mu + self.action_noise(), -1, 1) else: target_logits = self.actor_target_net(feat_) logp_all = tf.nn.log_softmax(target_logits) gumbel_noise = tf.cast(self.gumbel_dist.sample( [batch_size, self.a_dim]), dtype=tf.float32) _pi = tf.nn.softmax( (logp_all + gumbel_noise) / self.discrete_tau) _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) action_target = _pi_diff + _pi q1, q2 = self.critic_net(feat, a) q_target = self.critic_target_net.get_min( feat_, action_target) dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done)) td_error1 = q1 - dc_r td_error2 = q2 - dc_r q1_loss = tf.reduce_mean(tf.square(td_error1) * isw) q2_loss = tf.reduce_mean(tf.square(td_error2) * isw) critic_loss = 0.5 * (q1_loss + q2_loss) + crsty_loss critic_grads = tape.gradient(critic_loss, self.critic_tv) self.optimizer_critic.apply_gradients( zip(critic_grads, self.critic_tv)) with tf.GradientTape() as tape: if self.is_continuous: mu = self.actor_net(feat) else: logits = self.actor_net(feat) _pi = tf.nn.softmax(logits) _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1), self.a_dim, dtype=tf.float32) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) mu = _pi_diff + _pi q1_actor = self.critic_net.Q1(feat, mu) actor_loss = -tf.reduce_mean(q1_actor) actor_grads = tape.gradient(actor_loss, self.actor_tv) self.optimizer_actor.apply_gradients( zip(actor_grads, self.actor_tv)) self.global_step.assign_add(1) return (td_error1 + td_error2) / 2, dict([ ['LOSS/actor_loss', actor_loss], ['LOSS/critic_loss', critic_loss], ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))], ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))], ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))], ]) @tf.function(experimental_relax_shapes=True) def train_persistent(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done = memories batch_size = tf.shape(a)[0] with tf.device(self.device): for _ in range(2): with tf.GradientTape(persistent=True) as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) if self.is_continuous: target_mu = self.actor_target_net(feat_) action_target = tf.clip_by_value( target_mu + self.action_noise(), -1, 1) mu = self.actor_net(feat) else: target_logits = self.actor_target_net(feat_) logp_all = tf.nn.log_softmax(target_logits) gumbel_noise = tf.cast(self.gumbel_dist.sample( [batch_size, self.a_dim]), dtype=tf.float32) _pi = tf.nn.softmax( (logp_all + gumbel_noise) / self.discrete_tau) _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) action_target = _pi_diff + _pi logits = self.actor_net(feat) _pi = tf.nn.softmax(logits) _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1), self.a_dim, dtype=tf.float32) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) mu = _pi_diff + _pi q1, q2 = self.critic_net(feat, a) q_target 
= self.critic_target_net.get_min( feat_, action_target) q1_actor = self.critic_net.Q1(feat, mu) dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done)) td_error1 = q1 - dc_r td_error2 = q2 - dc_r q1_loss = tf.reduce_mean(tf.square(td_error1) * isw) q2_loss = tf.reduce_mean(tf.square(td_error2) * isw) critic_loss = 0.5 * (q1_loss + q2_loss) + crsty_loss actor_loss = -tf.reduce_mean(q1_actor) critic_grads = tape.gradient(critic_loss, self.critic_tv) self.optimizer_critic.apply_gradients( zip(critic_grads, self.critic_tv)) actor_grads = tape.gradient(actor_loss, self.actor_tv) self.optimizer_actor.apply_gradients( zip(actor_grads, self.actor_tv)) self.global_step.assign_add(1) return (td_error1 + td_error2) / 2, dict( [['LOSS/actor_loss', actor_loss], ['LOSS/critic_loss', critic_loss], ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))], ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))], ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))]])
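# TD3's target-policy smoothing above is delegated to
# `rls.ClippedNormalActionNoise(mu, sigma, bound)`, whose implementation is not
# shown in this section. A minimal sketch of the behaviour it is assumed to
# provide (per the TD3 paper): sample Gaussian noise with std `sigma`, clip it
# to +/- `bound`, add it to the target action, then clip to the action range.
import numpy as np

def smoothed_target_action(target_mu, sigma=0.2, bound=0.2, action_low=-1.0, action_high=1.0):
    noise = np.clip(np.random.normal(0.0, sigma, size=np.shape(target_mu)), -bound, bound)
    return np.clip(target_mu + noise, action_low, action_high)

# Toy usage with actions already near the bounds of [-1, 1]:
print(smoothed_target_action(np.array([[0.9, -0.95]])))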
class SAC(make_off_policy_class(mode='share')): """ Soft Actor-Critic Algorithms and Applications. https://arxiv.org/abs/1812.05905 Soft Actor-Critic for Discrete Action Settings. https://arxiv.org/abs/1910.07207 """ def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, alpha=0.2, annealing=True, last_alpha=0.01, ployak=0.995, use_gumbel=True, discrete_tau=1.0, log_std_bound=[-20, 2], hidden_units={ 'actor_continuous': { 'share': [128, 128], 'mu': [64], 'log_std': [64] }, 'actor_discrete': [64, 32], 'q': [128, 128] }, auto_adaption=True, actor_lr=5.0e-4, critic_lr=1.0e-3, alpha_lr=5.0e-4, **kwargs): super().__init__( s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.ployak = ployak self.use_gumbel = use_gumbel self.discrete_tau = discrete_tau self.log_std_min, self.log_std_max = log_std_bound[:] self.auto_adaption = auto_adaption self.annealing = annealing if self.auto_adaption: self.log_alpha = tf.Variable(initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True) else: self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha), name='log_alpha', dtype=tf.float32, trainable=False) if self.annealing: self.alpha_annealing = LinearAnnealing(alpha, last_alpha, 1e6) if self.is_continuous: self.actor_net = rls.actor_continuous(self.feat_dim, self.a_dim, hidden_units['actor_continuous']) else: self.actor_net = rls.actor_discrete(self.feat_dim, self.a_dim, hidden_units['actor_discrete']) if self.use_gumbel: self.gumbel_dist = tfp.distributions.Gumbel(0, 1) self.actor_tv = self.actor_net.trainable_variables # entropy = -log(1/|A|) = log |A| self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else np.log(self.a_dim)) if self.is_continuous or self.use_gumbel: critic_net = rls.critic_q_one else: critic_net = rls.critic_q_all def _q_net(): return critic_net(self.feat_dim, self.a_dim, hidden_units['q']) self.critic_net = DoubleQ(_q_net) self.critic_target_net = DoubleQ(_q_net) self.critic_tv = self.critic_net.trainable_variables + self.other_tv self.update_target_net_weights(self.critic_target_net.weights, self.critic_net.weights) self.actor_lr, self.critic_lr, self.alpha_lr = map(self.init_lr, [actor_lr, critic_lr, alpha_lr]) self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha = map(self.init_optimizer, [self.actor_lr, self.critic_lr, self.alpha_lr]) self.model_recorder(dict( actor=self.actor_net, critic_net=self.critic_net, log_alpha=self.log_alpha, optimizer_actor=self.optimizer_actor, optimizer_critic=self.optimizer_critic, optimizer_alpha=self.optimizer_alpha, )) def show_logo(self): self.recorder.logger.info(''' xxxxxxx xx xxxxxx xx xx xxx xxx xx xx x xxx xx xx xxxx x xx xx xxxxxx xx xx xxx xxx xxxxxx xxx x xx xx xx xx xx xx xx xx xx xxx xxx xxxxxxx xxx xxxxx xxxxxx ''') def choose_action(self, s, visual_s, evaluation=False): mu, pi, self.cell_state = self._get_action(s, visual_s, self.cell_state) a = mu.numpy() if evaluation else pi.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state): with tf.device(self.device): feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True) if self.is_continuous: mu, log_std = self.actor_net(feat) log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max) pi, _ = squash_rsample(mu, log_std) mu = tf.tanh(mu) # squash mu else: logits = self.actor_net(feat) mu = tf.argmax(logits, axis=1) cate_dist = tfp.distributions.Categorical(logits) pi = 
cate_dist.sample() return mu, pi, cell_state def learn(self, **kwargs): self.train_step = kwargs.get('train_step') def _train(memories, isw, crsty_loss, cell_state): if self.is_continuous or self.use_gumbel: td_error, summaries = self.train_persistent(memories, isw, crsty_loss, cell_state) else: td_error, summaries = self.train_discrete(memories, isw, crsty_loss, cell_state) if self.annealing and not self.auto_adaption: self.log_alpha.assign(tf.math.log(tf.cast(self.alpha_annealing(self.global_step.numpy()), tf.float32))) return td_error, summaries for i in range(self.train_times_per_step): self._learn(function_dict={ 'train_function': _train, 'update_function': lambda: self.update_target_net_weights(self.critic_target_net.weights, self.critic_net.weights, self.ployak), 'summary_dict': dict([ ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)], ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)], ['LEARNING_RATE/alpha_lr', self.alpha_lr(self.train_step)] ]) }) @property def alpha(self): return tf.exp(self.log_alpha) @tf.function(experimental_relax_shapes=True) def train(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done = memories batch_size = tf.shape(a)[0] with tf.device(self.device): with tf.GradientTape() as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) if self.is_continuous: target_mu, target_log_std = self.actor_net(feat_) target_log_std = clip_nn_log_std(target_log_std) target_pi, target_log_pi = squash_rsample(target_mu, target_log_std) else: target_logits = self.actor_net(feat_) target_cate_dist = tfp.distributions.Categorical(target_logits) target_pi = target_cate_dist.sample() target_log_pi = target_cate_dist.log_prob(target_pi) target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32) q1, q2 = self.critic_net(feat, a) q1_target, q2_target = self.critic_target_net(feat_, target_pi) dc_r_q1 = tf.stop_gradient(r + self.gamma * (1 - done) * (q1_target - self.alpha * target_log_pi)) dc_r_q2 = tf.stop_gradient(r + self.gamma * (1 - done) * (q2_target - self.alpha * target_log_pi)) td_error1 = q1 - dc_r_q1 td_error2 = q2 - dc_r_q2 q1_loss = tf.reduce_mean(tf.square(td_error1) * isw) q2_loss = tf.reduce_mean(tf.square(td_error2) * isw) critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss critic_grads = tape.gradient(critic_loss, self.critic_tv) self.optimizer_critic.apply_gradients( zip(critic_grads, self.critic_tv) ) with tf.GradientTape() as tape: if self.is_continuous: mu, log_std = self.actor_net(feat) log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max) pi, log_pi = squash_rsample(mu, log_std) entropy = gaussian_entropy(log_std) else: logits = self.actor_net(feat) logp_all = tf.nn.log_softmax(logits) gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32) _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau) _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) pi = _pi_diff + _pi log_pi = tf.reduce_sum(tf.multiply(logp_all, pi), axis=1, keepdims=True) entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True)) q_s_pi = self.critic_net.get_min(feat, pi) actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi) actor_grads = tape.gradient(actor_loss, self.actor_tv) self.optimizer_actor.apply_gradients( zip(actor_grads, self.actor_tv) ) if self.auto_adaption: with tf.GradientTape() as tape: if self.is_continuous: mu, log_std = 
self.actor_net(feat) log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max) norm_dist = tfp.distributions.Normal(loc=mu, scale=tf.exp(log_std)) log_pi = tf.reduce_sum(norm_dist.log_prob(norm_dist.sample()), axis=-1) else: logits = self.actor_net(feat) cate_dist = tfp.distributions.Categorical(logits) log_pi = cate_dist.log_prob(cate_dist.sample()) # $J(\alpha)=\mathbb{E}_{\mathbf{a}_{t} \sim \pi_{t}}\left[-\alpha \log \pi_{t}\left(\mathbf{a}_{t} | \mathbf{s}_{t}\right)-\alpha \overline{\mathcal{H}}\right.$ # \overline{\mathcal{H}} is negative alpha_loss = -tf.reduce_mean(self.alpha * tf.stop_gradient(log_pi + self.target_entropy)) alpha_grad = tape.gradient(alpha_loss, self.log_alpha) self.optimizer_alpha.apply_gradients( [(alpha_grad, self.log_alpha)] ) self.global_step.assign_add(1) summaries = dict([ ['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss], ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss], ['Statistics/log_alpha', self.log_alpha], ['Statistics/alpha', self.alpha], ['Statistics/entropy', entropy], ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))], ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))], ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))] ]) if self.auto_adaption: summaries.update({ 'LOSS/alpha_loss': alpha_loss }) return (td_error1 + td_error2) / 2., summaries @tf.function(experimental_relax_shapes=True) def train_persistent(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done = memories batch_size = tf.shape(a)[0] with tf.device(self.device): with tf.GradientTape(persistent=True) as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) if self.is_continuous: mu, log_std = self.actor_net(feat) log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max) pi, log_pi = squash_rsample(mu, log_std) entropy = gaussian_entropy(log_std) target_mu, target_log_std = self.actor_net(feat_) target_log_std = clip_nn_log_std(target_log_std) target_pi, target_log_pi = squash_rsample(target_mu, target_log_std) else: logits = self.actor_net(feat) logp_all = tf.nn.log_softmax(logits) gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32) _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau) _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) pi = _pi_diff + _pi log_pi = tf.reduce_sum(tf.multiply(logp_all, pi), axis=1, keepdims=True) entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True)) target_logits = self.actor_net(feat_) target_cate_dist = tfp.distributions.Categorical(target_logits) target_pi = target_cate_dist.sample() target_log_pi = target_cate_dist.log_prob(target_pi) target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32) q1, q2 = self.critic_net(feat, a) q1_target, q2_target = self.critic_target_net(feat_, target_pi) q_s_pi = self.critic_net.get_min(feat, pi) dc_r_q1 = tf.stop_gradient(r + self.gamma * (1 - done) * (q1_target - self.alpha * target_log_pi)) dc_r_q2 = tf.stop_gradient(r + self.gamma * (1 - done) * (q2_target - self.alpha * target_log_pi)) td_error1 = q1 - dc_r_q1 td_error2 = q2 - dc_r_q2 q1_loss = tf.reduce_mean(tf.square(td_error1) * isw) q2_loss = tf.reduce_mean(tf.square(td_error2) * isw) critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi) if self.auto_adaption: alpha_loss = -tf.reduce_mean(self.alpha * tf.stop_gradient(log_pi 
+ self.target_entropy)) critic_grads = tape.gradient(critic_loss, self.critic_tv) self.optimizer_critic.apply_gradients( zip(critic_grads, self.critic_tv) ) actor_grads = tape.gradient(actor_loss, self.actor_tv) self.optimizer_actor.apply_gradients( zip(actor_grads, self.actor_tv) ) if self.auto_adaption: alpha_grad = tape.gradient(alpha_loss, self.log_alpha) self.optimizer_alpha.apply_gradients( [(alpha_grad, self.log_alpha)] ) self.global_step.assign_add(1) summaries = dict([ ['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss], ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss], ['Statistics/log_alpha', self.log_alpha], ['Statistics/alpha', self.alpha], ['Statistics/entropy', entropy], ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))], ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))], ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))] ]) if self.auto_adaption: summaries.update({ 'LOSS/alpha_loss': alpha_loss }) return (td_error1 + td_error2) / 2, summaries @tf.function(experimental_relax_shapes=True) def train_discrete(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done = memories with tf.device(self.device): with tf.GradientTape() as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) q1_all, q2_all = self.critic_net(feat) # [B, A] def q_function(x): return tf.reduce_sum(x * a, axis=-1, keepdims=True) # [B, 1] q1 = q_function(q1_all) q2 = q_function(q2_all) target_logits = self.actor_net(feat_) # [B, A] target_log_probs = tf.nn.log_softmax(target_logits) # [B, A] q1_target, q2_target = self.critic_target_net(feat_) # [B, A] def v_target_function(x): return tf.reduce_sum(tf.exp(target_log_probs) * (x - self.alpha * target_log_probs), axis=-1, keepdims=True) # [B, 1] v1_target = v_target_function(q1_target) v2_target = v_target_function(q2_target) dc_r_q1 = tf.stop_gradient(r + self.gamma * (1 - done) * v1_target) dc_r_q2 = tf.stop_gradient(r + self.gamma * (1 - done) * v2_target) td_error1 = q1 - dc_r_q1 td_error2 = q2 - dc_r_q2 q1_loss = tf.reduce_mean(tf.square(td_error1) * isw) q2_loss = tf.reduce_mean(tf.square(td_error2) * isw) critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss critic_grads = tape.gradient(critic_loss, self.critic_tv) self.optimizer_critic.apply_gradients( zip(critic_grads, self.critic_tv) ) with tf.GradientTape() as tape: logits = self.actor_net(feat) logp_all = tf.nn.log_softmax(logits) entropy = -tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True) # [B, 1] q1_all, q2_all = self.critic_net(feat) # [B, A] q_all = tf.minimum(q1_all, q2_all) # [B, A] actor_loss = -tf.reduce_mean( tf.reduce_sum((q_all - self.alpha * logp_all) * tf.exp(logp_all)) # [B, A] => [B,] ) # actor_loss = - tf.reduce_mean( # q_all + self.alpha * entropy # ) actor_grads = tape.gradient(actor_loss, self.actor_tv) self.optimizer_actor.apply_gradients( zip(actor_grads, self.actor_tv) ) if self.auto_adaption: with tf.GradientTape() as tape: logits = self.actor_net(feat) logp_all = tf.nn.log_softmax(logits) entropy = -tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True) # [B, 1] corr = tf.stop_gradient(self.target_entropy - entropy) # corr = tf.stop_gradient(tf.reduce_sum((logp_all - self.a_dim) * tf.exp(logp_all), axis=-1)) #[B, A] => [B,] # J(\alpha)=\pi_{t}\left(s_{t}\right)^{T}\left[-\alpha\left(\log \left(\pi_{t}\left(s_{t}\right)\right)+\bar{H}\right)\right] # \bar{H} is negative alpha_loss = -tf.reduce_mean(self.alpha * corr) alpha_grad = tape.gradient(alpha_loss, 
self.log_alpha) self.optimizer_alpha.apply_gradients( [(alpha_grad, self.log_alpha)] ) self.global_step.assign_add(1) summaries = dict([ ['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss], ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss], ['Statistics/log_alpha', self.log_alpha], ['Statistics/alpha', self.alpha], ['Statistics/entropy', tf.reduce_mean(entropy)] ]) if self.auto_adaption: summaries.update({ 'LOSS/alpha_loss': alpha_loss }) return (td_error1 + td_error2) / 2, summaries
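# The automatic temperature update in `train_discrete` above minimises
#   J(alpha) = E_{a ~ pi}[ -alpha * (target_entropy - H(pi(.|s))) ],
# so alpha is pushed up when the policy's entropy drops below the target
# (0.98 * log|A| in the constructor) and pushed down otherwise. A minimal
# standalone NumPy sketch of that loss; the batch and policy here are toys.
import numpy as np

def alpha_loss(log_alpha, logp_all, target_entropy):
    """logp_all: [B, A] log pi(.|s); returns the scalar temperature loss."""
    probs = np.exp(logp_all)
    entropy = -np.sum(probs * logp_all, axis=-1)   # [B]
    corr = target_entropy - entropy                # treated as constant w.r.t. alpha
    return np.mean(-np.exp(log_alpha) * corr)

a_dim = 4
target_entropy = 0.98 * np.log(a_dim)
logp_all = np.log(np.full((2, a_dim), 1.0 / a_dim))  # uniform policy: entropy == log|A|
print(alpha_loss(log_alpha=0.0, logp_all=logp_all, target_entropy=target_entropy))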
class HIRO(make_off_policy_class(mode='no_share')): ''' Data-Efficient Hierarchical Reinforcement Learning, http://arxiv.org/abs/1805.08296 ''' def __init__( self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, ployak=0.995, high_scale=1.0, reward_scale=1.0, sample_g_nums=100, sub_goal_steps=10, fn_goal_dim=0, intrinsic_reward_mode='os', high_batch_size=256, high_buffer_size=100000, low_batch_size=8, low_buffer_size=10000, high_actor_lr=1.0e-4, high_critic_lr=1.0e-3, low_actor_lr=1.0e-4, low_critic_lr=1.0e-3, hidden_units={ 'high_actor': [64, 64], 'high_critic': [64, 64], 'low_actor': [64, 64], 'low_critic': [64, 64] }, **kwargs): assert visual_sources == 0, 'HIRO doesn\'t support visual inputs.' super().__init__(s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.data_high = ExperienceReplay(high_batch_size, high_buffer_size) self.data_low = ExperienceReplay(low_batch_size, low_buffer_size) self.ployak = ployak self.high_scale = np.array( high_scale if isinstance(high_scale, list) else [high_scale] * self.s_dim, dtype=np.float32) self.reward_scale = reward_scale self.fn_goal_dim = fn_goal_dim self.sample_g_nums = sample_g_nums self.sub_goal_steps = sub_goal_steps self.sub_goal_dim = self.s_dim - self.fn_goal_dim self.high_noise = rls.ClippedNormalActionNoise( mu=np.zeros(self.sub_goal_dim), sigma=self.high_scale * np.ones(self.sub_goal_dim), bound=self.high_scale / 2) self.low_noise = rls.ClippedNormalActionNoise(mu=np.zeros(self.a_dim), sigma=1.0 * np.ones(self.a_dim), bound=0.5) _high_actor_net = lambda: rls.actor_dpg(self.s_dim, self.sub_goal_dim, hidden_units['high_actor']) if self.is_continuous: _low_actor_net = lambda: rls.actor_dpg( self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[ 'low_actor']) else: _low_actor_net = lambda: rls.actor_discrete( self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[ 'low_actor']) self.gumbel_dist = tfd.Gumbel(0, 1) self.high_actor = _high_actor_net() self.high_actor_target = _high_actor_net() self.low_actor = _low_actor_net() self.low_actor_target = _low_actor_net() _high_critic_net = lambda: rls.critic_q_one( self.s_dim, self.sub_goal_dim, hidden_units['high_critic']) _low_critic_net = lambda: rls.critic_q_one( self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[ 'low_critic']) self.high_critic = DoubleQ(_high_critic_net) self.high_critic_target = DoubleQ(_high_critic_net) self.low_critic = DoubleQ(_low_critic_net) self.low_critic_target = DoubleQ(_low_critic_net) self.update_target_net_weights( self.low_actor_target.weights + self.low_critic_target.weights + self.high_actor_target.weights + self.high_critic_target.weights, self.low_actor.weights + self.low_critic.weights + self.high_actor.weights + self.high_critic.weights) self.low_actor_lr, self.low_critic_lr = map( self.init_lr, [low_actor_lr, low_critic_lr]) self.high_actor_lr, self.high_critic_lr = map( self.init_lr, [high_actor_lr, high_critic_lr]) self.low_actor_optimizer, self.low_critic_optimizer = map( self.init_optimizer, [self.low_actor_lr, self.low_critic_lr]) self.high_actor_optimizer, self.high_critic_optimizer = map( self.init_optimizer, [self.high_actor_lr, self.high_critic_lr]) self.model_recorder( dict(high_actor=self.high_actor, high_critic=self.high_critic, low_actor=self.low_actor, low_critic=self.low_critic, low_actor_optimizer=self.low_actor_optimizer, low_critic_optimizer=self.low_critic_optimizer, high_actor_optimizer=self.high_actor_optimizer, 
high_critic_optimizer=self.high_critic_optimizer)) self.counts = 0 self._high_s = [[] for _ in range(self.n_agents)] self._noop_subgoal = np.random.uniform(-self.high_scale, self.high_scale, size=(self.n_agents, self.sub_goal_dim)) self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode) def generate_ir_func(self, mode='os'): if mode == 'os': return lambda last_feat, subgoal, feat: -tf.norm( last_feat + subgoal - feat, ord=2, axis=-1, keepdims=True) elif mode == 'cos': return lambda last_feat, subgoal, feat: tf.expand_dims( -tf.keras.losses.cosine_similarity( tf.cast(feat - last_feat, tf.float32), tf.cast(subgoal, tf.float32), axis=-1), axis=-1) def show_logo(self): self.recorder.logger.info(''' xxxxx xxxxx xxxx xxxxxxx xxxxxx xx xx xx xxxxxxx xxx xxxx xx xx xx xx xxx xxx xxx xx xx xx xx xxx xx xxx xxxxxxx xx xxxxxx xx xxx xx xx xx xxxxxx xx xxx xx xx xx xx xxxx xx xxx xx xx xx xx xxx xxx xxx xxxxx xxxxx xxxx xxxxx xxxx xxxxxxx ''') def store_high_buffer(self, i): eps_len = len(self._high_s[i]) intervals = list(range(0, eps_len, self.sub_goal_steps)) if len(intervals) < 1: return left = intervals[:-1] right = intervals[1:] s, r, a, g, d, s_ = [], [], [], [], [], [] for _l, _r in zip(left, right): s.append(self._high_s[i][_l:_r]) r.append(sum(self._high_r[i][_l:_r]) * self.reward_scale) a.append(self._high_a[i][_l:_r]) g.append(self._subgoals[i][_l]) d.append(self._done[i][_r - 1]) s_.append(self._high_s_[i][_r - 1]) right = intervals[-1] s.append(self._high_s[i][right:eps_len] + [self._high_s[i][-1]] * (self.sub_goal_steps + right - eps_len)) r.append(sum(self._high_r[i][right:eps_len])) a.append(self._high_a[i][right:eps_len] + [self._high_a[i][-1]] * (self.sub_goal_steps + right - eps_len)) g.append(self._subgoals[i][right]) d.append(self._done[i][-1]) s_.append(self._high_s_[i][-1]) self.data_high.add(np.array(s), np.array(r)[:, np.newaxis], np.array(a), np.array(g), np.array(d)[:, np.newaxis], np.array(s_)) def reset(self): self._c = np.full((self.n_agents, 1), self.sub_goal_steps, np.int32) for i in range(self.n_agents): self.store_high_buffer(i) self._high_r = [[] for _ in range(self.n_agents)] self._high_a = [[] for _ in range(self.n_agents)] self._high_s = [[] for _ in range(self.n_agents)] self._subgoals = [[] for _ in range(self.n_agents)] self._done = [[] for _ in range(self.n_agents)] self._high_s_ = [[] for _ in range(self.n_agents)] self._new_subgoal = np.zeros((self.n_agents, self.sub_goal_dim), dtype=np.float32) def partial_reset(self, done): self._c = np.where( done[:, np.newaxis], np.full((self.n_agents, 1), self.sub_goal_steps, np.int32), self._c) idx = np.where(done)[0] for i in idx: self.store_high_buffer(i) self._high_s[i] = [] self._high_a[i] = [] self._high_s_[i] = [] self._high_r[i] = [] self._done[i] = [] self._subgoals[i] = [] @tf.function def _get_action(self, s, visual_s, subgoal): with tf.device(self.device): feat = tf.concat([s, subgoal], axis=-1) if self.is_continuous: mu = self.low_actor(feat) pi = tf.clip_by_value(mu + self.low_noise(), -1, 1) else: logits = self.low_actor(feat) mu = tf.argmax(logits, axis=1) cate_dist = tfd.Categorical(logits) pi = cate_dist.sample() return mu, pi def choose_action(self, s, visual_s, evaluation=False): self._subgoal = np.where(self._c == self.sub_goal_steps, self.get_subgoal(s).numpy(), self._new_subgoal) mu, pi = self._get_action(s, visual_s, self._subgoal) a = mu.numpy() if evaluation else pi.numpy() return a @tf.function def get_subgoal(self, s): ''' last_s 上一个隐状态 subgoal 上一个子目标 s 当前隐状态 ''' new_subgoal = 
self.high_scale * self.high_actor(s) new_subgoal = tf.clip_by_value(new_subgoal + self.high_noise(), -self.high_scale, self.high_scale) return new_subgoal def learn(self, **kwargs): self.episode = kwargs['episode'] for i in range(kwargs['step']): if self.data_low.is_lg_batch_size and self.data_high.is_lg_batch_size: self.intermediate_variable_reset() low_data = self.get_transitions( self.data_low, data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_']) high_data = self.get_transitions( self.data_high, data_name_list=['s', 'r', 'a', 'g', 'done', 's_']) # --------------------------------------获取需要传给train函数的参数 _low_training_data = self.get_value_from_dict( data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'], data_dict=low_data) _high_training_data = self.get_value_from_dict( data_name_list=['s', 'r', 'a', 'g', 'done', 's_'], data_dict=high_data) summaries = self.train_low(_low_training_data) self.summaries.update(summaries) self.update_target_net_weights( self.low_actor_target.weights + self.low_critic_target.weights, self.low_actor.weights + self.low_critic.weights, self.ployak) if self.counts % self.sub_goal_steps == 0: self.counts = 0 high_summaries = self.train_high(_high_training_data) self.summaries.update(high_summaries) self.update_target_net_weights( self.high_actor_target.weights + self.high_critic_target.weights, self.high_actor.weights + self.high_critic.weights, self.ployak) self.counts += 1 self.summaries.update( dict([[ 'LEARNING_RATE/low_actor_lr', self.low_actor_lr(self.episode) ], [ 'LEARNING_RATE/low_critic_lr', self.low_critic_lr(self.episode) ], [ 'LEARNING_RATE/high_actor_lr', self.high_actor_lr(self.episode) ], [ 'LEARNING_RATE/high_critic_lr', self.high_critic_lr(self.episode) ]])) self.write_training_summaries(self.global_step, self.summaries) @tf.function(experimental_relax_shapes=True) def train_low(self, memories): s, a, r, s_, done, g, g_ = memories with tf.device(self.device): with tf.GradientTape() as tape: feat = tf.concat([s, g], axis=-1) feat_ = tf.concat([s_, g_], axis=-1) if self.is_continuous: target_mu = self.low_actor_target(feat_) action_target = tf.clip_by_value( target_mu + self.low_noise(), -1, 1) else: target_logits = self.low_actor_target(feat_) logp_all = tf.nn.log_softmax(target_logits) gumbel_noise = tf.cast(self.gumbel_dist.sample( [tf.shape(feat_)[0], self.a_dim]), dtype=tf.float32) _pi = tf.nn.softmax((logp_all + gumbel_noise) / 1.) 
_pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) action_target = _pi_diff + _pi q1, q2 = self.low_critic(feat, a) q = tf.minimum(q1, q2) q_target = self.low_critic_target.get_min(feat_, action_target) dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done)) td_error1 = q1 - dc_r td_error2 = q2 - dc_r q1_loss = tf.reduce_mean(tf.square(td_error1)) q2_loss = tf.reduce_mean(tf.square(td_error2)) low_critic_loss = q1_loss + q2_loss low_critic_grads = tape.gradient(low_critic_loss, self.low_critic.weights) self.low_critic_optimizer.apply_gradients( zip(low_critic_grads, self.low_critic.weights)) with tf.GradientTape() as tape: if self.is_continuous: mu = self.low_actor(feat) else: logits = self.low_actor(feat) _pi = tf.nn.softmax(logits) _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1), self.a_dim, dtype=tf.float32) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) mu = _pi_diff + _pi q_actor = self.low_critic.Q1(feat, mu) low_actor_loss = -tf.reduce_mean(q_actor) low_actor_grads = tape.gradient(low_actor_loss, self.low_actor.trainable_variables) self.low_actor_optimizer.apply_gradients( zip(low_actor_grads, self.low_actor.trainable_variables)) self.global_step.assign_add(1) return dict([['LOSS/low_actor_loss', low_actor_loss], ['LOSS/low_critic_loss', low_critic_loss], ['Statistics/low_q_min', tf.reduce_min(q)], ['Statistics/low_q_mean', tf.reduce_mean(q)], ['Statistics/low_q_max', tf.reduce_max(q)]]) @tf.function(experimental_relax_shapes=True) def train_high(self, memories): # s_ : [B, N] ss, r, aa, g, done, s_ = memories batchs = tf.shape(ss)[0] # ss, aa [B, T, *] with tf.device(self.device): with tf.GradientTape() as tape: s = ss[:, 0] # [B, N] true_end = (s_ - s)[:, self.fn_goal_dim:] g_dist = tfd.Normal(loc=true_end, scale=0.5 * self.high_scale[None, :]) ss = tf.expand_dims(ss, 0) # [1, B, T, *] ss = tf.tile(ss, [self.sample_g_nums, 1, 1, 1]) # [10, B, T, *] ss = tf.reshape(ss, [-1, tf.shape(ss)[-1]]) # [10*B*T, *] aa = tf.expand_dims(aa, 0) # [1, B, T, *] aa = tf.tile(aa, [self.sample_g_nums, 1, 1, 1]) # [10, B, T, *] aa = tf.reshape(aa, [-1, tf.shape(aa)[-1]]) # [10*B*T, *] gs = tf.concat([ tf.expand_dims(g, 0), tf.expand_dims(true_end, 0), tf.clip_by_value(g_dist.sample(self.sample_g_nums - 2), -self.high_scale, self.high_scale) ], axis=0) # [10, B, N] all_g = gs + s[:, self.fn_goal_dim:] all_g = tf.expand_dims(all_g, 2) # [10, B, 1, N] all_g = tf.tile( all_g, [1, 1, self.sub_goal_steps, 1]) # [10, B, T, N] all_g = tf.reshape(all_g, [-1, tf.shape(all_g)[-1]]) # [10*B*T, N] all_g = all_g - ss[:, self.fn_goal_dim:] # [10*B*T, N] feat = tf.concat([ss, all_g], axis=-1) # [10*B*T, *] _aa = self.low_actor(feat) # [10*B*T, A] if not self.is_continuous: _aa = tf.one_hot(tf.argmax(_aa, axis=-1), self.a_dim, dtype=tf.float32) diff = _aa - aa diff = tf.reshape( diff, [self.sample_g_nums, batchs, self.sub_goal_steps, -1 ]) # [10, B, T, A] diff = tf.transpose(diff, [1, 0, 2, 3]) # [B, 10, T, A] logps = -0.5 * tf.reduce_sum(tf.norm(diff, ord=2, axis=-1)**2, axis=-1) # [B, 10] idx = tf.argmax(logps, axis=-1, output_type=tf.int32) idx = tf.stack([tf.range(batchs), idx], axis=1) # [B, 2] g = tf.gather_nd(tf.transpose(gs, [1, 0, 2]), idx) # [B, N] q1, q2 = self.high_critic(s, g) q = tf.minimum(q1, q2) target_sub_goal = self.high_actor_target(s_) * self.high_scale target_sub_goal = tf.clip_by_value( target_sub_goal + self.high_noise(), -self.high_scale, self.high_scale) q_target = self.high_critic_target.get_min(s_, 
target_sub_goal) dc_r = tf.stop_gradient(r + self.gamma * (1 - done) * q_target) td_error1 = q1 - dc_r td_error2 = q2 - dc_r q1_loss = tf.reduce_mean(tf.square(td_error1)) q2_loss = tf.reduce_mean(tf.square(td_error2)) high_critic_loss = q1_loss + q2_loss high_critic_grads = tape.gradient(high_critic_loss, self.high_critic.weights) self.high_critic_optimizer.apply_gradients( zip(high_critic_grads, self.high_critic.weights)) with tf.GradientTape() as tape: mu = self.high_actor(s) * self.high_scale q_actor = self.high_critic.Q1(s, mu) high_actor_loss = -tf.reduce_mean(q_actor) high_actor_grads = tape.gradient( high_actor_loss, self.high_actor.trainable_variables) self.high_actor_optimizer.apply_gradients( zip(high_actor_grads, self.high_actor.trainable_variables)) return dict([['LOSS/high_actor_loss', high_actor_loss], ['LOSS/high_critic_loss', high_critic_loss], ['Statistics/high_q_min', tf.reduce_min(q)], ['Statistics/high_q_mean', tf.reduce_mean(q)], ['Statistics/high_q_max', tf.reduce_max(q)]]) def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done): assert isinstance(a, np.ndarray), "store need action type is np.ndarray" assert isinstance(r, np.ndarray), "store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "store need done type is np.ndarray" [o.append(_s) for o, _s in zip(self._high_s, s)] [o.append(_a) for o, _a in zip(self._high_a, a)] [o.append(_r) for o, _r in zip(self._high_r, r)] [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)] [o.append(_d) for o, _d in zip(self._done, done)] [ o.append(_subgoal) for o, _subgoal in zip(self._subgoals, self._noop_subgoal) ] ir = self.get_ir(s[:, self.fn_goal_dim:], self._noop_subgoal, s_[:, self.fn_goal_dim:]) # subgoal = s[:, self.fn_goal_dim:] + self._noop_subgoal - s_[:, self.fn_goal_dim:] subgoal = np.random.uniform(-self.high_scale, self.high_scale, size=(self.n_agents, self.sub_goal_dim)) self.data_low.add( s, a, ir, s_, done[:, np.newaxis], # 升维 self._noop_subgoal, subgoal) self._noop_subgoal = subgoal def store_data(self, s, visual_s, a, r, s_, visual_s_, done): """ for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer. 
""" assert isinstance(a, np.ndarray), "store need action type is np.ndarray" assert isinstance(r, np.ndarray), "store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "store need done type is np.ndarray" [o.append(_s) for o, _s in zip(self._high_s, s)] [o.append(_a) for o, _a in zip(self._high_a, a)] [o.append(_r) for o, _r in zip(self._high_r, r)] [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)] [o.append(_d) for o, _d in zip(self._done, done)] [ o.append(_subgoal) for o, _subgoal in zip(self._subgoals, self._subgoal) ] ir = self.get_ir(s[:, self.fn_goal_dim:], self._subgoal, s_[:, self.fn_goal_dim:]) self._new_subgoal = np.where( self._c == 1, self.get_subgoal(s_).numpy(), s[:, self.fn_goal_dim:] + self._subgoal - s_[:, self.fn_goal_dim:]) self.data_low.add( s, a, ir, s_, done[:, np.newaxis], # 升维 self._subgoal, self._new_subgoal) self._c = np.where( self._c == 1, np.full((self.n_agents, 1), self.sub_goal_steps, np.int32), self._c - 1) def get_transitions(self, databuffer, data_name_list=['s', 'a', 'r', 's_', 'done']): ''' TODO: Annotation ''' data = databuffer.sample() # 经验池取数据 if not self.is_continuous and 'a' in data_name_list: a_idx = data_name_list.index('a') a = data[a_idx].astype(np.int32) pre_shape = a.shape a = a.reshape(-1) a = sth.int2one_hot(a, self.a_dim) a = a.reshape(pre_shape + (-1, )) data[a_idx] = a return dict([[ n, d ] for n, d in zip(data_name_list, list(map(self.data_convert, data)))])
class CURL(make_off_policy_class(mode='no_share')): """ CURL: Contrastive Unsupervised Representations for Reinforcement Learning, http://arxiv.org/abs/2004.04136 """ def __init__( self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, alpha=0.2, annealing=True, last_alpha=0.01, ployak=0.995, discrete_tau=1.0, log_std_bound=[-20, 2], hidden_units={ 'actor_continuous': { 'share': [128, 128], 'mu': [64], 'log_std': [64] }, 'actor_discrete': [64, 32], 'q': [128, 128], 'encoder': 128 }, auto_adaption=True, actor_lr=5.0e-4, critic_lr=1.0e-3, alpha_lr=5.0e-4, curl_lr=5.0e-4, img_size=64, **kwargs): super().__init__(s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) assert self.visual_sources == 1 self.ployak = ployak self.discrete_tau = discrete_tau self.log_std_min, self.log_std_max = log_std_bound[:] self.auto_adaption = auto_adaption self.annealing = annealing self.img_size = img_size self.img_dim = [img_size, img_size, self.visual_dim[-1]] self.vis_feat_size = hidden_units['encoder'] if self.auto_adaption: self.log_alpha = tf.Variable(initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True) else: self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha), name='log_alpha', dtype=tf.float32, trainable=False) if self.annealing: self.alpha_annealing = LinearAnnealing(alpha, last_alpha, 1.0e6) if self.is_continuous: self.actor_net = rls.actor_continuous( self.s_dim + self.vis_feat_size, self.a_dim, hidden_units['actor_continuous']) else: self.actor_net = rls.actor_discrete( self.s_dim + self.vis_feat_size, self.a_dim, hidden_units['actor_discrete']) self.gumbel_dist = tfp.distributions.Gumbel(0, 1) self.actor_tv = self.actor_net.trainable_variables # entropy = -log(1/|A|) = log |A| self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else np.log(self.a_dim)) def _q_net(): return rls.critic_q_one(self.s_dim + self.vis_feat_size, self.a_dim, hidden_units['q']) self.critic_net = DoubleQ(_q_net) self.critic_target_net = DoubleQ(_q_net) self.encoder = VisualEncoder(self.img_dim, hidden_units['encoder']) self.encoder_target = VisualEncoder(self.img_dim, hidden_units['encoder']) self.curl_w = tf.Variable( initial_value=tf.random.normal(shape=(self.vis_feat_size, self.vis_feat_size)), name='curl_w', dtype=tf.float32, trainable=True) self.critic_tv = self.critic_net.trainable_variables + self.encoder.trainable_variables self.update_target_net_weights( self.critic_target_net.weights + self.encoder_target.trainable_variables, self.critic_net.weights + self.encoder.trainable_variables) self.actor_lr, self.critic_lr, self.alpha_lr, self.curl_lr = map( self.init_lr, [actor_lr, critic_lr, alpha_lr, curl_lr]) self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha, self.optimizer_curl = map( self.init_optimizer, [self.actor_lr, self.critic_lr, self.alpha_lr, self.curl_lr]) self.model_recorder( dict( actor=self.actor_net, critic_net=self.critic_net, curl_w=self.curl_w, optimizer_actor=self.optimizer_actor, optimizer_critic=self.optimizer_critic, optimizer_alpha=self.optimizer_alpha, optimizer_curl=self.optimizer_curl, )) def show_logo(self): self.recorder.logger.info(''' xxxxxx xxxxx xxxx xxxxxxxx xxxxx xxx xx xx xx xx xxx x xx xx x x x xxx x xx x x x xx x xxx x x xxxxxx x xxx x x xx xxx x xx xx xx xx x xx x xx xxx xxx xx xx x xxx x xxx xxxxxx xxxxxxx xxxxx xxx xxxxxxxx ''') def choose_action(self, s, visual_s, evaluation=False): visual_s = center_crop_image(visual_s[:, 0], 
self.img_size) mu, pi = self._get_action(s, visual_s) a = mu.numpy() if evaluation else pi.numpy() return a @tf.function def _get_action(self, s, visual_s): with tf.device(self.device): feat = tf.concat([self.encoder(visual_s), s], axis=-1) if self.is_continuous: mu, log_std = self.actor_net(feat) log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max) pi, _ = squash_rsample(mu, log_std) mu = tf.tanh(mu) # squash mu else: logits = self.actor_net(feat) mu = tf.argmax(logits, axis=1) cate_dist = tfp.distributions.Categorical(logits) pi = cate_dist.sample() return mu, pi def learn(self, **kwargs): self.train_step = kwargs.get('train_step') def _train(memories, isw, crsty_loss, cell_state): td_error, summaries = self.train(memories, isw, crsty_loss, cell_state) if self.annealing and not self.auto_adaption: self.log_alpha.assign( tf.math.log( tf.cast(self.alpha_annealing(self.global_step.numpy()), tf.float32))) return td_error, summaries def _pre_process(data): data['visual_s'] = np.transpose(data['visual_s'][:, 0].numpy(), (0, 3, 1, 2)) data['visual_s_'] = np.transpose(data['visual_s_'][:, 0].numpy(), (0, 3, 1, 2)) data['pos'] = self.data_convert( np.transpose(random_crop(data['visual_s'], self.img_size), (0, 2, 3, 1))) data['visual_s'] = self.data_convert( np.transpose(random_crop(data['visual_s'], self.img_size), (0, 2, 3, 1))) data['visual_s_'] = self.data_convert( np.transpose(random_crop(data['visual_s_'], self.img_size), (0, 2, 3, 1))) return (data, ) for i in range(self.train_times_per_step): self._learn( function_dict={ 'train_function': _train, 'update_function': lambda: self.update_target_net_weights( self.critic_target_net.weights + self.encoder_target. trainable_variables, self.critic_net.weights + self. encoder.trainable_variables, self.ployak), 'summary_dict': dict([[ 'LEARNING_RATE/actor_lr', self.actor_lr(self.train_step) ], [ 'LEARNING_RATE/critic_lr', self.critic_lr(self.train_step) ], [ 'LEARNING_RATE/alpha_lr', self.alpha_lr(self.train_step) ]]), 'train_data_list': [ 's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'pos' ], 'pre_process_function': _pre_process }) @property def alpha(self): return tf.exp(self.log_alpha) @tf.function(experimental_relax_shapes=True) def train(self, memories, isw, crsty_loss, cell_state): s, visual_s, a, r, s_, visual_s_, done, pos = memories batch_size = tf.shape(a)[0] with tf.device(self.device): with tf.GradientTape(persistent=True) as tape: vis_feat = self.encoder(visual_s) vis_feat_ = self.encoder(visual_s_) target_vis_feat_ = self.encoder_target(visual_s_) feat = tf.concat([vis_feat, s], axis=-1) feat_ = tf.concat([vis_feat_, s_], axis=-1) target_feat_ = tf.concat([target_vis_feat_, s_], axis=-1) if self.is_continuous: target_mu, target_log_std = self.actor_net(feat_) target_log_std = clip_nn_log_std(target_log_std) target_pi, target_log_pi = squash_rsample( target_mu, target_log_std) else: target_logits = self.actor_net(feat_) target_cate_dist = tfp.distributions.Categorical( target_logits) target_pi = target_cate_dist.sample() target_log_pi = target_cate_dist.log_prob(target_pi) target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32) q1, q2 = self.critic_net(feat, a) q1_target, q2_target = self.critic_target_net(feat_, target_pi) dc_r_q1 = tf.stop_gradient( r + self.gamma * (1 - done) * (q1_target - self.alpha * target_log_pi)) dc_r_q2 = tf.stop_gradient( r + self.gamma * (1 - done) * (q2_target - self.alpha * target_log_pi)) td_error1 = q1 - dc_r_q1 td_error2 = q2 - dc_r_q2 q1_loss = 
tf.reduce_mean(tf.square(td_error1) * isw) q2_loss = tf.reduce_mean(tf.square(td_error2) * isw) critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss z_a = vis_feat # [B, N] z_out = self.encoder_target(pos) logits = tf.matmul( z_a, tf.matmul(self.curl_w, tf.transpose(z_out, [1, 0]))) logits -= tf.reduce_max(logits, axis=-1, keepdims=True) curl_loss = tf.reduce_mean( tf.keras.losses.sparse_categorical_crossentropy( tf.range(self.batch_size), logits)) critic_grads = tape.gradient(critic_loss, self.critic_tv) self.optimizer_critic.apply_gradients( zip(critic_grads, self.critic_tv)) curl_grads = tape.gradient(curl_loss, [self.curl_w] + self.encoder.trainable_variables) self.optimizer_curl.apply_gradients( zip(curl_grads, [self.curl_w] + self.encoder.trainable_variables)) with tf.GradientTape() as tape: if self.is_continuous: mu, log_std = self.actor_net(feat) log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max) pi, log_pi = squash_rsample(mu, log_std) entropy = gaussian_entropy(log_std) else: logits = self.actor_net(feat) logp_all = tf.nn.log_softmax(logits) gumbel_noise = tf.cast(self.gumbel_dist.sample( [batch_size, self.a_dim]), dtype=tf.float32) _pi = tf.nn.softmax( (logp_all + gumbel_noise) / self.discrete_tau) _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) pi = _pi_diff + _pi log_pi = tf.reduce_sum(tf.multiply(logp_all, pi), axis=1, keepdims=True) entropy = -tf.reduce_mean( tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True)) q_s_pi = self.critic_net.get_min(feat, pi) actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi) actor_grads = tape.gradient(actor_loss, self.actor_tv) self.optimizer_actor.apply_gradients( zip(actor_grads, self.actor_tv)) if self.auto_adaption: with tf.GradientTape() as tape: if self.is_continuous: mu, log_std = self.actor_net(feat) log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max) norm_dist = tfp.distributions.Normal( loc=mu, scale=tf.exp(log_std)) log_pi = tf.reduce_sum(norm_dist.log_prob( norm_dist.sample()), axis=-1) else: logits = self.actor_net(feat) cate_dist = tfp.distributions.Categorical(logits) log_pi = cate_dist.log_prob(cate_dist.sample()) alpha_loss = -tf.reduce_mean( self.alpha * tf.stop_gradient(log_pi + self.target_entropy)) alpha_grad = tape.gradient(alpha_loss, self.log_alpha) self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)]) self.global_step.assign_add(1) summaries = dict( [['LOSS/actor_loss', actor_loss], ['LOSS/q1_loss', q1_loss], ['LOSS/q2_loss', q2_loss], ['LOSS/critic_loss', critic_loss], ['LOSS/curl_loss', curl_loss], ['Statistics/log_alpha', self.log_alpha], ['Statistics/alpha', self.alpha], ['Statistics/entropy', entropy], ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))], ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))], ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))]]) if self.auto_adaption: summaries.update({'LOSS/alpha_loss': alpha_loss}) return (td_error1 + td_error2) / 2., summaries
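# For reference, the contrastive term that CURL.train() adds on top of the SAC losses can
# be written as a standalone function. This is only a sketch under a plain TensorFlow 2.x
# assumption, and the function name is illustrative: W plays the role of self.curl_w,
# z_a is the online-encoder output for one random crop and z_pos the target-encoder
# output for a second crop of the same frames, so the positive for row i sits on the
# diagonal and the labels are simply tf.range(batch). from_logits=True is used here for
# clarity; the class above instead subtracts the row-wise max and feeds the raw
# similarity scores to sparse_categorical_crossentropy.
import tensorflow as tf

def curl_contrastive_loss(z_a, z_pos, W):
    # bilinear similarity matrix: logits[i, j] = z_a[i] . W . z_pos[j]
    logits = tf.matmul(z_a, tf.matmul(W, z_pos, transpose_b=True))
    logits -= tf.reduce_max(logits, axis=-1, keepdims=True)   # numerical stability
    labels = tf.range(tf.shape(z_a)[0])                       # positives on the diagonal
    return tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True))

# toy usage with a hypothetical feature size of 8 and a batch of 4
feat_dim = 8
W = tf.Variable(tf.random.normal((feat_dim, feat_dim)))
loss = curl_contrastive_loss(tf.random.normal((4, feat_dim)),
                             tf.random.normal((4, feat_dim)), W)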
class MAXSQN(make_off_policy_class(mode='share')): ''' https://github.com/createamind/DRL/blob/master/spinup/algos/maxsqn/maxsqn.py ''' def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, alpha=0.2, beta=0.1, ployak=0.995, eps_init=1, eps_mid=0.2, eps_final=0.01, init2mid_annealing_step=1000, use_epsilon=False, q_lr=5.0e-4, alpha_lr=5.0e-4, auto_adaption=True, hidden_units=[32, 32], **kwargs): assert not is_continuous, 'maxsqn only support discrete action space' super().__init__(s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.expl_expt_mng = ExplorationExploitationClass( eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final, init2mid_annealing_step=init2mid_annealing_step, max_step=self.max_train_step) self.use_epsilon = use_epsilon self.ployak = ployak self.log_alpha = alpha if not auto_adaption else tf.Variable( initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True) self.auto_adaption = auto_adaption self.target_entropy = beta * np.log(self.a_dim) def _q_net(): return rls.critic_q_all(self.feat_dim, self.a_dim, hidden_units) self.critic_net = DoubleQ(_q_net) self.critic_target_net = DoubleQ(_q_net) self.critic_tv = self.critic_net.trainable_variables + self.other_tv self.update_target_net_weights(self.critic_target_net.weights, self.critic_net.weights) self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr]) self.optimizer_critic, self.optimizer_alpha = map( self.init_optimizer, [self.q_lr, self.alpha_lr]) self.model_recorder( dict(critic_net=self.critic_net, optimizer_critic=self.optimizer_critic, optimizer_alpha=self.optimizer_alpha)) def show_logo(self): self.recorder.logger.info(''' xx xx xxxxxx xxxxxx xxxx xx xxx xxx xxx xxx xxxx xxx xxxx xx xxx xxx xxxxx x xx xx xx xx xxxxx xx xxxx xxx xxxxxx xx xxx xxxxxx xx xxx xx xxx xx xxxx xx x x xxx xxxxx xxxxxx xx xx xx xxxxx xxxx xx x xxxxxx xxx xxx xxx x xxx xx xxxx xx xxx x xxx xx xxx xx xx xx xxxxx xx xxxx xx xxx x xx xxx xxxxx xxxxxxxxx xxx xxxx xx xxx xx xxx x xxxxxxxx xxx xxx xxxxxxx xxxxxxx xx xx xxxxxxx ''') @property def alpha(self): return tf.exp(self.log_alpha) def choose_action(self, s, visual_s, evaluation=False): if self.use_epsilon and np.random.uniform( ) < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation): a = np.random.randint(0, self.a_dim, self.n_agents) else: mu, pi, self.cell_state = self._get_action(s, visual_s, self.cell_state) a = pi.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state): with tf.device(self.device): feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True) q = self.critic_net.Q1(feat) cate_dist = tfp.distributions.Categorical(logits=q / self.alpha) pi = cate_dist.sample() return tf.argmax(q, axis=1), pi, cell_state def learn(self, **kwargs): self.train_step = kwargs.get('train_step') for i in range(self.train_times_per_step): self._learn( function_dict={ 'train_function': self.train, 'update_function': lambda: self.update_target_net_weights( self.critic_target_net.weights, self.critic_net. 
weights, self.ployak), 'summary_dict': dict([['LEARNING_RATE/q_lr', self.q_lr(self.train_step)], [ 'LEARNING_RATE/alpha_lr', self.alpha_lr(self.train_step) ]]) }) @tf.function(experimental_relax_shapes=True) def train(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done = memories with tf.device(self.device): with tf.GradientTape() as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) q1, q2 = self.critic_net(feat) q1_eval = tf.reduce_sum(tf.multiply(q1, a), axis=1, keepdims=True) q2_eval = tf.reduce_sum(tf.multiply(q2, a), axis=1, keepdims=True) q1_target, q2_target = self.critic_target_net(feat_) q1_target_max = tf.reduce_max(q1_target, axis=1, keepdims=True) q1_target_log_probs = tf.nn.log_softmax(q1_target / self.alpha, axis=1) + 1e-8 q1_target_entropy = -tf.reduce_mean( tf.reduce_sum( tf.exp(q1_target_log_probs) * q1_target_log_probs, axis=1, keepdims=True)) q2_target_max = tf.reduce_max(q2_target, axis=1, keepdims=True) # q2_target_log_probs = tf.nn.log_softmax(q2_target, axis=1) # q2_target_log_max = tf.reduce_max(q2_target_log_probs, axis=1, keepdims=True) q_target = tf.minimum( q1_target_max, q2_target_max) + self.alpha * q1_target_entropy dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done)) td_error1 = q1_eval - dc_r td_error2 = q2_eval - dc_r q1_loss = tf.reduce_mean(tf.square(td_error1) * isw) q2_loss = tf.reduce_mean(tf.square(td_error2) * isw) loss = 0.5 * (q1_loss + q2_loss) + crsty_loss loss_grads = tape.gradient(loss, self.critic_tv) self.optimizer_critic.apply_gradients( zip(loss_grads, self.critic_tv)) if self.auto_adaption: with tf.GradientTape() as tape: q1 = self.critic_net.Q1(feat) q1_log_probs = tf.nn.log_softmax(q1 / self.alpha, axis=1) + 1e-8 q1_entropy = -tf.reduce_mean( tf.reduce_sum(tf.exp(q1_log_probs) * q1_log_probs, axis=1, keepdims=True)) alpha_loss = -tf.reduce_mean( self.alpha * tf.stop_gradient(self.target_entropy - q1_entropy)) alpha_grad = tape.gradient(alpha_loss, self.log_alpha) self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)]) self.global_step.assign_add(1) summaries = dict( [['LOSS/loss', loss], ['Statistics/log_alpha', self.log_alpha], ['Statistics/alpha', self.alpha], ['Statistics/q1_entropy', q1_entropy], ['Statistics/q_min', tf.reduce_mean(tf.minimum(q1, q2))], ['Statistics/q_mean', tf.reduce_mean(q1)], ['Statistics/q_max', tf.reduce_mean(tf.maximum(q1, q2))]]) if self.auto_adaption: summaries.update({'LOSS/alpha_loss': alpha_loss}) return (td_error1 + td_error2) / 2, summaries
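# Finally, the core of MaxSqN is the soft target value built in train(): the next-state
# value is min(max_a Q1', max_a Q2') plus alpha times the entropy of the Boltzmann policy
# softmax(Q1'/alpha), the same distribution _get_action samples behaviour actions from.
# A compact sketch under a plain TensorFlow 2.x assumption, with an illustrative function
# name; note the class above averages the entropy bonus over the batch before adding it,
# while this sketch keeps it per-sample.
import tensorflow as tf

def maxsqn_target(q1_next, q2_next, alpha, r, done, gamma=0.99):
    log_probs = tf.nn.log_softmax(q1_next / alpha, axis=1)                           # [B, A]
    entropy = -tf.reduce_sum(tf.exp(log_probs) * log_probs, axis=1, keepdims=True)   # [B, 1]
    q_next = tf.minimum(tf.reduce_max(q1_next, axis=1, keepdims=True),
                        tf.reduce_max(q2_next, axis=1, keepdims=True)) + alpha * entropy
    return tf.stop_gradient(r + gamma * q_next * (1. - done))

# toy usage: batch of 2, 3 discrete actions
q1 = tf.constant([[1., 0., -1.], [.5, .5, .5]])
q2 = tf.constant([[.9, .1, -1.], [.4, .6, .5]])
target = maxsqn_target(q1, q2, alpha=0.2,
                       r=tf.constant([[1.], [0.]]), done=tf.constant([[0.], [1.]]))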