class C51(make_off_policy_class(mode='share')):
    '''
    Categorical DQN (C51), https://arxiv.org/abs/1707.06887
    No double DQN, no dueling, no noisy net.
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 v_min=-10,
                 v_max=10,
                 atoms=51,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 hidden_units=[128, 128],
                 **kwargs):
        assert not is_continuous, 'C51 only supports discrete action spaces'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.v_min = v_min
        self.v_max = v_max
        self.atoms = atoms
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = tf.reshape(
            tf.constant([self.v_min + i * self.delta_z for i in range(self.atoms)], dtype=tf.float32),
            [-1, self.atoms])  # [1, N]
        self.zb = tf.tile(self.z, tf.constant([self.a_dim, 1]))  # [A, N]
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _net(): return NetWork(self.feat_dim, self.a_dim, self.atoms, hidden_units)

        self.q_dist_net = _net()
        self.q_target_dist_net = _net()
        self.critic_tv = self.q_dist_net.trainable_variables + self.other_tv
        update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        self.model_recorder(dict(model=self.q_dist_net, optimizer=self.optimizer))

    def show_logo(self):
        self.logger.info('''
        xxxxxxx xxxxx xxx xxxx xxx xxxx xxxx xxxx x xxxx xx xxx x xxxxx xx xxx xxx xx xxx xxx xx xxx xx xx xxx x xx xx xx xxxxxxxx xxxxx xxxx xxxxx x xxxx
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
            a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            q = self.get_q(feat)  # [B, A]
        return tf.argmax(q, axis=-1), cell_state  # [B, ]

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _update():
            if self.global_step % self.assign_interval == 0:
                update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights)

        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': _update,
                'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                indexs = tf.reshape(tf.range(batch_size), [-1, 1])  # [B, 1]
                q_dist = self.q_dist_net(feat)  # [B, A, N]
                q_dist = tf.transpose(tf.reduce_sum(tf.transpose(q_dist, [2, 0, 1]) * a, axis=-1), [1, 0])  # [B, N]
                q_eval = tf.reduce_sum(q_dist * self.z, axis=-1)
                target_q_dist = self.q_target_dist_net(feat_)  # [B, A, N]
                target_q = tf.reduce_sum(self.zb * target_q_dist, axis=-1)  # [B, A, N] => [B, A]
                a_ = tf.reshape(tf.cast(tf.argmax(target_q, axis=-1), dtype=tf.int32), [-1, 1])  # [B, 1]
                target_q_dist = tf.gather_nd(target_q_dist, tf.concat([indexs, a_], axis=-1))  # [B, N]
                target = tf.tile(r, tf.constant([1, self.atoms])) \
                    + self.gamma * tf.multiply(self.z,  # [1, N]
                                               (1.0 - tf.tile(done, tf.constant([1, self.atoms]))))  # [B, N]; [1, N] * [B, N] = [B, N]
                target = tf.clip_by_value(target, self.v_min, self.v_max)  # [B, N]
                b = (target - self.v_min) / self.delta_z  # [B, N]
                u, l = tf.math.ceil(b), tf.math.floor(b)  # [B, N]
                u_id, l_id = tf.cast(u, tf.int32), tf.cast(l, tf.int32)  # [B, N]
                u_minus_b, b_minus_l = u - b, b - l  # [B, N]
                index_help = tf.tile(indexs, tf.constant([1, self.atoms]))  # [B, N]
                index_help = tf.expand_dims(index_help, -1)  # [B, N, 1]
                u_id = tf.concat([index_help, tf.expand_dims(u_id, -1)], axis=-1)  # [B, N, 2]
                l_id = tf.concat([index_help, tf.expand_dims(l_id, -1)], axis=-1)  # [B, N, 2]
                _cross_entropy = tf.stop_gradient(target_q_dist * u_minus_b) * tf.math.log(tf.gather_nd(q_dist, l_id)) \
                    + tf.stop_gradient(target_q_dist * b_minus_l) * tf.math.log(tf.gather_nd(q_dist, u_id))  # [B, N]
                # tf.debugging.check_numerics(_cross_entropy, '_cross_entropy')
                cross_entropy = -tf.reduce_sum(_cross_entropy, axis=-1)  # [B,]
                # tf.debugging.check_numerics(cross_entropy, 'cross_entropy')
                loss = tf.reduce_mean(cross_entropy * isw) + crsty_loss
                td_error = cross_entropy
            grads = tape.gradient(loss, self.critic_tv)
            self.optimizer.apply_gradients(zip(grads, self.critic_tv))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/loss', loss],
                ['Statistics/q_max', tf.reduce_max(q_eval)],
                ['Statistics/q_min', tf.reduce_min(q_eval)],
                ['Statistics/q_mean', tf.reduce_mean(q_eval)]
            ])

    @tf.function(experimental_relax_shapes=True)
    def get_q(self, feat):
        with tf.device(self.device):
            return tf.reduce_sum(self.zb * self.q_dist_net(feat), axis=-1)  # [B, A, N] => [B, A]
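# --- Illustrative sketch (not part of the original class) ---------------------
# The categorical projection inside C51.train() is easier to follow outside the
# TF graph. This NumPy sketch uses made-up inputs and the module-level numpy
# import; it mirrors the same math: every target atom z_j moves to
# r + gamma * z_j, is clipped to [v_min, v_max], and its probability mass is
# split between the neighbouring atoms l = floor(b) and u = ceil(b). The u == l
# edge case (b exactly on an atom), which the graph code above leaves with zero
# weight, is handled explicitly here.
def _c51_projection_demo(r=1.0, done=0.0, gamma=0.99, v_min=-10., v_max=10., atoms=51):
    delta_z = (v_max - v_min) / (atoms - 1)
    z = v_min + delta_z * np.arange(atoms)                   # support, [N]
    probs = np.full(atoms, 1. / atoms)                       # target distribution for one sample
    tz = np.clip(r + gamma * (1. - done) * z, v_min, v_max)  # shifted support
    b = (tz - v_min) / delta_z                               # fractional atom positions
    l, u = np.floor(b).astype(int), np.ceil(b).astype(int)
    m = np.zeros(atoms)                                      # projected distribution
    np.add.at(m, l, probs * (u - b))
    np.add.at(m, u, probs * (b - l))
    np.add.at(m, l[l == u], probs[l == u])                   # b landed exactly on an atom
    return m                                                 # sums to 1
# ------------------------------------------------------------------------------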
class CURL(make_off_policy_class(mode='no_share')):
    """
    CURL: Contrastive Unsupervised Representations for Reinforcement Learning, http://arxiv.org/abs/2004.04136
    """

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 alpha=0.2,
                 annealing=True,
                 last_alpha=0.01,
                 ployak=0.995,
                 discrete_tau=1.0,
                 log_std_bound=[-20, 2],
                 hidden_units={
                     'actor_continuous': {
                         'share': [128, 128],
                         'mu': [64],
                         'log_std': [64]
                     },
                     'actor_discrete': [64, 32],
                     'q': [128, 128],
                     'encoder': 128
                 },
                 auto_adaption=True,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 alpha_lr=5.0e-4,
                 curl_lr=5.0e-4,
                 img_size=64,
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        assert self.visual_sources == 1, 'CURL requires exactly one visual source'
        self.ployak = ployak
        self.discrete_tau = discrete_tau
        self.log_std_min, self.log_std_max = log_std_bound[:]
        self.auto_adaption = auto_adaption
        self.annealing = annealing
        self.img_size = img_size
        self.img_dim = [img_size, img_size, self.visual_dim[-1]]
        self.vis_feat_size = hidden_units['encoder']
        if self.auto_adaption:
            self.log_alpha = tf.Variable(initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True)
        else:
            self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha), name='log_alpha', dtype=tf.float32, trainable=False)
        if self.annealing:
            self.alpha_annealing = LinearAnnealing(alpha, last_alpha, 1.0e6)
        if self.is_continuous:
            self.actor_net = ActorCts(self.s_dim + self.vis_feat_size, self.a_dim, hidden_units['actor_continuous'])
        else:
            self.actor_net = ActorDcs(self.s_dim + self.vis_feat_size, self.a_dim, hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
        self.actor_tv = self.actor_net.trainable_variables
        # entropy = -log(1/|A|) = log |A|
        self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else np.log(self.a_dim))

        def _q_net(): return Critic(self.s_dim + self.vis_feat_size, self.a_dim, hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.encoder = VisualEncoder(self.img_dim, hidden_units['encoder'])
        self.encoder_target = VisualEncoder(self.img_dim, hidden_units['encoder'])
        self.curl_w = tf.Variable(initial_value=tf.random.normal(shape=(self.vis_feat_size, self.vis_feat_size)),
                                  name='curl_w', dtype=tf.float32, trainable=True)
        self.critic_tv = self.critic_net.trainable_variables + self.encoder.trainable_variables
        update_target_net_weights(
            self.critic_target_net.weights + self.encoder_target.trainable_variables,
            self.critic_net.weights + self.encoder.trainable_variables)
        self.actor_lr, self.critic_lr, self.alpha_lr, self.curl_lr = map(
            self.init_lr, [actor_lr, critic_lr, alpha_lr, curl_lr])
        self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha, self.optimizer_curl = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr, self.alpha_lr, self.curl_lr])
        self.model_recorder(dict(
            actor=self.actor_net,
            critic_net=self.critic_net,
            curl_w=self.curl_w,
            optimizer_actor=self.optimizer_actor,
            optimizer_critic=self.optimizer_critic,
            optimizer_alpha=self.optimizer_alpha,
            optimizer_curl=self.optimizer_curl,
        ))

    def show_logo(self):
        self.logger.info('''
        xxxxxx xxxxx xxxx xxxxxxxx xxxxx xxx xx xx xx xx xxx x xx xx x x x xxx x xx x x x xx x xxx x x xxxxxx x xxx x x xx xxx x xx xx xx xx x xx x xx xxx xxx xx xx x xxx x xxx xxxxxx xxxxxxx xxxxx xxx xxxxxxxx
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        visual_s = center_crop_image(visual_s[:, 0], self.img_size)
        mu, pi = self._get_action(s, visual_s)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s):
        with tf.device(self.device):
            feat = tf.concat([self.encoder(visual_s), s], axis=-1)
            if self.is_continuous:
                mu, log_std = self.actor_net(feat)
                log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                pi, _ = squash_rsample(mu, log_std)
                mu = tf.tanh(mu)  # squash mu
            else:
                logits = self.actor_net(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(memories, isw, crsty_loss, cell_state):
            td_error, summaries = self.train(memories, isw, crsty_loss, cell_state)
            if self.annealing and not self.auto_adaption:
                self.log_alpha.assign(tf.math.log(tf.cast(self.alpha_annealing(self.global_step.numpy()), tf.float32)))
            return td_error, summaries

        def _pre_process(data):
            data['visual_s'] = np.transpose(data['visual_s'][:, 0].numpy(), (0, 3, 1, 2))
            data['visual_s_'] = np.transpose(data['visual_s_'][:, 0].numpy(), (0, 3, 1, 2))
            data['pos'] = self.data_convert(np.transpose(random_crop(data['visual_s'], self.img_size), (0, 2, 3, 1)))
            data['visual_s'] = self.data_convert(np.transpose(random_crop(data['visual_s'], self.img_size), (0, 2, 3, 1)))
            data['visual_s_'] = self.data_convert(np.transpose(random_crop(data['visual_s_'], self.img_size), (0, 2, 3, 1)))
            return (data, )

        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': _train,
                'update_function': lambda: update_target_net_weights(
                    self.critic_target_net.weights + self.encoder_target.trainable_variables,
                    self.critic_net.weights + self.encoder.trainable_variables,
                    self.ployak),
                'summary_dict': dict([
                    ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                    ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)],
                    ['LEARNING_RATE/alpha_lr', self.alpha_lr(self.train_step)]
                ]),
                'train_data_list': ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'pos'],
                'pre_process_function': _pre_process
            })

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        s, visual_s, a, r, s_, visual_s_, done, pos = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                vis_feat = self.encoder(visual_s)
                vis_feat_ = self.encoder(visual_s_)
                target_vis_feat_ = self.encoder_target(visual_s_)
                feat = tf.concat([vis_feat, s], axis=-1)
                feat_ = tf.concat([vis_feat_, s_], axis=-1)
                target_feat_ = tf.concat([target_vis_feat_, s_], axis=-1)
                if self.is_continuous:
                    target_mu, target_log_std = self.actor_net(feat_)
                    target_log_std = clip_nn_log_std(target_log_std, self.log_std_min, self.log_std_max)
                    target_pi, target_log_pi = squash_rsample(target_mu, target_log_std)
                else:
                    target_logits = self.actor_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(target_logits)
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)
                    target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
                q1, q2 = self.critic_net(feat, a)
                q1_target, q2_target = self.critic_target_net(feat_, target_pi)
                dc_r_q1 = tf.stop_gradient(r + self.gamma * (1 - done) * (q1_target - self.alpha * target_log_pi))
                dc_r_q2 = tf.stop_gradient(r + self.gamma * (1 - done) * (q2_target - self.alpha * target_log_pi))
                td_error1 = q1 - dc_r_q1
                td_error2 = q2 - dc_r_q2
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss
                z_a = vis_feat  # [B, N]
                z_out = self.encoder_target(pos)
                logits = tf.matmul(z_a, tf.matmul(self.curl_w, tf.transpose(z_out, [1, 0])))
                logits -= tf.reduce_max(logits, axis=-1, keepdims=True)
                curl_loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(
                    tf.range(batch_size), logits, from_logits=True))
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            curl_grads = tape.gradient(curl_loss, [self.curl_w] + self.encoder.trainable_variables)
            self.optimizer_curl.apply_gradients(zip(curl_grads, [self.curl_w] + self.encoder.trainable_variables))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                    pi, log_pi = squash_rsample(mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi), axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                q_s_pi = self.critic_net.get_min(feat, pi)
                actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    if self.is_continuous:
                        mu, log_std = self.actor_net(feat)
                        log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                        norm_dist = tfp.distributions.Normal(loc=mu, scale=tf.exp(log_std))
                        log_pi = tf.reduce_sum(norm_dist.log_prob(norm_dist.sample()), axis=-1)
                    else:
                        logits = self.actor_net(feat)
                        cate_dist = tfp.distributions.Categorical(logits)
                        log_pi = cate_dist.log_prob(cate_dist.sample())
                    alpha_loss = -tf.reduce_mean(self.alpha * tf.stop_gradient(log_pi + self.target_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/q1_loss', q1_loss],
                ['LOSS/q2_loss', q2_loss],
                ['LOSS/critic_loss', critic_loss],
                ['LOSS/curl_loss', curl_loss],
                ['Statistics/log_alpha', self.log_alpha],
                ['Statistics/alpha', self.alpha],
                ['Statistics/entropy', entropy],
                ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))],
                ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))],
                ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))]
            ])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2., summaries
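# --- Illustrative sketch (not part of the original class) ---------------------
# The CURL objective above is a bilinear InfoNCE loss: anchors z_a come from the
# online encoder, positives z_pos from the target encoder on a second random
# crop of the same frames, and row i's correct "class" is i (its own positive).
# A minimal standalone restatement, assuming only the module's TensorFlow import:
def _curl_infonce_demo(z_a, z_pos, W):
    # z_a, z_pos: [B, N] embeddings; W: [N, N] learnable bilinear weight.
    logits = tf.matmul(z_a, tf.matmul(W, tf.transpose(z_pos)))  # [B, B]; logits[i, j] = z_a[i]^T W z_pos[j]
    logits -= tf.reduce_max(logits, axis=-1, keepdims=True)     # stability; softmax is shift-invariant
    labels = tf.range(tf.shape(z_a)[0])                         # positives sit on the diagonal
    return tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True))
# ------------------------------------------------------------------------------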
class MAXSQN(make_off_policy_class(mode='share')):
    '''
    https://github.com/createamind/DRL/blob/master/spinup/algos/maxsqn/maxsqn.py
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 alpha=0.2,
                 beta=0.1,
                 ployak=0.995,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 use_epsilon=False,
                 q_lr=5.0e-4,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 hidden_units=[32, 32],
                 **kwargs):
        assert not is_continuous, 'MaxSQN only supports discrete action spaces'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self.max_train_step)
        self.use_epsilon = use_epsilon
        self.ployak = ployak
        # store log(alpha), not alpha itself, so the `alpha` property
        # (tf.exp(self.log_alpha)) recovers the configured value when fixed
        self.log_alpha = tf.math.log(tf.constant(alpha, dtype=tf.float32)) if not auto_adaption else tf.Variable(
            initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True)
        self.auto_adaption = auto_adaption
        self.target_entropy = beta * np.log(self.a_dim)

        def _q_net(): return Critic(self.feat_dim, self.a_dim, hidden_units)

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        update_target_net_weights(self.critic_target_net.weights, self.critic_net.weights)
        self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
        self.optimizer_critic, self.optimizer_alpha = map(self.init_optimizer, [self.q_lr, self.alpha_lr])
        self.model_recorder(dict(critic_net=self.critic_net,
                                 optimizer_critic=self.optimizer_critic,
                                 optimizer_alpha=self.optimizer_alpha))

    def show_logo(self):
        self.logger.info('''
        xx xx xxxxxx xxxxxx xxxx xx xxx xxx xxx xxx xxxx xxx xxxx xx xxx xxx xxxxx x xx xx xx xx xxxxx xx xxxx xxx xxxxxx xx xxx xxxxxx xx xxx xx xxx xx xxxx xx x x xxx xxxxx xxxxxx xx xx xx xxxxx xxxx xx x xxxxxx xxx xxx xxx x xxx xx xxxx xx xxx x xxx xx xxx xx xx xx xxxxx xx xxxx xx xxx x xx xxx xxxxx xxxxxxxxx xxx xxxx xx xxx xx xxx x xxxxxxxx xxx xxx xxxxxxx xxxxxxx xx xx xxxxxxx
        ''')

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def choose_action(self, s, visual_s, evaluation=False):
        if self.use_epsilon and np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            mu, pi, self.cell_state = self._get_action(s, visual_s, self.cell_state)
            a = pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            q = self.critic_net.Q1(feat)
            cate_dist = tfp.distributions.Categorical(logits=q / self.alpha)
            pi = cate_dist.sample()
        return tf.argmax(q, axis=1), pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': lambda: update_target_net_weights(
                    self.critic_target_net.weights, self.critic_net.weights, self.ployak),
                'summary_dict': dict([
                    ['LEARNING_RATE/q_lr', self.q_lr(self.train_step)],
                    ['LEARNING_RATE/alpha_lr', self.alpha_lr(self.train_step)]
                ])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                q1, q2 = self.critic_net(feat)
                q1_eval = tf.reduce_sum(tf.multiply(q1, a), axis=1, keepdims=True)
                q2_eval = tf.reduce_sum(tf.multiply(q2, a), axis=1, keepdims=True)
                q1_target, q2_target = self.critic_target_net(feat_)
                q1_target_max = tf.reduce_max(q1_target, axis=1, keepdims=True)
                q1_target_log_probs = tf.nn.log_softmax(q1_target / self.alpha, axis=1) + 1e-8
                q1_target_entropy = -tf.reduce_mean(
                    tf.reduce_sum(tf.exp(q1_target_log_probs) * q1_target_log_probs, axis=1, keepdims=True))
                q2_target_max = tf.reduce_max(q2_target, axis=1, keepdims=True)
                # q2_target_log_probs = tf.nn.log_softmax(q2_target, axis=1)
                # q2_target_log_max = tf.reduce_max(q2_target_log_probs, axis=1, keepdims=True)
                q_target = tf.minimum(q1_target_max, q2_target_max) + self.alpha * q1_target_entropy
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error1 = q1_eval - dc_r
                td_error2 = q2_eval - dc_r
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                loss = 0.5 * (q1_loss + q2_loss) + crsty_loss
            loss_grads = tape.gradient(loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(loss_grads, self.critic_tv))
            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    q1 = self.critic_net.Q1(feat)
                    q1_log_probs = tf.nn.log_softmax(q1 / self.alpha, axis=1) + 1e-8
                    q1_entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(q1_log_probs) * q1_log_probs, axis=1, keepdims=True))
                    alpha_loss = -tf.reduce_mean(self.alpha * tf.stop_gradient(self.target_entropy - q1_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict([
                ['LOSS/loss', loss],
                ['Statistics/log_alpha', self.log_alpha],
                ['Statistics/alpha', self.alpha],
                ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))],
                ['Statistics/q_mean', tf.reduce_mean(q1)],
                ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))]
            ])
            if self.auto_adaption:
                # q1_entropy only exists on the auto-adaption path, so log it here
                summaries.update({'LOSS/alpha_loss': alpha_loss,
                                  'Statistics/q1_entropy': q1_entropy})
            return (td_error1 + td_error2) / 2, summaries
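# --- Illustrative sketch (not part of the original class) ---------------------
# MaxSQN's target above is min(max_a Q1', max_a Q2') + alpha * H(pi), where pi
# is the Boltzmann policy softmax(Q1' / alpha). A standalone restatement of the
# entropy term, assuming only the module's TensorFlow import:
def _soft_q_entropy_demo(q_target, alpha):
    # q_target: [B, A]; returns the mean entropy of softmax(q_target / alpha).
    log_probs = tf.nn.log_softmax(q_target / alpha, axis=1)  # [B, A]
    return -tf.reduce_mean(tf.reduce_sum(tf.exp(log_probs) * log_probs, axis=1, keepdims=True))
# ------------------------------------------------------------------------------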
class TAC(make_off_policy_class(mode='share')):
    """
    Tsallis Actor Critic (TAC), https://arxiv.org/abs/1902.00137
    """

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 alpha=0.2,
                 annealing=True,
                 last_alpha=0.01,
                 ployak=0.995,
                 entropic_index=1.5,
                 discrete_tau=1.0,
                 log_std_bound=[-20, 2],
                 hidden_units={
                     'actor_continuous': {
                         'share': [128, 128],
                         'mu': [64],
                         'log_std': [64]
                     },
                     'actor_discrete': [64, 32],
                     'q': [128, 128]
                 },
                 auto_adaption=True,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 alpha_lr=5.0e-4,
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.ployak = ployak
        self.discrete_tau = discrete_tau
        self.entropic_index = 2 - entropic_index
        self.log_std_min, self.log_std_max = log_std_bound[:]
        self.auto_adaption = auto_adaption
        self.annealing = annealing
        if self.auto_adaption:
            self.log_alpha = tf.Variable(initial_value=0.0, name='log_alpha', dtype=tf.float32, trainable=True)
        else:
            self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha), name='log_alpha', dtype=tf.float32, trainable=False)
        if self.annealing:
            self.alpha_annealing = LinearAnnealing(alpha, last_alpha, 1e6)
        if self.is_continuous:
            self.actor_net = ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
        else:
            self.actor_net = ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
        self.actor_tv = self.actor_net.trainable_variables
        # entropy = -log(1/|A|) = log |A|
        self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else np.log(self.a_dim))

        def _q_net(): return CriticQ1(self.feat_dim, self.a_dim, hidden_units['q'])

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        update_target_net_weights(self.critic_target_net.weights, self.critic_net.weights)
        self.actor_lr, self.critic_lr, self.alpha_lr = map(self.init_lr, [actor_lr, critic_lr, alpha_lr])
        self.optimizer_actor, self.optimizer_critic, self.optimizer_alpha = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr, self.alpha_lr])
        self.model_recorder(dict(
            actor=self.actor_net,
            critic_net=self.critic_net,
            log_alpha=self.log_alpha,
            optimizer_actor=self.optimizer_actor,
            optimizer_critic=self.optimizer_critic,
            optimizer_alpha=self.optimizer_alpha,
        ))

    def show_logo(self):
        self.logger.info('''
        xxxxxxxxx xx xxxxxx xx x xx xxx xxx xx xx x xx xxx xx xx x x xx xx x xx xx xxx x xxxxxx xxx x xx xx xx xx x xx xx xxx xxx xxxxx xxx xxxxx xxxxxx
        ''')

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def choose_action(self, s, visual_s, evaluation=False):
        mu, pi, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            if self.is_continuous:
                mu, log_std = self.actor_net(feat)
                log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                pi, _ = tsallis_squash_rsample(mu, log_std, self.entropic_index)
                mu = tf.tanh(mu)  # squash mu
            else:
                logits = self.actor_net(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(memories, isw, crsty_loss, cell_state):
            td_error, summaries = self.train(memories, isw, crsty_loss, cell_state)
            if self.annealing and not self.auto_adaption:
                self.log_alpha.assign(tf.math.log(tf.cast(self.alpha_annealing(self.global_step.numpy()), tf.float32)))
            return td_error, summaries

        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': _train,  # wrap self.train so fixed-alpha annealing takes effect
                'update_function': lambda: update_target_net_weights(
                    self.critic_target_net.weights, self.critic_net.weights, self.ployak),
                'summary_dict': dict([
                    ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                    ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)],
                    ['LEARNING_RATE/alpha_lr', self.alpha_lr(self.train_step)]
                ])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                if self.is_continuous:
                    target_mu, target_log_std = self.actor_net(feat_)
                    target_log_std = clip_nn_log_std(target_log_std, self.log_std_min, self.log_std_max)
                    target_pi, target_log_pi = tsallis_squash_rsample(target_mu, target_log_std, self.entropic_index)
                else:
                    target_logits = self.actor_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(target_logits)
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)
                    target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
                q1, q2 = self.critic_net(feat, a)
                q1_target, q2_target = self.critic_target_net(feat_, target_pi)
                dc_r_q1 = tf.stop_gradient(r + self.gamma * (1 - done) * (q1_target - self.alpha * target_log_pi))
                dc_r_q2 = tf.stop_gradient(r + self.gamma * (1 - done) * (q2_target - self.alpha * target_log_pi))
                td_error1 = q1 - dc_r_q1
                td_error2 = q2 - dc_r_q2
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                    pi, log_pi = tsallis_squash_rsample(mu, log_std, self.entropic_index)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi), axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                q_s_pi = self.critic_net.get_min(feat, pi)
                actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            if self.auto_adaption:
                with tf.GradientTape() as tape:
                    if self.is_continuous:
                        mu, log_std = self.actor_net(feat)
                        log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                        pi, log_pi = tsallis_squash_rsample(mu, log_std, self.entropic_index)
                    else:
                        logits = self.actor_net(feat)
                        cate_dist = tfp.distributions.Categorical(logits)
                        log_pi = cate_dist.log_prob(cate_dist.sample())
                    alpha_loss = -tf.reduce_mean(self.alpha * tf.stop_gradient(log_pi + self.target_entropy))
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/q1_loss', q1_loss],
                ['LOSS/q2_loss', q2_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/log_alpha', self.log_alpha],
                ['Statistics/alpha', self.alpha],
                ['Statistics/entropy', entropy],
                ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))],
                ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))],
                ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))]
            ])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2, summaries

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                if self.is_continuous:
                    mu, log_std = self.actor_net(feat)
                    log_std = clip_nn_log_std(log_std, self.log_std_min, self.log_std_max)
                    pi, log_pi = tsallis_squash_rsample(mu, log_std, self.entropic_index)
                    entropy = gaussian_entropy(log_std)
                    target_mu, target_log_std = self.actor_net(feat_)
                    target_log_std = clip_nn_log_std(target_log_std, self.log_std_min, self.log_std_max)
                    target_pi, target_log_pi = tsallis_squash_rsample(target_mu, target_log_std, self.entropic_index)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    pi = _pi_diff + _pi
                    log_pi = tf.reduce_sum(tf.multiply(logp_all, pi), axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                    target_logits = self.actor_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(target_logits)
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)  # log-prob of the sampled index, before one-hot
                    target_pi = tf.one_hot(target_pi, self.a_dim, dtype=tf.float32)
                q1, q2 = self.critic_net(feat, a)
                q1_target, q2_target = self.critic_target_net(feat_, target_pi)
                q_s_pi = self.critic_net.get_min(feat, pi)
                dc_r_q1 = tf.stop_gradient(r + self.gamma * (1 - done) * (q1_target - self.alpha * target_log_pi))
                dc_r_q2 = tf.stop_gradient(r + self.gamma * (1 - done) * (q2_target - self.alpha * target_log_pi))
                td_error1 = q1 - dc_r_q1
                td_error2 = q2 - dc_r_q2
                q1_loss = tf.reduce_mean(tf.square(td_error1) * isw)
                q2_loss = tf.reduce_mean(tf.square(td_error2) * isw)
                critic_loss = 0.5 * q1_loss + 0.5 * q2_loss + crsty_loss
                actor_loss = -tf.reduce_mean(q_s_pi - self.alpha * log_pi)
                if self.auto_adaption:
                    alpha_loss = -tf.reduce_mean(self.alpha * tf.stop_gradient(log_pi + self.target_entropy))
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            if self.auto_adaption:
                alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
                self.optimizer_alpha.apply_gradients([(alpha_grad, self.log_alpha)])
            self.global_step.assign_add(1)
            summaries = dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/q1_loss', q1_loss],
                ['LOSS/q2_loss', q2_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/log_alpha', self.log_alpha],
                ['Statistics/alpha', self.alpha],
                ['Statistics/entropy', entropy],
                ['Statistics/q_min', tf.reduce_min(tf.minimum(q1, q2))],
                ['Statistics/q_mean', tf.reduce_mean(tf.minimum(q1, q2))],
                ['Statistics/q_max', tf.reduce_max(tf.maximum(q1, q2))]
            ])
            if self.auto_adaption:
                summaries.update({'LOSS/alpha_loss': alpha_loss})
            return (td_error1 + td_error2) / 2, summaries
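# --- Illustrative sketch (not part of the original class) ---------------------
# Tsallis entropy generalizes Shannon entropy via the q-logarithm
# ln_q(x) = (x^(1-q) - 1) / (1 - q), which recovers ln(x) as q -> 1. The
# `tsallis_squash_rsample` helper used above is defined elsewhere in the
# library; this sketch only restates the q-logarithm it is presumably built on:
def _q_log_demo(x, q):
    # x: positive tensor; q: Python float entropic index.
    if q == 1:
        return tf.math.log(x)
    return (tf.pow(x, 1. - q) - 1.) / (1. - q)
# ------------------------------------------------------------------------------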
class SQL(make_off_policy_class(mode='share')):
    '''
    Soft Q-Learning. Reinforcement Learning with Deep Energy-Based Policies: https://arxiv.org/abs/1702.08165
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 lr=5.0e-4,
                 alpha=2,
                 ployak=0.995,
                 hidden_units=[32, 32],
                 **kwargs):
        assert not is_continuous, 'SQL only supports discrete action spaces'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.alpha = alpha
        self.ployak = ployak

        def _q_net(): return NetWork(self.feat_dim, self.a_dim, hidden_units)

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
        self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))

    def show_logo(self):
        self.logger.info('''
        xxxxxx xxxxxx xxxxx xxxxxxx xxxxxxxxx xxx xxx xxx xxxx xxx xx xxx xx xxx xxxx xx xxxxx xxx xxx xx xxxxx xxx xxx xx xxxxxx xxx xxx xx xxxx xxx xxx xx xx xxxx xxx xxx xx x xx xx xxxx xxx xx xx xxxxxxxx xxxxxxxxx xxxxxxxxx xxxxxxx xxxxxx xxxx xxxx
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            q_values = self.q_net(feat)
            # pi(a|s) ∝ exp((Q(s, a) - V(s)) / alpha), so (Q - V) / alpha are the logits
            logits = (q_values - self.get_v(q_values)) / self.alpha
            cate_dist = tfp.distributions.Categorical(logits=logits)
            pi = cate_dist.sample()
        return pi, cell_state

    @tf.function
    def get_v(self, q):
        with tf.device(self.device):
            # soft value; the mean over actions differs from the paper's sum
            # only by the constant alpha * log|A|
            v = self.alpha * tf.math.log(tf.reduce_mean(tf.math.exp(q / self.alpha), axis=1, keepdims=True))
        return v

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': lambda: update_target_net_weights(
                    self.q_target_net.weights, self.q_net.weights, self.ployak),
                'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                q = self.q_net(feat)
                q_next = self.q_target_net(feat_)
                v_next = self.get_v(q_next)
                q_eval = tf.reduce_sum(tf.multiply(q, a), axis=1, keepdims=True)
                q_target = tf.stop_gradient(r + self.gamma * (1 - done) * v_next)
                td_error = q_eval - q_target
                q_loss = tf.reduce_mean(tf.square(td_error) * isw) + crsty_loss
            grads = tape.gradient(q_loss, self.critic_tv)
            self.optimizer.apply_gradients(zip(grads, self.critic_tv))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/loss', q_loss],
                ['Statistics/q_max', tf.reduce_max(q_eval)],
                ['Statistics/q_min', tf.reduce_min(q_eval)],
                ['Statistics/q_mean', tf.reduce_mean(q_eval)]
            ])
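# --- Illustrative sketch (not part of the original class) ---------------------
# SQL.get_v() above uses a mean where the paper's soft value uses a sum:
# V = alpha * log sum_a exp(Q(s, a) / alpha). The two differ only by the
# constant alpha * log|A|, so the induced Boltzmann policy is identical. A
# standalone NumPy check, assuming the module-level numpy import:
def _soft_value_demo():
    q, alpha = np.array([1.0, 2.0, 3.0]), 2.0
    v_mean = alpha * np.log(np.mean(np.exp(q / alpha)))  # what get_v() computes
    v_sum = alpha * np.log(np.sum(np.exp(q / alpha)))    # the paper's soft value
    assert np.isclose(v_sum - v_mean, alpha * np.log(len(q)))
    return v_mean, v_sum
# ------------------------------------------------------------------------------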
class DDPG(make_off_policy_class(mode='share')):
    '''
    Deep Deterministic Policy Gradient, https://arxiv.org/abs/1509.02971
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 ployak=0.995,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 hidden_units={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.ployak = ployak
        self.discrete_tau = discrete_tau
        if self.is_continuous:
            def _actor_net(): return ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            # self.action_noise = NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
            self.action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))
        else:
            def _actor_net(): return ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
        self.actor_net = _actor_net()
        self.actor_target_net = _actor_net()
        self.actor_tv = self.actor_net.trainable_variables

        def _q_net(): return Critic(self.feat_dim, self.a_dim, hidden_units['q'])

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        update_target_net_weights(
            self.actor_target_net.weights + self.q_target_net.weights,
            self.actor_net.weights + self.q_net.weights)
        self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(self.init_optimizer, [self.actor_lr, self.critic_lr])
        self.model_recorder(dict(actor=self.actor_net,
                                 critic=self.q_net,
                                 optimizer_actor=self.optimizer_actor,
                                 optimizer_critic=self.optimizer_critic))

    def show_logo(self):
        self.logger.info('''
        xxxxxxx xxxxxxx xxxxxxxx xxxxxx x xxx x xxx xx xx xxx xx x xx x xx x xxx xx x x xx x xx x xxx xx x xxx x xxx xxxxxx x xxxxx x xx x xx x xx xxx x xx x xx x xx x x xxx x xxx x xxx xx xxxxxxx xxxxxxx xxxxx xxxxxx xx
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        mu, pi, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            if self.is_continuous:
                mu = self.actor_net(feat)
                pi = tf.clip_by_value(mu + self.action_noise(), -1, 1)
            else:
                logits = self.actor_net(feat)
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(logits)
                pi = cate_dist.sample()
            return mu, pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': lambda: update_target_net_weights(
                    self.actor_target_net.weights + self.q_target_net.weights,
                    self.actor_net.weights + self.q_net.weights,
                    self.ployak),
                'summary_dict': dict([
                    ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                    ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)]
                ])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                if self.is_continuous:
                    target_mu = self.actor_target_net(feat_)
                    action_target = tf.clip_by_value(target_mu + self.action_noise(), -1, 1)
                else:
                    target_logits = self.actor_target_net(feat_)
                    logp_all = tf.nn.log_softmax(target_logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    action_target = _pi_diff + _pi
                q = self.q_net(feat, a)
                q_target = self.q_target_net(feat_, action_target)
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error = q - dc_r
                q_loss = 0.5 * tf.reduce_mean(tf.square(td_error) * isw) + crsty_loss
            q_grads = tape.gradient(q_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(q_grads, self.critic_tv))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                else:
                    logits = self.actor_net(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1), self.a_dim, dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q_actor = self.q_net(feat, mu)
                actor_loss = -tf.reduce_mean(q_actor)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', q_loss],
                ['Statistics/q_min', tf.reduce_min(q)],
                ['Statistics/q_mean', tf.reduce_mean(q)],
                ['Statistics/q_max', tf.reduce_max(q)]
            ])

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                if self.is_continuous:
                    target_mu = self.actor_target_net(feat_)
                    action_target = tf.clip_by_value(target_mu + self.action_noise(), -1, 1)
                    mu = self.actor_net(feat)
                else:
                    target_logits = self.actor_target_net(feat_)
                    logp_all = tf.nn.log_softmax(target_logits)
                    gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32)
                    _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    action_target = _pi_diff + _pi
                    logits = self.actor_net(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1), self.a_dim, dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q = self.q_net(feat, a)
                q_target = self.q_target_net(feat_, action_target)
                dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done))
                td_error = q - dc_r
                q_loss = 0.5 * tf.reduce_mean(tf.square(td_error) * isw) + crsty_loss
                q_actor = self.q_net(feat, mu)
                actor_loss = -tf.reduce_mean(q_actor)
            q_grads = tape.gradient(q_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(q_grads, self.critic_tv))
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', q_loss],
                ['Statistics/q_min', tf.reduce_min(q)],
                ['Statistics/q_mean', tf.reduce_mean(q)],
                ['Statistics/q_max', tf.reduce_max(q)]
            ])
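# --- Illustrative sketch (not part of the original class) ---------------------
# The `_pi_diff + _pi` pattern used throughout the discrete branches is the
# straight-through estimator: the forward pass yields the one-hot argmax action,
# while gradients flow through the differentiable (Gumbel-)softmax. A minimal
# standalone restatement, assuming only the module's TensorFlow import:
def _straight_through_demo(logits):
    probs = tf.nn.softmax(logits)                                       # differentiable
    one_hot = tf.one_hot(tf.argmax(probs, axis=-1), tf.shape(logits)[-1],
                         dtype=probs.dtype)                             # non-differentiable
    return tf.stop_gradient(one_hot - probs) + probs  # value == one_hot, gradient == d(probs)
# ------------------------------------------------------------------------------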
class BootstrappedDQN(make_off_policy_class(mode='share')):
    '''
    Deep Exploration via Bootstrapped DQN, http://arxiv.org/abs/1602.04621
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 head_num=4,
                 hidden_units=[32, 32],
                 **kwargs):
        assert not is_continuous, 'Bootstrapped DQN only supports discrete action spaces'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self.max_train_step)
        self.assign_interval = assign_interval
        self.head_num = head_num
        self._probs = [1. / head_num for _ in range(head_num)]
        self.now_head = 0

        def _q_net(): return NetWork(self.feat_dim, self.a_dim, self.head_num, hidden_units)

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))

    def show_logo(self):
        self.logger.info('''
        xxxxxxx xxxxxxxx xxxxxx xxxx xxxx xx xxxx xxxxxxxx xxx xxxx xxx x xx xxx xx xxx xxx xxxx xxxx x xx xxx xx xxx xxx xxx xxxxx x xxxxxx xxx xxxx xxx xx xx xx xxx x xxxx x xx xxxx xxx xxxx xxx xx xx xxx xxx x xxxxx xx xxx xxx xx xxx xx xxx xxx xxx x xxxx xx xx xx xxxx xxx xxx x xxx xx xxxx xxxxxxxx xxxxxxxx xxx xx xxxxxxxx xxxxxxx xxxxx xxxx xxx
        ''')

    def reset(self):
        super().reset()
        # follow one randomly chosen head for the whole episode
        self.now_head = np.random.randint(self.head_num)

    def choose_action(self, s, visual_s, evaluation=False):
        if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation):
            a = np.random.randint(0, self.a_dim, self.n_agents)
        else:
            q, self.cell_state = self._get_action(s, visual_s, self.cell_state)
            q = q.numpy()
            a = np.argmax(q[self.now_head], axis=1)  # [H, B, A] => [B, A] => [B, ]
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            q_values = self.q_net(feat)  # [H, B, A]
        return q_values, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _update():
            if self.global_step % self.assign_interval == 0:
                update_target_net_weights(self.q_target_net.weights, self.q_net.weights)

        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': _update,
                'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done = memories
        batch_size = tf.shape(a)[0]
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                q = self.q_net(feat)  # [H, B, A]
                q_next = self.q_target_net(feat_)  # [H, B, A]
                q_eval = tf.reduce_sum(tf.multiply(q, a), axis=-1, keepdims=True)  # [H, B, A] * [B, A] => [H, B, 1]
                q_target = tf.stop_gradient(r + self.gamma * (1 - done) * tf.reduce_max(q_next, axis=-1, keepdims=True))
                td_error = q_eval - q_target  # [H, B, 1]
                td_error = tf.reduce_sum(td_error, axis=-1)  # [H, B]
                mask_dist = tfp.distributions.Bernoulli(probs=self._probs)
                mask = tf.cast(tf.transpose(mask_dist.sample(batch_size), [1, 0]), tf.float32)  # [H, B]
                # each head only trains on the transitions its bootstrap mask selects
                q_loss = tf.reduce_mean(tf.square(td_error * mask) * isw) + crsty_loss
            grads = tape.gradient(q_loss, self.critic_tv)
            self.optimizer.apply_gradients(zip(grads, self.critic_tv))
            self.global_step.assign_add(1)
            return tf.reduce_mean(td_error, axis=0), dict([  # [H, B] => [B, ]
                ['LOSS/loss', q_loss],
                ['Statistics/q_max', tf.reduce_max(q_eval)],
                ['Statistics/q_min', tf.reduce_min(q_eval)],
                ['Statistics/q_mean', tf.reduce_mean(q_eval)]
            ])
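# --- Illustrative sketch (not part of the original class) ---------------------
# Bootstrapped DQN trains each head on a random subset of transitions so the
# heads disagree, which drives deep exploration; one head is then followed for
# an entire episode (see reset() above). A standalone sketch of the mask,
# assuming only the module's TensorFlow/TFP imports:
def _bootstrap_mask_demo(head_num=4, batch_size=3):
    dist = tfp.distributions.Bernoulli(probs=[1. / head_num] * head_num, dtype=tf.float32)
    mask = tf.transpose(dist.sample(batch_size), [1, 0])  # [H, B]
    return mask  # head h trains on sample b iff mask[h, b] == 1
# ------------------------------------------------------------------------------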
class HIRO(make_off_policy_class(mode='no_share')): ''' Data-Efficient Hierarchical Reinforcement Learning, http://arxiv.org/abs/1805.08296 ''' def __init__( self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, ployak=0.995, high_scale=1.0, reward_scale=1.0, sample_g_nums=100, sub_goal_steps=10, fn_goal_dim=0, intrinsic_reward_mode='os', high_batch_size=256, high_buffer_size=100000, low_batch_size=8, low_buffer_size=10000, high_actor_lr=1.0e-4, high_critic_lr=1.0e-3, low_actor_lr=1.0e-4, low_critic_lr=1.0e-3, hidden_units={ 'high_actor': [64, 64], 'high_critic': [64, 64], 'low_actor': [64, 64], 'low_critic': [64, 64] }, **kwargs): assert visual_sources == 0, 'HIRO doesn\'t support visual inputs.' super().__init__(s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.data_high = ExperienceReplay(high_batch_size, high_buffer_size) self.data_low = ExperienceReplay(low_batch_size, low_buffer_size) self.ployak = ployak self.high_scale = np.array( high_scale if isinstance(high_scale, list) else [high_scale] * self.s_dim, dtype=np.float32) self.reward_scale = reward_scale self.fn_goal_dim = fn_goal_dim self.sample_g_nums = sample_g_nums self.sub_goal_steps = sub_goal_steps self.sub_goal_dim = self.s_dim - self.fn_goal_dim self.high_noise = ClippedNormalActionNoise( mu=np.zeros(self.sub_goal_dim), sigma=self.high_scale * np.ones(self.sub_goal_dim), bound=self.high_scale / 2) self.low_noise = ClippedNormalActionNoise(mu=np.zeros(self.a_dim), sigma=1.0 * np.ones(self.a_dim), bound=0.5) def _high_actor_net(): return ActorCts(self.s_dim, self.sub_goal_dim, hidden_units['high_actor']) if self.is_continuous: def _low_actor_net(): return ActorCts(self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units['low_actor']) else: def _low_actor_net(): return ActorDcs(self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units['low_actor']) self.gumbel_dist = tfd.Gumbel(0, 1) self.high_actor = _high_actor_net() self.high_actor_target = _high_actor_net() self.low_actor = _low_actor_net() self.low_actor_target = _low_actor_net() def _high_critic_net(): return Critic(self.s_dim, self.sub_goal_dim, hidden_units['high_critic']) def _low_critic_net(): return Critic(self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units['low_critic']) self.high_critic = DoubleQ(_high_critic_net) self.high_critic_target = DoubleQ(_high_critic_net) self.low_critic = DoubleQ(_low_critic_net) self.low_critic_target = DoubleQ(_low_critic_net) update_target_net_weights( self.low_actor_target.weights + self.low_critic_target.weights + self.high_actor_target.weights + self.high_critic_target.weights, self.low_actor.weights + self.low_critic.weights + self.high_actor.weights + self.high_critic.weights) self.low_actor_lr, self.low_critic_lr = map( self.init_lr, [low_actor_lr, low_critic_lr]) self.high_actor_lr, self.high_critic_lr = map( self.init_lr, [high_actor_lr, high_critic_lr]) self.low_actor_optimizer, self.low_critic_optimizer = map( self.init_optimizer, [self.low_actor_lr, self.low_critic_lr]) self.high_actor_optimizer, self.high_critic_optimizer = map( self.init_optimizer, [self.high_actor_lr, self.high_critic_lr]) self.model_recorder( dict(high_actor=self.high_actor, high_critic=self.high_critic, low_actor=self.low_actor, low_critic=self.low_critic, low_actor_optimizer=self.low_actor_optimizer, low_critic_optimizer=self.low_critic_optimizer, high_actor_optimizer=self.high_actor_optimizer, 
high_critic_optimizer=self.high_critic_optimizer)) self.counts = 0 self._high_s = [[] for _ in range(self.n_agents)] self._noop_subgoal = np.random.uniform(-self.high_scale, self.high_scale, size=(self.n_agents, self.sub_goal_dim)) self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode) def generate_ir_func(self, mode='os'): if mode == 'os': return lambda last_feat, subgoal, feat: -tf.norm( last_feat + subgoal - feat, ord=2, axis=-1, keepdims=True) elif mode == 'cos': return lambda last_feat, subgoal, feat: tf.expand_dims( -tf.keras.losses.cosine_similarity( tf.cast(feat - last_feat, tf.float32), tf.cast(subgoal, tf.float32), axis=-1), axis=-1) def show_logo(self): self.logger.info(''' xxxxx xxxxx xxxx xxxxxxx xxxxxx xx xx xx xxxxxxx xxx xxxx xx xx xx xx xxx xxx xxx xx xx xx xx xxx xx xxx xxxxxxx xx xxxxxx xx xxx xx xx xx xxxxxx xx xxx xx xx xx xx xxxx xx xxx xx xx xx xx xxx xxx xxx xxxxx xxxxx xxxx xxxxx xxxx xxxxxxx ''') def store_high_buffer(self, i): eps_len = len(self._high_s[i]) intervals = list(range(0, eps_len, self.sub_goal_steps)) if len(intervals) < 1: return left = intervals[:-1] right = intervals[1:] s, r, a, g, d, s_ = [], [], [], [], [], [] for _l, _r in zip(left, right): s.append(self._high_s[i][_l:_r]) r.append(sum(self._high_r[i][_l:_r]) * self.reward_scale) a.append(self._high_a[i][_l:_r]) g.append(self._subgoals[i][_l]) d.append(self._done[i][_r - 1]) s_.append(self._high_s_[i][_r - 1]) right = intervals[-1] s.append(self._high_s[i][right:eps_len] + [self._high_s[i][-1]] * (self.sub_goal_steps + right - eps_len)) r.append(sum(self._high_r[i][right:eps_len])) a.append(self._high_a[i][right:eps_len] + [self._high_a[i][-1]] * (self.sub_goal_steps + right - eps_len)) g.append(self._subgoals[i][right]) d.append(self._done[i][-1]) s_.append(self._high_s_[i][-1]) self.data_high.add(np.array(s), np.array(r)[:, np.newaxis], np.array(a), np.array(g), np.array(d)[:, np.newaxis], np.array(s_)) def reset(self): self._c = np.full((self.n_agents, 1), self.sub_goal_steps, np.int32) for i in range(self.n_agents): self.store_high_buffer(i) self._high_r = [[] for _ in range(self.n_agents)] self._high_a = [[] for _ in range(self.n_agents)] self._high_s = [[] for _ in range(self.n_agents)] self._subgoals = [[] for _ in range(self.n_agents)] self._done = [[] for _ in range(self.n_agents)] self._high_s_ = [[] for _ in range(self.n_agents)] self._new_subgoal = np.zeros((self.n_agents, self.sub_goal_dim), dtype=np.float32) def partial_reset(self, done): self._c = np.where( done[:, np.newaxis], np.full((self.n_agents, 1), self.sub_goal_steps, np.int32), self._c) idx = np.where(done)[0] for i in idx: self.store_high_buffer(i) self._high_s[i] = [] self._high_a[i] = [] self._high_s_[i] = [] self._high_r[i] = [] self._done[i] = [] self._subgoals[i] = [] @tf.function def _get_action(self, s, visual_s, subgoal): with tf.device(self.device): feat = tf.concat([s, subgoal], axis=-1) if self.is_continuous: mu = self.low_actor(feat) pi = tf.clip_by_value(mu + self.low_noise(), -1, 1) else: logits = self.low_actor(feat) mu = tf.argmax(logits, axis=1) cate_dist = tfd.Categorical(logits) pi = cate_dist.sample() return mu, pi def choose_action(self, s, visual_s, evaluation=False): self._subgoal = np.where(self._c == self.sub_goal_steps, self.get_subgoal(s).numpy(), self._new_subgoal) mu, pi = self._get_action(s, visual_s, self._subgoal) a = mu.numpy() if evaluation else pi.numpy() return a @tf.function def get_subgoal(self, s): ''' last_s 上一个隐状态 subgoal 上一个子目标 s 当前隐状态 ''' new_subgoal = self.high_scale 
* self.high_actor(s) new_subgoal = tf.clip_by_value(new_subgoal + self.high_noise(), -self.high_scale, self.high_scale) return new_subgoal def learn(self, **kwargs): self.train_step = kwargs.get('train_step') for i in range(self.train_times_per_step): if self.data_low.is_lg_batch_size and self.data_high.is_lg_batch_size: self.intermediate_variable_reset() low_data = self.get_transitions( self.data_low, data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_']) high_data = self.get_transitions( self.data_high, data_name_list=['s', 'r', 'a', 'g', 'done', 's_']) # --------------------------------------获取需要传给train函数的参数 _low_training_data = self.get_value_from_dict( data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'], data_dict=low_data) _high_training_data = self.get_value_from_dict( data_name_list=['s', 'r', 'a', 'g', 'done', 's_'], data_dict=high_data) summaries = self.train_low(_low_training_data) self.summaries.update(summaries) update_target_net_weights( self.low_actor_target.weights + self.low_critic_target.weights, self.low_actor.weights + self.low_critic.weights, self.ployak) if self.counts % self.sub_goal_steps == 0: self.counts = 0 high_summaries = self.train_high(_high_training_data) self.summaries.update(high_summaries) update_target_net_weights( self.high_actor_target.weights + self.high_critic_target.weights, self.high_actor.weights + self.high_critic.weights, self.ployak) self.counts += 1 self.summaries.update( dict([[ 'LEARNING_RATE/low_actor_lr', self.low_actor_lr(self.train_step) ], [ 'LEARNING_RATE/low_critic_lr', self.low_critic_lr(self.train_step) ], [ 'LEARNING_RATE/high_actor_lr', self.high_actor_lr(self.train_step) ], [ 'LEARNING_RATE/high_critic_lr', self.high_critic_lr(self.train_step) ]])) self.write_training_summaries(self.global_step, self.summaries) @tf.function(experimental_relax_shapes=True) def train_low(self, memories): s, a, r, s_, done, g, g_ = memories with tf.device(self.device): with tf.GradientTape() as tape: feat = tf.concat([s, g], axis=-1) feat_ = tf.concat([s_, g_], axis=-1) if self.is_continuous: target_mu = self.low_actor_target(feat_) action_target = tf.clip_by_value( target_mu + self.low_noise(), -1, 1) else: target_logits = self.low_actor_target(feat_) logp_all = tf.nn.log_softmax(target_logits) gumbel_noise = tf.cast(self.gumbel_dist.sample( [tf.shape(feat_)[0], self.a_dim]), dtype=tf.float32) _pi = tf.nn.softmax((logp_all + gumbel_noise) / 1.) 
_pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) action_target = _pi_diff + _pi q1, q2 = self.low_critic(feat, a) q = tf.minimum(q1, q2) q_target = self.low_critic_target.get_min(feat_, action_target) dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done)) td_error1 = q1 - dc_r td_error2 = q2 - dc_r q1_loss = tf.reduce_mean(tf.square(td_error1)) q2_loss = tf.reduce_mean(tf.square(td_error2)) low_critic_loss = q1_loss + q2_loss low_critic_grads = tape.gradient(low_critic_loss, self.low_critic.weights) self.low_critic_optimizer.apply_gradients( zip(low_critic_grads, self.low_critic.weights)) with tf.GradientTape() as tape: if self.is_continuous: mu = self.low_actor(feat) else: logits = self.low_actor(feat) _pi = tf.nn.softmax(logits) _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1), self.a_dim, dtype=tf.float32) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) mu = _pi_diff + _pi q_actor = self.low_critic.Q1(feat, mu) low_actor_loss = -tf.reduce_mean(q_actor) low_actor_grads = tape.gradient(low_actor_loss, self.low_actor.trainable_variables) self.low_actor_optimizer.apply_gradients( zip(low_actor_grads, self.low_actor.trainable_variables)) self.global_step.assign_add(1) return dict([['LOSS/low_actor_loss', low_actor_loss], ['LOSS/low_critic_loss', low_critic_loss], ['Statistics/low_q_min', tf.reduce_min(q)], ['Statistics/low_q_mean', tf.reduce_mean(q)], ['Statistics/low_q_max', tf.reduce_max(q)]]) @tf.function(experimental_relax_shapes=True) def train_high(self, memories): # s_ : [B, N] ss, r, aa, g, done, s_ = memories batchs = tf.shape(ss)[0] # ss, aa [B, T, *] with tf.device(self.device): with tf.GradientTape() as tape: s = ss[:, 0] # [B, N] true_end = (s_ - s)[:, self.fn_goal_dim:] g_dist = tfd.Normal(loc=true_end, scale=0.5 * self.high_scale[None, :]) ss = tf.expand_dims(ss, 0) # [1, B, T, *] ss = tf.tile(ss, [self.sample_g_nums, 1, 1, 1]) # [10, B, T, *] ss = tf.reshape(ss, [-1, tf.shape(ss)[-1]]) # [10*B*T, *] aa = tf.expand_dims(aa, 0) # [1, B, T, *] aa = tf.tile(aa, [self.sample_g_nums, 1, 1, 1]) # [10, B, T, *] aa = tf.reshape(aa, [-1, tf.shape(aa)[-1]]) # [10*B*T, *] gs = tf.concat([ tf.expand_dims(g, 0), tf.expand_dims(true_end, 0), tf.clip_by_value(g_dist.sample(self.sample_g_nums - 2), -self.high_scale, self.high_scale) ], axis=0) # [10, B, N] all_g = gs + s[:, self.fn_goal_dim:] all_g = tf.expand_dims(all_g, 2) # [10, B, 1, N] all_g = tf.tile( all_g, [1, 1, self.sub_goal_steps, 1]) # [10, B, T, N] all_g = tf.reshape(all_g, [-1, tf.shape(all_g)[-1]]) # [10*B*T, N] all_g = all_g - ss[:, self.fn_goal_dim:] # [10*B*T, N] feat = tf.concat([ss, all_g], axis=-1) # [10*B*T, *] _aa = self.low_actor(feat) # [10*B*T, A] if not self.is_continuous: _aa = tf.one_hot(tf.argmax(_aa, axis=-1), self.a_dim, dtype=tf.float32) diff = _aa - aa diff = tf.reshape( diff, [self.sample_g_nums, batchs, self.sub_goal_steps, -1 ]) # [10, B, T, A] diff = tf.transpose(diff, [1, 0, 2, 3]) # [B, 10, T, A] logps = -0.5 * tf.reduce_sum(tf.norm(diff, ord=2, axis=-1)**2, axis=-1) # [B, 10] idx = tf.argmax(logps, axis=-1, output_type=tf.int32) idx = tf.stack([tf.range(batchs), idx], axis=1) # [B, 2] g = tf.gather_nd(tf.transpose(gs, [1, 0, 2]), idx) # [B, N] q1, q2 = self.high_critic(s, g) q = tf.minimum(q1, q2) target_sub_goal = self.high_actor_target(s_) * self.high_scale target_sub_goal = tf.clip_by_value( target_sub_goal + self.high_noise(), -self.high_scale, self.high_scale) q_target = self.high_critic_target.get_min(s_, 
target_sub_goal) dc_r = tf.stop_gradient(r + self.gamma * (1 - done) * q_target) td_error1 = q1 - dc_r td_error2 = q2 - dc_r q1_loss = tf.reduce_mean(tf.square(td_error1)) q2_loss = tf.reduce_mean(tf.square(td_error2)) high_critic_loss = q1_loss + q2_loss high_critic_grads = tape.gradient(high_critic_loss, self.high_critic.weights) self.high_critic_optimizer.apply_gradients( zip(high_critic_grads, self.high_critic.weights)) with tf.GradientTape() as tape: mu = self.high_actor(s) * self.high_scale q_actor = self.high_critic.Q1(s, mu) high_actor_loss = -tf.reduce_mean(q_actor) high_actor_grads = tape.gradient( high_actor_loss, self.high_actor.trainable_variables) self.high_actor_optimizer.apply_gradients( zip(high_actor_grads, self.high_actor.trainable_variables)) return dict([['LOSS/high_actor_loss', high_actor_loss], ['LOSS/high_critic_loss', high_critic_loss], ['Statistics/high_q_min', tf.reduce_min(q)], ['Statistics/high_q_mean', tf.reduce_mean(q)], ['Statistics/high_q_max', tf.reduce_max(q)]]) def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done): assert isinstance(a, np.ndarray), "store need action type is np.ndarray" assert isinstance(r, np.ndarray), "store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "store need done type is np.ndarray" [o.append(_s) for o, _s in zip(self._high_s, s)] [o.append(_a) for o, _a in zip(self._high_a, a)] [o.append(_r) for o, _r in zip(self._high_r, r)] [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)] [o.append(_d) for o, _d in zip(self._done, done)] [ o.append(_subgoal) for o, _subgoal in zip(self._subgoals, self._noop_subgoal) ] ir = self.get_ir(s[:, self.fn_goal_dim:], self._noop_subgoal, s_[:, self.fn_goal_dim:]) # subgoal = s[:, self.fn_goal_dim:] + self._noop_subgoal - s_[:, self.fn_goal_dim:] subgoal = np.random.uniform(-self.high_scale, self.high_scale, size=(self.n_agents, self.sub_goal_dim)) self.data_low.add( s, a, ir, s_, done[:, np.newaxis], # expand dims to [B, 1] self._noop_subgoal, subgoal) self._noop_subgoal = subgoal def store_data(self, s, visual_s, a, r, s_, visual_s_, done): """ for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer.
""" assert isinstance(a, np.ndarray), "store need action type is np.ndarray" assert isinstance(r, np.ndarray), "store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "store need done type is np.ndarray" [o.append(_s) for o, _s in zip(self._high_s, s)] [o.append(_a) for o, _a in zip(self._high_a, a)] [o.append(_r) for o, _r in zip(self._high_r, r)] [o.append(_s_) for o, _s_ in zip(self._high_s_, s_)] [o.append(_d) for o, _d in zip(self._done, done)] [ o.append(_subgoal) for o, _subgoal in zip(self._subgoals, self._subgoal) ] ir = self.get_ir(s[:, self.fn_goal_dim:], self._subgoal, s_[:, self.fn_goal_dim:]) self._new_subgoal = np.where( self._c == 1, self.get_subgoal(s_).numpy(), s[:, self.fn_goal_dim:] + self._subgoal - s_[:, self.fn_goal_dim:]) self.data_low.add( s, a, ir, s_, done[:, np.newaxis], # 升维 self._subgoal, self._new_subgoal) self._c = np.where( self._c == 1, np.full((self.n_agents, 1), self.sub_goal_steps, np.int32), self._c - 1) def get_transitions(self, databuffer, data_name_list=['s', 'a', 'r', 's_', 'done']): ''' TODO: Annotation ''' data = databuffer.sample() # 经验池取数据 if not self.is_continuous and 'a' in data_name_list: a_idx = data_name_list.index('a') a = data[a_idx].astype(np.int32) pre_shape = a.shape a = a.reshape(-1) a = int2one_hot(a, self.a_dim) a = a.reshape(pre_shape + (-1, )) data[a_idx] = a return dict([[ n, d ] for n, d in zip(data_name_list, list(map(self.data_convert, data)))])
class PD_DDPG(make_off_policy_class(mode='share')): ''' Accelerated Primal-Dual Policy Optimization for Safe Reinforcement Learning, http://arxiv.org/abs/1802.06480 Refer to https://github.com/anita-hu/TF2-RL/blob/master/Primal-Dual_DDPG/TF2_PD_DDPG_Basic.py ''' def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, ployak=0.995, actor_lr=5.0e-4, reward_critic_lr=1.0e-3, cost_critic_lr=1.0e-3, lambda_lr=5.0e-4, discrete_tau=1.0, cost_constraint=1.0, hidden_units={ 'actor_continuous': [32, 32], 'actor_discrete': [32, 32], 'reward': [32, 32], 'cost': [32, 32] }, **kwargs): super().__init__( s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.ployak = ployak self.discrete_tau = discrete_tau self._lambda = tf.Variable(0.0, dtype=tf.float32) self.cost_constraint = cost_constraint # long-term cost <= d if self.is_continuous: def _actor_net(): return ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous']) # self.action_noise = NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim)) self.action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim)) else: def _actor_net(): return ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete']) self.gumbel_dist = tfp.distributions.Gumbel(0, 1) self.actor_net = _actor_net() self.actor_target_net = _actor_net() self.actor_tv = self.actor_net.trainable_variables def _critic_net(hiddens): return Critic(self.feat_dim, self.a_dim, hiddens) self.reward_critic_net = _critic_net(hidden_units['reward']) self.reward_critic_target_net = _critic_net(hidden_units['reward']) self.cost_critic_net = _critic_net(hidden_units['cost']) self.cost_critic_target_net = _critic_net(hidden_units['cost']) self.reward_critic_tv = self.reward_critic_net.trainable_variables + self.other_tv update_target_net_weights( self.actor_target_net.weights + self.reward_critic_target_net.weights + self.cost_critic_target_net.weights, self.actor_net.weights + self.reward_critic_net.weights + self.cost_critic_net.weights ) self.lambda_lr = lambda_lr self.actor_lr, self.reward_critic_lr, self.cost_critic_lr = map(self.init_lr, [actor_lr, reward_critic_lr, cost_critic_lr]) self.optimizer_actor, self.optimizer_reward_critic, self.optimizer_cost_critic = map(self.init_optimizer, [self.actor_lr, self.reward_critic_lr, self.cost_critic_lr]) self.model_recorder(dict( actor=self.actor_net, reward_critic=self.reward_critic_net, cost_critic=self.cost_critic_net, optimizer_actor=self.optimizer_actor, optimizer_reward_critic=self.optimizer_reward_critic, optimizer_cost_critic=self.optimizer_cost_critic )) def show_logo(self): self.logger.info(''' xxxxxxxx xxxxxxx xxxxxxx xxxxxxx xxxxxxxx xxxxxx xx xx x xxx x xxx x xxx xx xx xxx xx x xxx x xx x xx x xx x xxx xx x x xxx x xx x xx x xx x xxx xx xxxxxx x xxx x xxx x xxx xxxxxx x xxxxx x x xx x xx x xx x xx xxx x x xx x xx x xx x xx x x x xxx x xxx x xxx x xxx xx xxxxx xxxxxxx xxxxxxx xxxxxxx xxxxx xxxxxx xx ''') def choose_action(self, s, visual_s, evaluation=False): mu, pi, self.cell_state = self._get_action(s, visual_s, self.cell_state) a = mu.numpy() if evaluation else pi.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state): with tf.device(self.device): feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True) if self.is_continuous: mu = self.actor_net(feat) pi = tf.clip_by_value(mu + self.action_noise(), -1, 1)
else: logits = self.actor_net(feat) mu = tf.argmax(logits, axis=1) cate_dist = tfp.distributions.Categorical(logits) pi = cate_dist.sample() return mu, pi, cell_state def learn(self, **kwargs): self.train_step = kwargs.get('train_step') for i in range(self.train_times_per_step): self._learn(function_dict={ 'train_function': self.train, 'update_function': lambda: update_target_net_weights( self.actor_target_net.weights + self.reward_critic_target_net.weights + self.cost_critic_target_net.weights, self.actor_net.weights + self.reward_critic_net.weights + self.cost_critic_net.weights, self.ployak), 'summary_dict': dict([ ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)], ['LEARNING_RATE/reward_critic_lr', self.reward_critic_lr(self.train_step)], ['LEARNING_RATE/cost_critic_lr', self.cost_critic_lr(self.train_step)] ]), 'sample_data_list': ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'cost'], 'train_data_list': ['ss', 'vvss', 'a', 'r', 'done', 'cost'], }) @tf.function(experimental_relax_shapes=True) def train(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done, cost = memories batch_size = tf.shape(a)[0] with tf.device(self.device): with tf.GradientTape() as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) if self.is_continuous: target_mu = self.actor_target_net(feat_) action_target = tf.clip_by_value(target_mu + self.action_noise(), -1, 1) else: target_logits = self.actor_target_net(feat_) logp_all = tf.nn.log_softmax(target_logits) gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32) _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau) _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) action_target = _pi_diff + _pi q_reward = self.reward_critic_net(feat, a) q_target = self.reward_critic_target_net(feat_, action_target) dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done)) td_error_reward = q_reward - dc_r reward_loss = 0.5 * tf.reduce_mean(tf.square(td_error_reward) * isw) + crsty_loss q_grads = tape.gradient(reward_loss, self.reward_critic_tv) self.optimizer_reward_critic.apply_gradients( zip(q_grads, self.reward_critic_tv) ) with tf.GradientTape() as tape: q_cost = self.cost_critic_net(feat, a) q_target = self.cost_critic_target_net(feat_, action_target) dc_r = tf.stop_gradient(cost + self.gamma * q_target * (1 - done)) td_error_cost = q_cost - dc_r cost_loss = 0.5 * tf.reduce_mean(tf.square(td_error_cost) * isw) + crsty_loss q_grads = tape.gradient(cost_loss, self.cost_critic_net.trainable_variables) self.optimizer_cost_critic.apply_gradients( zip(q_grads, self.cost_critic_net.trainable_variables) ) q_loss = reward_loss + cost_loss with tf.GradientTape() as tape: if self.is_continuous: mu = self.actor_net(feat) else: logits = self.actor_net(feat) _pi = tf.nn.softmax(logits) _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1), self.a_dim, dtype=tf.float32) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) mu = _pi_diff + _pi reward_actor = self.reward_critic_net(feat, mu) cost_actor = self.cost_critic_net(feat, mu) actor_loss = -tf.reduce_mean(reward_actor - self._lambda * cost_actor) actor_grads = tape.gradient(actor_loss, self.actor_tv) self.optimizer_actor.apply_gradients( zip(actor_grads, self.actor_tv) ) # update dual variable lambda_update = tf.reduce_mean(cost_actor - self.cost_constraint) self._lambda.assign_add(self.lambda_lr * lambda_update) 
self._lambda.assign(tf.maximum(self._lambda, 0.0)) self.global_step.assign_add(1) return (td_error_reward + td_error_cost) / 2, dict([ ['LOSS/actor_loss', actor_loss], ['LOSS/reward_loss', reward_loss], ['LOSS/cost_loss', cost_loss], ['LOSS/q_loss', q_loss], ['Statistics/q_reward_min', tf.reduce_min(q_reward)], ['Statistics/q_reward_mean', tf.reduce_mean(q_reward)], ['Statistics/q_reward_max', tf.reduce_max(q_reward)], ['Statistics/q_cost_min', tf.reduce_min(q_cost)], ['Statistics/q_cost_mean', tf.reduce_mean(q_cost)], ['Statistics/q_cost_max', tf.reduce_max(q_cost)], ['Statistics/_lambda', self._lambda], ['Statistics/lambda_update', lambda_update] ]) @tf.function(experimental_relax_shapes=True) def train_persistent(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done, cost = memories batch_size = tf.shape(a)[0] with tf.device(self.device): with tf.GradientTape(persistent=True) as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) if self.is_continuous: target_mu = self.actor_target_net(feat_) action_target = tf.clip_by_value(target_mu + self.action_noise(), -1, 1) mu = self.actor_net(feat) else: target_logits = self.actor_target_net(feat_) logp_all = tf.nn.log_softmax(target_logits) gumbel_noise = tf.cast(self.gumbel_dist.sample([batch_size, self.a_dim]), dtype=tf.float32) _pi = tf.nn.softmax((logp_all + gumbel_noise) / self.discrete_tau) _pi_true_one_hot = tf.one_hot(tf.argmax(_pi, axis=-1), self.a_dim) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) action_target = _pi_diff + _pi logits = self.actor_net(feat) _pi = tf.nn.softmax(logits) _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1), self.a_dim, dtype=tf.float32) _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi) mu = _pi_diff + _pi q_reward = self.reward_critic_net(feat, a) q_target = self.reward_critic_target_net(feat_, action_target) dc_r = tf.stop_gradient(r + self.gamma * q_target * (1 - done)) td_error_reward = q_reward - dc_r reward_loss = 0.5 * tf.reduce_mean(tf.square(td_error_reward) * isw) + crsty_loss q_cost = self.cost_critic_net(tf.stop_gradient(feat), a) q_target = self.cost_critic_target_net(feat_, action_target) dc_r = tf.stop_gradient(cost + self.gamma * q_target * (1 - done)) td_error_cost = q_cost - dc_r cost_loss = 0.5 * tf.reduce_mean(tf.square(td_error_cost) * isw) + crsty_loss q_loss = reward_loss + cost_loss reward_actor = self.reward_critic_net(feat, mu) cost_actor = self.cost_critic_net(feat, mu) actor_loss = -tf.reduce_mean(reward_actor - self._lambda * cost_actor) q_grads = tape.gradient(reward_loss, self.reward_critic_tv) self.optimizer_reward_critic.apply_gradients( zip(q_grads, self.reward_critic_tv) ) q_grads = tape.gradient(cost_loss, self.cost_critic_net.trainable_variables) self.optimizer_cost_critic.apply_gradients( zip(q_grads, self.cost_critic_net.trainable_variables) ) actor_grads = tape.gradient(actor_loss, self.actor_tv) self.optimizer_actor.apply_gradients( zip(actor_grads, self.actor_tv) ) # update dual variable lambda_update = tf.reduce_mean(cost_actor - self.cost_constraint) self._lambda.assign_add(self.lambda_lr * lambda_update) self._lambda.assign(tf.maximum(self._lambda, 0.0)) self.global_step.assign_add(1) return (td_error_reward + td_error_cost) / 2, dict([ ['LOSS/actor_loss', actor_loss], ['LOSS/reward_loss', reward_loss], ['LOSS/cost_loss', cost_loss], ['LOSS/q_loss', q_loss], ['Statistics/q_reward_min', tf.reduce_min(q_reward)], ['Statistics/q_reward_mean', tf.reduce_mean(q_reward)], ['Statistics/q_reward_max',
tf.reduce_max(q_reward)], ['Statistics/q_cost_min', tf.reduce_min(q_cost)], ['Statistics/q_cost_mean', tf.reduce_mean(q_cost)], ['Statistics/q_cost_max', tf.reduce_max(q_cost)], ['Statistics/_lambda', self._lambda], ['Statistics/lambda_update', lambda_update] ]) def get_cost(self, s, visual_s, a, r, s_, visual_s_, done): return np.abs(s_)[:, :1] # CartPole def store_data(self, s, visual_s, a, r, s_, visual_s_, done): """ for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer. """ assert isinstance(a, np.ndarray), "store need action type is np.ndarray" assert isinstance(r, np.ndarray), "store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "store need done type is np.ndarray" self._running_average(s) cost = self.get_cost(s, visual_s, a, r, s_, visual_s_, done) self.data.add( s, visual_s, a, r[:, np.newaxis], # expand dims to [B, 1] s_, visual_s_, done[:, np.newaxis], # expand dims to [B, 1] cost ) def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done): assert isinstance(a, np.ndarray), "no_op_store need action type is np.ndarray" assert isinstance(r, np.ndarray), "no_op_store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "no_op_store need done type is np.ndarray" self._running_average(s) cost = self.get_cost(s, visual_s, a, r, s_, visual_s_, done) self.data.add( s, visual_s, a, r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis], # expand dims to [B, 1] cost )
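# ---------------------------------------------------------------------------
# Sketch: the primal-dual step on the Lagrange multiplier performed in `train`
# above. Names here are illustrative; the gist is projected gradient ascent,
# lambda <- max(0, lambda + lr * (E[Q_cost(s, pi(s))] - d)), so lambda grows
# while the expected long-term cost exceeds the constraint d and decays back
# toward 0 once the policy is feasible.
# ---------------------------------------------------------------------------
def _dual_update_sketch(lmbda, cost_q_values, cost_constraint, lambda_lr):
    import numpy as np
    lambda_update = np.mean(cost_q_values) - cost_constraint  # constraint violation
    return max(0.0, lmbda + lambda_lr * lambda_update)        # projected ascent step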
class DDDQN(make_off_policy_class(mode='share')): ''' Dueling Double DQN, https://arxiv.org/abs/1511.06581 ''' def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, lr=5.0e-4, eps_init=1, eps_mid=0.2, eps_final=0.01, init2mid_annealing_step=1000, assign_interval=2, hidden_units={ 'share': [128], 'v': [128], 'adv': [128] }, **kwargs): assert not is_continuous, 'dueling double dqn only support discrete action space' super().__init__( s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final, init2mid_annealing_step=init2mid_annealing_step, max_step=self.max_train_step) self.assign_interval = assign_interval def _net(): return NetWork(self.feat_dim, self.a_dim, hidden_units) self.dueling_net = _net() self.dueling_target_net = _net() self.critic_tv = self.dueling_net.trainable_variables + self.other_tv update_target_net_weights(self.dueling_target_net.weights, self.dueling_net.weights) self.lr = self.init_lr(lr) self.optimizer = self.init_optimizer(self.lr) self.model_recorder(dict( model=self.dueling_net, optimizer=self.optimizer )) def show_logo(self): self.logger.info(''' xxxxxxxx xxxxxxxx xxxxxxxx xxxxxx xxxx xxxx xxxxxxxx xxxxxxxx xxxxxxxx xxx xxxx xxx x xx xxx xx xxx xx xxx xxx xxxx xxxx x xx xxx xx xxx xx xxx xxx xxx xxxxx x xx xx xx xx xx xx xx xxx x xxxx x xx xx xx xx xx xx xxx xxx x xxxxx xx xxx xx xxx xx xxx xxx xxx x xxxx xx xxxx xx xxxx xx xxxx xxx xxx x xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxx xx xxxxxxx xxxxxxx xxxxxxx xxxxx xxxx xxx ''') def choose_action(self, s, visual_s, evaluation=False): if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation): a = np.random.randint(0, self.a_dim, self.n_agents) else: a, self.cell_state = self._get_action(s, visual_s, self.cell_state) a = a.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state): with tf.device(self.device): feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True) q = self.dueling_net(feat) return tf.argmax(q, axis=-1), cell_state def learn(self, **kwargs): self.train_step = kwargs.get('train_step') def _update(): if self.global_step % self.assign_interval == 0: update_target_net_weights(self.dueling_target_net.weights, self.dueling_net.weights) for i in range(self.train_times_per_step): self._learn(function_dict={ 'train_function': self.train, 'update_function': _update, 'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]]) }) @tf.function(experimental_relax_shapes=True) def train(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done = memories with tf.device(self.device): with tf.GradientTape() as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) q = self.dueling_net(feat) q_eval = tf.reduce_sum(tf.multiply(q, a), axis=1, keepdims=True) next_q = self.dueling_net(feat_) next_max_action = tf.argmax(next_q, axis=1, name='next_action_int') next_max_action_one_hot = tf.one_hot(tf.squeeze(next_max_action), self.a_dim, 1., 0., dtype=tf.float32) next_max_action_one_hot = tf.cast(next_max_action_one_hot, tf.float32) q_target = self.dueling_target_net(feat_) q_target_next_max = tf.reduce_sum( tf.multiply(q_target, next_max_action_one_hot), axis=1, keepdims=True) q_target = tf.stop_gradient(r + self.gamma * (1 - done) * q_target_next_max) td_error = q_eval - q_target q_loss 
= tf.reduce_mean(tf.square(td_error) * isw) + crsty_loss grads = tape.gradient(q_loss, self.critic_tv) self.optimizer.apply_gradients( zip(grads, self.critic_tv) ) self.global_step.assign_add(1) return td_error, dict([ ['LOSS/loss', q_loss], ['Statistics/q_max', tf.reduce_max(q_eval)], ['Statistics/q_min', tf.reduce_min(q_eval)], ['Statistics/q_mean', tf.reduce_mean(q_eval)] ])
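# ---------------------------------------------------------------------------
# Sketch: the decoupled target computed in DDDQN.train above. Action selection
# uses the online (dueling) net at s', action evaluation uses the target net;
# plain DQN would take the max over `q_target_next` directly. The arrays stand
# in for the two networks' outputs; the function name is illustrative.
# ---------------------------------------------------------------------------
def _double_dqn_target_sketch(r, done, gamma, q_online_next, q_target_next):
    import numpy as np
    a_star = np.argmax(q_online_next, axis=1)  # [B,] greedy action from the online net
    q_eval_next = np.take_along_axis(q_target_next, a_star[:, None], axis=1)  # [B, 1]
    return r + gamma * (1.0 - done) * q_eval_next  # TD target y, shapes r/done: [B, 1]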
class DQN(make_off_policy_class(mode='share')): ''' Deep Q-learning Network, DQN, [2013](https://arxiv.org/pdf/1312.5602.pdf), [2015](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) DQN + LSTM, https://arxiv.org/abs/1507.06527 ''' def __init__(self, s_dim: Union[int, np.ndarray], visual_sources: Union[int, np.ndarray], visual_resolution: Union[List, np.ndarray], a_dim: Union[int, np.ndarray], is_continuous: Union[bool, np.ndarray], lr: float = 5.0e-4, eps_init: float = 1, eps_mid: float = 0.2, eps_final: float = 0.01, init2mid_annealing_step: int = 1000, assign_interval: int = 1000, hidden_units: List[int] = [32, 32], **kwargs): assert not is_continuous, 'dqn only support discrete action space' super().__init__(s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.expl_expt_mng = ExplorationExploitationClass( eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final, init2mid_annealing_step=init2mid_annealing_step, max_step=self.max_train_step) self.assign_interval = assign_interval def _q_net(): return NetWork(self.feat_dim, self.a_dim, hidden_units) self.q_net = _q_net() self.q_target_net = _q_net() self.critic_tv = self.q_net.trainable_variables + self.other_tv update_target_net_weights(self.q_target_net.weights, self.q_net.weights) self.lr = self.init_lr(lr) self.optimizer = self.init_optimizer(self.lr) self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer)) def show_logo(self) -> NoReturn: self.logger.info(''' xxxxxxxx xxxxxx xxxx xxxx xxxxxxxx xxx xxxx xxx x xx xxx xxx xxxx xxxx x xx xxx xxx xxx xxxxx x xx xx xx xxx x xxxx x xx xx xxx xxx x xxxxx xx xxx xxx xxx x xxxx xx xxxx xxx xxx x xxx xxxxxxxx xxxxxxxx xxx xx xxxxxxx xxxxx xxxx xxx ''') def choose_action(self, s: np.ndarray, visual_s: np.ndarray, evaluation: bool = False) -> np.ndarray: if np.random.uniform() < self.expl_expt_mng.get_esp( self.train_step, evaluation=evaluation): a = np.random.randint(0, self.a_dim, self.n_agents) else: a, self.cell_state = self._get_action(s, visual_s, self.cell_state) a = a.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state): with tf.device(self.device): feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True) q_values = self.q_net(feat) return tf.argmax(q_values, axis=1), cell_state def learn(self, **kwargs) -> NoReturn: self.train_step = kwargs.get('train_step') def _update(): if self.global_step % self.assign_interval == 0: update_target_net_weights(self.q_target_net.weights, self.q_net.weights) for i in range(self.train_times_per_step): self._learn( function_dict={ 'train_function': self.train, 'update_function': _update, 'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]]) }) @tf.function(experimental_relax_shapes=True) def train(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done = memories with tf.device(self.device): with tf.GradientTape() as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) q = self.q_net(feat) q_next = self.q_target_net(feat_) q_eval = tf.reduce_sum(tf.multiply(q, a), axis=1, keepdims=True) q_target = tf.stop_gradient( r + self.gamma * (1 - done) * tf.reduce_max(q_next, axis=1, keepdims=True)) td_error = q_eval - q_target q_loss = tf.reduce_mean(tf.square(td_error) * isw) + crsty_loss grads = tape.gradient(q_loss, self.critic_tv) self.optimizer.apply_gradients(zip(grads, self.critic_tv)) self.global_step.assign_add(1) return 
td_error, dict( [['LOSS/loss', q_loss], ['Statistics/q_max', tf.reduce_max(q_eval)], ['Statistics/q_min', tf.reduce_min(q_eval)], ['Statistics/q_mean', tf.reduce_mean(q_eval)]])
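# ---------------------------------------------------------------------------
# Sketch: the one-step TD target built in DQN.train above,
# y = r + gamma * (1 - done) * max_a' Q_target(s', a'); td_error = Q(s, a) - y
# then drives both the squared loss and, via `isw`, prioritized replay.
# Function name is illustrative; inputs mirror the shapes in `train`.
# ---------------------------------------------------------------------------
def _dqn_target_sketch(r, done, gamma, q_target_next):
    import numpy as np
    # r, done: [B, 1]; q_target_next: [B, A] -> target y: [B, 1]
    return r + gamma * (1.0 - done) * np.max(q_target_next, axis=1, keepdims=True)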
class QRDQN(make_off_policy_class(mode='share')): ''' Quantile Regression DQN Distributional Reinforcement Learning with Quantile Regression, https://arxiv.org/abs/1710.10044 No double, no dueling, no noisy net. ''' def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, nums=20, huber_delta=1., lr=5.0e-4, eps_init=1, eps_mid=0.2, eps_final=0.01, init2mid_annealing_step=1000, assign_interval=1000, hidden_units=[128, 128], **kwargs): assert not is_continuous, 'qrdqn only support discrete action space' assert nums > 0 super().__init__(s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.nums = nums self.huber_delta = huber_delta self.quantiles = tf.reshape( tf.constant((2 * np.arange(self.nums) + 1) / (2.0 * self.nums), dtype=tf.float32), [-1, self.nums]) # [1, N] self.batch_quantiles = tf.tile(self.quantiles, [self.a_dim, 1]) # [1, N] => [A, N] self.expl_expt_mng = ExplorationExploitationClass( eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final, init2mid_annealing_step=init2mid_annealing_step, max_step=self.max_train_step) self.assign_interval = assign_interval def _net(): return NetWork(self.feat_dim, self.a_dim, self.nums, hidden_units) self.q_dist_net = _net() self.q_target_dist_net = _net() self.critic_tv = self.q_dist_net.trainable_variables + self.other_tv update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights) self.lr = self.init_lr(lr) self.optimizer = self.init_optimizer(self.lr) self.model_recorder( dict(model=self.q_dist_net, optimizer=self.optimizer)) def show_logo(self): self.logger.info(''' xxxxxx xxxxxxx xxxxxxxx xxxxxx xxxx xxxx xxx xxxx xxxxxxx xxxxxxxx xxx xxxx xxx x xxx xxxx xx xxx xx xxx xxx xxxx xxxx x xxx xxx xx xxx xx xxx xxx xxx xxxxx x xx xxx xxxxxx xx xx xx xxx x xxxx x xxx xxx xxxxxx xx xx xxx xxx x xxxxx xxx xxx xx xxxx xx xxx xxx xxx x xxxx xxx xxx xx xxx xx xxxx xxx xxx x xxx xxxxxxxx xxxxx xxxx xxxxxxxx xxxxxxxx xxx xx xxxxx xxxxx xxxx xxxxxxx xxxxx xxxx xxxx xxx xxx ''') def choose_action(self, s, visual_s, evaluation=False): if np.random.uniform() < self.expl_expt_mng.get_esp( self.train_step, evaluation=evaluation): a = np.random.randint(0, self.a_dim, self.n_agents) else: a, self.cell_state = self._get_action(s, visual_s, self.cell_state) a = a.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state): with tf.device(self.device): feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True) q = self.get_q(feat) # [B, A] return tf.argmax(q, axis=-1), cell_state # [B, 1] def learn(self, **kwargs): self.train_step = kwargs.get('train_step') def _update(): if self.global_step % self.assign_interval == 0: update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights) for i in range(self.train_times_per_step): self._learn( function_dict={ 'train_function': self.train, 'update_function': _update, 'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]]) }) @tf.function(experimental_relax_shapes=True) def train(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done = memories batch_size = tf.shape(a)[0] with tf.device(self.device): with tf.GradientTape() as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) indexs = tf.reshape(tf.range(batch_size), [-1, 1]) # [B, 1] q_dist = self.q_dist_net(feat) # [B, A, N] q_dist = tf.transpose( tf.reduce_sum(tf.transpose(q_dist, [2, 0, 1]) * a, axis=-1), [1, 0]) # 
[B, N] target_q_dist = self.q_target_dist_net(feat_) # [B, A, N] target_q = tf.reduce_sum(self.batch_quantiles * target_q_dist, axis=-1) # [B, A, N] => [B, A] a_ = tf.reshape( tf.cast(tf.argmax(target_q, axis=-1), dtype=tf.int32), [-1, 1]) # [B, 1] target_q_dist = tf.gather_nd(target_q_dist, tf.concat([indexs, a_], axis=-1)) # [B, N] target = tf.tile(r, tf.constant([1, self.nums])) \ + self.gamma * tf.multiply(self.quantiles, # [1, N] (1.0 - tf.tile(done, tf.constant([1, self.nums])))) # [B, N], [1, N]* [B, N] = [B, N] q_eval = tf.reduce_sum(q_dist * self.quantiles, axis=-1) # [B, 1] q_target = tf.reduce_sum(target * self.quantiles, axis=-1) # [B, 1] td_error = q_eval - q_target # [B, 1] quantile_error = tf.expand_dims( q_dist, axis=-1) - tf.expand_dims( target, axis=1) # [B, N, 1] - [B, 1, N] => [B, N, N] huber = huber_loss(quantile_error, delta=self.huber_delta) # [B, N, N] huber_abs = tf.abs( self.quantiles - tf.where(quantile_error < 0, tf.ones_like(quantile_error), tf.zeros_like(quantile_error)) ) # [1, N] - [B, N, N] => [B, N, N] loss = tf.reduce_mean(huber_abs * huber, axis=-1) # [B, N, N] => [B, N] loss = tf.reduce_sum(loss, axis=-1) # [B, N] => [B, ] loss = tf.reduce_mean(loss * isw) + crsty_loss # [B, ] => 1 grads = tape.gradient(loss, self.critic_tv) self.optimizer.apply_gradients(zip(grads, self.critic_tv)) self.global_step.assign_add(1) return td_error, dict( [['LOSS/loss', loss], ['Statistics/q_max', tf.reduce_max(q_eval)], ['Statistics/q_min', tf.reduce_min(q_eval)], ['Statistics/q_mean', tf.reduce_mean(q_eval)]]) @tf.function(experimental_relax_shapes=True) def get_q(self, feat): with tf.device(self.device): return tf.reduce_sum(self.batch_quantiles * self.q_dist_net(feat), axis=-1) # [B, A, N] => [B, A]
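# ---------------------------------------------------------------------------
# Sketch: the quantile-regression Huber loss that QRDQN.train above implements,
# rho^kappa_tau(u) = |tau - 1{u < 0}| * L_kappa(u), averaged over target
# quantiles and summed over online quantiles. `taus` are the fixed midpoints
# (2i + 1) / 2N; following the QR-DQN paper, tau is paired with the online
# quantile axis. Names are illustrative, not part of this repo.
# ---------------------------------------------------------------------------
def _quantile_huber_sketch(theta, target_theta, taus, kappa=1.0):
    import numpy as np
    u = theta[:, :, None] - target_theta[:, None, :]  # [B, N, N'] pairwise errors
    huber = np.where(np.abs(u) <= kappa, 0.5 * u ** 2, kappa * (np.abs(u) - 0.5 * kappa))
    weight = np.abs(taus[None, :, None] - (u < 0).astype(np.float64))  # |tau - 1{u<0}|
    return (weight * huber).mean(axis=2).sum(axis=1)  # [B,] per-sample loss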
class IOC(make_off_policy_class(mode='share')): ''' Learning Options with Interest Functions, https://www.aaai.org/ojs/index.php/AAAI/article/view/5114/4987 Options of Interest: Temporal Abstraction with Interest Functions, http://arxiv.org/abs/2001.00271 ''' def __init__( self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, q_lr=5.0e-3, intra_option_lr=5.0e-4, termination_lr=5.0e-4, interest_lr=5.0e-4, boltzmann_temperature=1.0, options_num=4, ent_coff=0.01, double_q=False, use_baseline=True, terminal_mask=True, termination_regularizer=0.01, assign_interval=1000, hidden_units={ 'q': [32, 32], 'intra_option': [32, 32], 'termination': [32, 32], 'interest': [32, 32] }, **kwargs): super().__init__(s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.assign_interval = assign_interval self.options_num = options_num self.termination_regularizer = termination_regularizer self.ent_coff = ent_coff self.use_baseline = use_baseline self.terminal_mask = terminal_mask self.double_q = double_q self.boltzmann_temperature = boltzmann_temperature def _q_net(): return Critic(self.feat_dim, self.options_num, hidden_units['q']) self.q_net = _q_net() self.q_target_net = _q_net() self.intra_option_net = OptionNet(self.feat_dim, self.a_dim, self.options_num, hidden_units['intra_option']) self.termination_net = Critic(self.feat_dim, self.options_num, hidden_units['termination'], 'sigmoid') self.interest_net = Critic(self.feat_dim, self.options_num, hidden_units['interest'], 'sigmoid') self.critic_tv = self.q_net.trainable_variables + self.other_tv self.actor_tv = self.intra_option_net.trainable_variables if self.is_continuous: self.log_std = tf.Variable(initial_value=-0.5 * np.ones( (self.options_num, self.a_dim), dtype=np.float32), trainable=True) # [P, A] self.actor_tv += [self.log_std] update_target_net_weights(self.q_target_net.weights, self.q_net.weights) self.q_lr, self.intra_option_lr, self.termination_lr, self.interest_lr = map( self.init_lr, [q_lr, intra_option_lr, termination_lr, interest_lr]) self.q_optimizer = self.init_optimizer(self.q_lr, clipvalue=5.) self.intra_option_optimizer = self.init_optimizer(self.intra_option_lr, clipvalue=5.) self.termination_optimizer = self.init_optimizer(self.termination_lr, clipvalue=5.) self.interest_optimizer = self.init_optimizer(self.interest_lr, clipvalue=5.)
self.model_recorder( dict(q_net=self.q_net, intra_option_net=self.intra_option_net, termination_net=self.termination_net, interest_net=self.interest_net, q_optimizer=self.q_optimizer, intra_option_optimizer=self.intra_option_optimizer, termination_optimizer=self.termination_optimizer, interest_optimizer=self.interest_optimizer)) def show_logo(self): self.logger.info(''' xxxx xxxxxx xxxxxxx xx xxx xxxx xxxx xxx xx xxx xxx xxxx x xx xx xxx xxx x xx xx xxx xxx xx xx xxx xxx xx xx xxx xxx xx xxx xxx xxx x xxxx xxxxxxxx xxxxxxxx xxxxx xxxxx ''') def _generate_random_options(self): return tf.constant(np.random.randint(0, self.options_num, self.n_agents), dtype=tf.int32) def choose_action(self, s, visual_s, evaluation=False): if not hasattr(self, 'options'): self.options = self._generate_random_options() self.last_options = self.options a, self.options, self.cell_state = self._get_action( s, visual_s, self.cell_state, self.options) a = a.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state, options): with tf.device(self.device): feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True) q = self.q_net(feat) # [B, P] pi = self.intra_option_net(feat) # [B, P, A] options_onehot = tf.one_hot(options, self.options_num, dtype=tf.float32) # [B, P] options_onehot_expanded = tf.expand_dims(options_onehot, axis=-1) # [B, P, 1] pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1) # [B, A] if self.is_continuous: log_std = tf.gather(self.log_std, options) mu = tf.math.tanh(pi) a, _ = gaussian_clip_rsample(mu, log_std) else: pi = pi / self.boltzmann_temperature dist = tfp.distributions.Categorical(logits=pi) # [B, ] a = dist.sample() interests = self.interest_net(feat) # [B, P] op_logits = interests * q # [B, P] or tf.nn.softmax(q) new_options = tfp.distributions.Categorical( logits=op_logits).sample() return a, new_options, cell_state def learn(self, **kwargs): self.train_step = kwargs.get('train_step') def _update(): if self.global_step % self.assign_interval == 0: update_target_net_weights(self.q_target_net.weights, self.q_net.weights) for i in range(self.train_times_per_step): self._learn( function_dict={ 'train_function': self.train, 'update_function': _update, 'sample_data_list': [ 's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'last_options', 'options' ], 'train_data_list': [ 'ss', 'vvss', 'a', 'r', 'done', 'last_options', 'options' ], 'summary_dict': dict([['LEARNING_RATE/q_lr', self.q_lr(self.train_step)], [ 'LEARNING_RATE/intra_option_lr', self.intra_option_lr(self.train_step) ], [ 'LEARNING_RATE/termination_lr', self.termination_lr(self.train_step) ], ['Statistics/option', self.options[0]]]) }) @tf.function(experimental_relax_shapes=True) def train(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done, last_options, options = memories last_options = tf.cast(last_options, tf.int32) options = tf.cast(options, tf.int32) with tf.device(self.device): with tf.GradientTape(persistent=True) as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) q = self.q_net(feat) # [B, P] pi = self.intra_option_net(feat) # [B, P, A] beta = self.termination_net(feat) # [B, P] q_next = self.q_target_net(feat_) # [B, P], [B, P, A], [B, P] beta_next = self.termination_net(feat_) # [B, P] interests = self.interest_net(feat) # [B, P] options_onehot = tf.one_hot(options, self.options_num, dtype=tf.float32) # [B,] => [B, P] q_s = qu_eval = tf.reduce_sum(q * options_onehot, axis=-1, keepdims=True) # [B, 1] beta_s_ = 
tf.reduce_sum(beta_next * options_onehot, axis=-1, keepdims=True) # [B, 1] q_s_ = tf.reduce_sum(q_next * options_onehot, axis=-1, keepdims=True) # [B, 1] if self.double_q: q_ = self.q_net(feat) # [B, P], [B, P, A], [B, P] max_a_idx = tf.one_hot( tf.argmax(q_, axis=-1), self.options_num, dtype=tf.float32) # [B, P] => [B, ] => [B, P] q_s_max = tf.reduce_sum(q_next * max_a_idx, axis=-1, keepdims=True) # [B, 1] else: q_s_max = tf.reduce_max(q_next, axis=-1, keepdims=True) # [B, 1] u_target = (1 - beta_s_) * q_s_ + beta_s_ * q_s_max # [B, 1] qu_target = tf.stop_gradient(r + self.gamma * (1 - done) * u_target) td_error = qu_target - qu_eval # gradient : q q_loss = tf.reduce_mean( tf.square(td_error) * isw) + crsty_loss # [B, 1] => 1 if self.use_baseline: adv = tf.stop_gradient(qu_target - qu_eval) else: adv = tf.stop_gradient(qu_target) options_onehot_expanded = tf.expand_dims( options_onehot, axis=-1) # [B, P] => [B, P, 1] pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1) # [B, P, A] => [B, A] if self.is_continuous: log_std = tf.gather(self.log_std, options) mu = tf.math.tanh(pi) log_p = gaussian_likelihood_sum(a, mu, log_std) entropy = gaussian_entropy(log_std) else: pi = pi / self.boltzmann_temperature log_pi = tf.nn.log_softmax(pi, axis=-1) # [B, A] entropy = -tf.reduce_sum(tf.exp(log_pi) * log_pi, axis=1, keepdims=True) # [B, 1] log_p = tf.reduce_sum(a * log_pi, axis=-1, keepdims=True) # [B, 1] pi_loss = tf.reduce_mean( -(log_p * adv + self.ent_coff * entropy) ) # [B, 1] * [B, 1] => [B, 1] => 1 last_options_onehot = tf.one_hot( last_options, self.options_num, dtype=tf.float32) # [B,] => [B, P] beta_s = tf.reduce_sum(beta * last_options_onehot, axis=-1, keepdims=True) # [B, 1] pi_op = tf.nn.softmax( interests * tf.stop_gradient(q)) # [B, P] or tf.nn.softmax(q) interest_loss = -tf.reduce_mean(beta_s * tf.reduce_sum( pi_op * options_onehot, axis=-1, keepdims=True) * q_s) # [B, 1] => 1 v_s = tf.reduce_sum(q * pi_op, axis=-1, keepdims=True) # [B, P] * [B, P] => [B, 1] beta_loss = beta_s * tf.stop_gradient(q_s - v_s) # [B, 1] if self.terminal_mask: beta_loss *= (1 - done) beta_loss = tf.reduce_mean(beta_loss) # [B, 1] => 1 q_grads = tape.gradient(q_loss, self.critic_tv) intra_option_grads = tape.gradient(pi_loss, self.actor_tv) termination_grads = tape.gradient( beta_loss, self.termination_net.trainable_variables) interest_grads = tape.gradient( interest_loss, self.interest_net.trainable_variables) self.q_optimizer.apply_gradients(zip(q_grads, self.critic_tv)) self.intra_option_optimizer.apply_gradients( zip(intra_option_grads, self.actor_tv)) self.termination_optimizer.apply_gradients( zip(termination_grads, self.termination_net.trainable_variables)) self.interest_optimizer.apply_gradients( zip(interest_grads, self.interest_net.trainable_variables)) self.global_step.assign_add(1) return td_error, dict( [['LOSS/q_loss', tf.reduce_mean(q_loss)], ['LOSS/pi_loss', tf.reduce_mean(pi_loss)], ['LOSS/beta_loss', tf.reduce_mean(beta_loss)], ['LOSS/interest_loss', tf.reduce_mean(interest_loss)], ['Statistics/q_option_max', tf.reduce_max(q_s)], ['Statistics/q_option_min', tf.reduce_min(q_s)], ['Statistics/q_option_mean', tf.reduce_mean(q_s)]]) def store_data(self, s, visual_s, a, r, s_, visual_s_, done): """ for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer. 
""" assert isinstance(a, np.ndarray), "store need action type is np.ndarray" assert isinstance(r, np.ndarray), "store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "store need done type is np.ndarray" self._running_average(s) self.data.add( s, visual_s, a, r[:, np.newaxis], # 升维 s_, visual_s_, done[:, np.newaxis], # 升维 self.last_options, self.options) def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done): pass
class OC(make_off_policy_class(mode='share')): ''' The Option-Critic Architecture. http://arxiv.org/abs/1609.05140 ''' def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, q_lr=5.0e-3, intra_option_lr=5.0e-4, termination_lr=5.0e-4, use_eps_greedy=False, eps_init=1, eps_mid=0.2, eps_final=0.01, init2mid_annealing_step=1000, boltzmann_temperature=1.0, options_num=4, ent_coff=0.01, double_q=False, use_baseline=True, terminal_mask=True, termination_regularizer=0.01, assign_interval=1000, hidden_units={ 'q': [32, 32], 'intra_option': [32, 32], 'termination': [32, 32] }, **kwargs): super().__init__(s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.expl_expt_mng = ExplorationExploitationClass( eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final, init2mid_annealing_step=init2mid_annealing_step, max_step=self.max_train_step) self.assign_interval = assign_interval self.options_num = options_num self.termination_regularizer = termination_regularizer self.ent_coff = ent_coff self.use_baseline = use_baseline self.terminal_mask = terminal_mask self.double_q = double_q self.boltzmann_temperature = boltzmann_temperature self.use_eps_greedy = use_eps_greedy def _q_net(): return Critic(self.feat_dim, self.options_num, hidden_units['q']) self.q_net = _q_net() self.q_target_net = _q_net() self.intra_option_net = OptionNet(self.feat_dim, self.a_dim, self.options_num, hidden_units['intra_option']) self.termination_net = Critic(self.feat_dim, self.options_num, hidden_units['termination'], 'sigmoid') self.critic_tv = self.q_net.trainable_variables + self.other_tv self.actor_tv = self.intra_option_net.trainable_variables if self.is_continuous: self.log_std = tf.Variable(initial_value=-0.5 * np.ones( (self.options_num, self.a_dim), dtype=np.float32), trainable=True) # [P, A] self.actor_tv += [self.log_std] update_target_net_weights(self.q_target_net.weights, self.q_net.weights) self.q_lr, self.intra_option_lr, self.termination_lr = map( self.init_lr, [q_lr, intra_option_lr, termination_lr]) self.q_optimizer = self.init_optimizer(self.q_lr, clipvalue=5.) self.intra_option_optimizer = self.init_optimizer(self.intra_option_lr, clipvalue=5.) self.termination_optimizer = self.init_optimizer(self.termination_lr, clipvalue=5.) 
self.model_recorder( dict(q_net=self.q_net, intra_option_net=self.intra_option_net, termination_net=self.termination_net, q_optimizer=self.q_optimizer, intra_option_optimizer=self.intra_option_optimizer, termination_optimizer=self.termination_optimizer)) def show_logo(self): self.logger.info(''' xxxxxx xxxxxxx xxx xxxx xxxx xxx xxx xxx xxxx x xx xxx xxx x xx xxx xxx xx xxx xxx xx xxx xxx xxx xxx xxx x xxxxxxxx xxxxxxxx xxxxx xxxxx ''') def _generate_random_options(self): return tf.constant(np.random.randint(0, self.options_num, self.n_agents), dtype=tf.int32) def choose_action(self, s, visual_s, evaluation=False): if not hasattr(self, 'options'): self.options = self._generate_random_options() self.last_options = self.options a, self.options, self.cell_state = self._get_action( s, visual_s, self.cell_state, self.options) if self.use_eps_greedy: if np.random.uniform() < self.expl_expt_mng.get_esp( self.train_step, evaluation=evaluation): # epsilon greedy self.options = self._generate_random_options() a = a.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state, options): with tf.device(self.device): feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True) q = self.q_net(feat) # [B, P] pi = self.intra_option_net(feat) # [B, P, A] beta = self.termination_net(feat) # [B, P] options_onehot = tf.one_hot(options, self.options_num, dtype=tf.float32) # [B, P] options_onehot_expanded = tf.expand_dims(options_onehot, axis=-1) # [B, P, 1] pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1) # [B, A] if self.is_continuous: log_std = tf.gather(self.log_std, options) mu = tf.math.tanh(pi) a, _ = gaussian_clip_rsample(mu, log_std) else: pi = pi / self.boltzmann_temperature dist = tfp.distributions.Categorical(logits=pi) # [B, ] a = dist.sample() max_options = tf.cast(tf.argmax(q, axis=-1), dtype=tf.int32) # [B, P] => [B, ] if self.use_eps_greedy: new_options = max_options else: beta_probs = tf.reduce_sum(beta * options_onehot, axis=1) # [B, P] => [B,] beta_dist = tfp.distributions.Bernoulli(probs=beta_probs) new_options = tf.where(beta_dist.sample() < 1, options, max_options) return a, new_options, cell_state def learn(self, **kwargs): self.train_step = kwargs.get('train_step') def _update(): if self.global_step % self.assign_interval == 0: update_target_net_weights(self.q_target_net.weights, self.q_net.weights) for i in range(self.train_times_per_step): self._learn( function_dict={ 'train_function': self.train, 'update_function': _update, 'sample_data_list': [ 's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'last_options', 'options' ], 'train_data_list': [ 'ss', 'vvss', 'a', 'r', 'done', 'last_options', 'options' ], 'summary_dict': dict([['LEARNING_RATE/q_lr', self.q_lr(self.train_step)], [ 'LEARNING_RATE/intra_option_lr', self.intra_option_lr(self.train_step) ], [ 'LEARNING_RATE/termination_lr', self.termination_lr(self.train_step) ], ['Statistics/option', self.options[0]]]) }) @tf.function(experimental_relax_shapes=True) def train(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done, last_options, options = memories last_options = tf.cast(last_options, tf.int32) options = tf.cast(options, tf.int32) with tf.device(self.device): with tf.GradientTape(persistent=True) as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) q = self.q_net(feat) # [B, P] pi = self.intra_option_net(feat) # [B, P, A] beta = self.termination_net(feat) # [B, P] q_next = self.q_target_net(feat_) # [B, P], [B, P, A], [B, P] 
beta_next = self.termination_net(feat_) # [B, P] options_onehot = tf.one_hot(options, self.options_num, dtype=tf.float32) # [B,] => [B, P] q_s = qu_eval = tf.reduce_sum(q * options_onehot, axis=-1, keepdims=True) # [B, 1] beta_s_ = tf.reduce_sum(beta_next * options_onehot, axis=-1, keepdims=True) # [B, 1] q_s_ = tf.reduce_sum(q_next * options_onehot, axis=-1, keepdims=True) # [B, 1] # https://github.com/jeanharb/option_critic/blob/5d6c81a650a8f452bc8ad3250f1f211d317fde8c/neural_net.py#L94 if self.double_q: q_ = self.q_net(feat) # [B, P], [B, P, A], [B, P] max_a_idx = tf.one_hot( tf.argmax(q_, axis=-1), self.options_num, dtype=tf.float32) # [B, P] => [B, ] => [B, P] q_s_max = tf.reduce_sum(q_next * max_a_idx, axis=-1, keepdims=True) # [B, 1] else: q_s_max = tf.reduce_max(q_next, axis=-1, keepdims=True) # [B, 1] u_target = (1 - beta_s_) * q_s_ + beta_s_ * q_s_max # [B, 1] qu_target = tf.stop_gradient(r + self.gamma * (1 - done) * u_target) td_error = qu_target - qu_eval # gradient : q q_loss = tf.reduce_mean( tf.square(td_error) * isw) + crsty_loss # [B, 1] => 1 # https://github.com/jeanharb/option_critic/blob/5d6c81a650a8f452bc8ad3250f1f211d317fde8c/neural_net.py#L130 if self.use_baseline: adv = tf.stop_gradient(qu_target - qu_eval) else: adv = tf.stop_gradient(qu_target) options_onehot_expanded = tf.expand_dims( options_onehot, axis=-1) # [B, P] => [B, P, 1] pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1) # [B, P, A] => [B, A] if self.is_continuous: log_std = tf.gather(self.log_std, options) mu = tf.math.tanh(pi) log_p = gaussian_likelihood_sum(a, mu, log_std) entropy = gaussian_entropy(log_std) else: pi = pi / self.boltzmann_temperature log_pi = tf.nn.log_softmax(pi, axis=-1) # [B, A] entropy = -tf.reduce_sum(tf.exp(log_pi) * log_pi, axis=1, keepdims=True) # [B, 1] log_p = tf.reduce_sum(a * log_pi, axis=-1, keepdims=True) # [B, 1] pi_loss = tf.reduce_mean( -(log_p * adv + self.ent_coff * entropy) ) # [B, 1] * [B, 1] => [B, 1] => 1 last_options_onehot = tf.one_hot( last_options, self.options_num, dtype=tf.float32) # [B,] => [B, P] beta_s = tf.reduce_sum(beta * last_options_onehot, axis=-1, keepdims=True) # [B, 1] if self.use_eps_greedy: v_s = tf.reduce_max( q, axis=-1, keepdims=True) - self.termination_regularizer # [B, 1] else: v_s = (1 - beta_s) * q_s + beta_s * tf.reduce_max( q, axis=-1, keepdims=True) # [B, 1] # v_s = tf.reduce_mean(q, axis=-1, keepdims=True) # [B, 1] beta_loss = beta_s * tf.stop_gradient(q_s - v_s) # [B, 1] # https://github.com/lweitkamp/option-critic-pytorch/blob/0c57da7686f8903ed2d8dded3fae832ee9defd1a/option_critic.py#L238 if self.terminal_mask: beta_loss *= (1 - done) beta_loss = tf.reduce_mean(beta_loss) # [B, 1] => 1 q_grads = tape.gradient(q_loss, self.critic_tv) intra_option_grads = tape.gradient(pi_loss, self.actor_tv) termination_grads = tape.gradient( beta_loss, self.termination_net.trainable_variables) self.q_optimizer.apply_gradients(zip(q_grads, self.critic_tv)) self.intra_option_optimizer.apply_gradients( zip(intra_option_grads, self.actor_tv)) self.termination_optimizer.apply_gradients( zip(termination_grads, self.termination_net.trainable_variables)) self.global_step.assign_add(1) return td_error, dict( [['LOSS/q_loss', tf.reduce_mean(q_loss)], ['LOSS/pi_loss', tf.reduce_mean(pi_loss)], ['LOSS/beta_loss', tf.reduce_mean(beta_loss)], ['Statistics/q_option_max', tf.reduce_max(q_s)], ['Statistics/q_option_min', tf.reduce_min(q_s)], ['Statistics/q_option_mean', tf.reduce_mean(q_s)]]) def store_data(self, s, visual_s, a, r, s_, visual_s_, done): 
""" for off-policy training, use this function to store <s, a, r, s_, done> into ReplayBuffer. """ assert isinstance(a, np.ndarray), "store need action type is np.ndarray" assert isinstance(r, np.ndarray), "store need reward type is np.ndarray" assert isinstance(done, np.ndarray), "store need done type is np.ndarray" self._running_average(s) self.data.add( s, visual_s, a, r[:, np.newaxis], # 升维 s_, visual_s_, done[:, np.newaxis], # 升维 self.last_options, self.options) def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done): pass
class IQN(make_off_policy_class(mode='share')): ''' Implicit Quantile Networks, https://arxiv.org/abs/1806.06923 Double DQN ''' def __init__(self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, online_quantiles=8, target_quantiles=8, select_quantiles=32, quantiles_idx=64, huber_delta=1., lr=5.0e-4, eps_init=1, eps_mid=0.2, eps_final=0.01, init2mid_annealing_step=1000, assign_interval=2, hidden_units={ 'q_net': [128, 64], 'quantile': [128, 64], 'tile': [64] }, **kwargs): assert not is_continuous, 'iqn only support discrete action space' super().__init__( s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) self.pi = tf.constant(np.pi) self.online_quantiles = online_quantiles self.target_quantiles = target_quantiles self.select_quantiles = select_quantiles self.quantiles_idx = quantiles_idx self.huber_delta = huber_delta self.assign_interval = assign_interval self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init, eps_mid=eps_mid, eps_final=eps_final, init2mid_annealing_step=init2mid_annealing_step, max_step=self.max_train_step) def _net(): return NetWork(self.feat_dim, self.a_dim, self.quantiles_idx, hidden_units) self.q_net = _net() self.q_target_net = _net() self.critic_tv = self.q_net.trainable_variables + self.other_tv update_target_net_weights(self.q_target_net.weights, self.q_net.weights) self.lr = self.init_lr(lr) self.optimizer = self.init_optimizer(self.lr) self.model_recorder(dict( model=self.q_net, optimizer=self.optimizer )) def show_logo(self): self.logger.info(''' xxxxxxxx xxxxxxx xxx xxx xxxxxxxx xxxxxxxxx xxxx xxx xxx xxxx xxxx xxxxx xxx xxx xxx xxx xxxxx xxx xxx xxxx xxx xxxxxx xxx xxx xxxx xxx xxxxxxxxxx xxx xxxx xxx xxx xxxxxx xxx xxxx xxxx xxx xxxxxx xxxxxxxx xxxxxxxxx xxx xxxxx xxxxxxxx xxxxxxx xxx xxxx xxxx xxxx xxxx ''') def choose_action(self, s, visual_s, evaluation=False): if np.random.uniform() < self.expl_expt_mng.get_esp(self.train_step, evaluation=evaluation): a = np.random.randint(0, self.a_dim, self.n_agents) else: a, self.cell_state = self._get_action(s, visual_s, self.cell_state) a = a.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state): batch_size = tf.shape(s)[0] with tf.device(self.device): feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True) _, select_quantiles_tiled = self._generate_quantiles( # [N*B, 64] batch_size=batch_size, quantiles_num=self.select_quantiles, quantiles_idx=self.quantiles_idx ) _, q_values = self.q_net(feat, select_quantiles_tiled, quantiles_num=self.select_quantiles) # [B, A] return tf.argmax(q_values, axis=-1), cell_state # [B,] @tf.function def _generate_quantiles(self, batch_size, quantiles_num, quantiles_idx): with tf.device(self.device): _quantiles = tf.random.uniform([batch_size * quantiles_num, 1], minval=0, maxval=1) # [N*B, 1] _quantiles_tiled = tf.tile(_quantiles, [1, quantiles_idx]) # [N*B, 1] => [N*B, 64] _quantiles_tiled = tf.cast(tf.range(quantiles_idx), tf.float32) * self.pi * _quantiles_tiled # pi * i * tau [N*B, 64] * [64, ] => [N*B, 64] _quantiles_tiled = tf.cos(_quantiles_tiled) # [N*B, 64] _quantiles = tf.reshape(_quantiles, [batch_size, quantiles_num, 1]) # [N*B, 1] => [B, N, 1] return _quantiles, _quantiles_tiled def learn(self, **kwargs): self.train_step = kwargs.get('train_step') def _update(): if self.global_step % self.assign_interval == 0: update_target_net_weights(self.q_target_net.weights, self.q_net.weights) for i in 
range(self.train_times_per_step): self._learn(function_dict={ 'train_function': self.train, 'update_function': _update, 'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.train_step)]]) }) @tf.function(experimental_relax_shapes=True) def train(self, memories, isw, crsty_loss, cell_state): ss, vvss, a, r, done = memories batch_size = tf.shape(a)[0] with tf.device(self.device): with tf.GradientTape() as tape: feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True) quantiles, quantiles_tiled = self._generate_quantiles( # [B, N, 1], [N*B, 64] batch_size=batch_size, quantiles_num=self.online_quantiles, quantiles_idx=self.quantiles_idx ) quantiles_value, q = self.q_net(feat, quantiles_tiled, quantiles_num=self.online_quantiles) # [N, B, A], [B, A] _a = tf.reshape(tf.tile(a, [self.online_quantiles, 1]), [self.online_quantiles, -1, self.a_dim]) # [B, A] => [N*B, A] => [N, B, A] quantiles_value = tf.reduce_sum(quantiles_value * _a, axis=-1, keepdims=True) # [N, B, A] => [N, B, 1] q_eval = tf.reduce_sum(q * a, axis=-1, keepdims=True) # [B, A] => [B, 1] _, select_quantiles_tiled = self._generate_quantiles( # [N*B, 64] batch_size=batch_size, quantiles_num=self.select_quantiles, quantiles_idx=self.quantiles_idx ) _, q_values = self.q_net(feat_, select_quantiles_tiled, quantiles_num=self.select_quantiles) # [B, A] next_max_action = tf.argmax(q_values, axis=-1) # [B,] next_max_action = tf.one_hot(tf.squeeze(next_max_action), self.a_dim, 1., 0., dtype=tf.float32) # [B, A] _next_max_action = tf.reshape(tf.tile(next_max_action, [self.target_quantiles, 1]), [self.target_quantiles, -1, self.a_dim]) # [B, A] => [N'*B, A] => [N', B, A] _, target_quantiles_tiled = self._generate_quantiles( # [N'*B, 64] batch_size=batch_size, quantiles_num=self.target_quantiles, quantiles_idx=self.quantiles_idx ) target_quantiles_value, target_q = self.q_target_net(feat_, target_quantiles_tiled, quantiles_num=self.target_quantiles) # [N', B, A], [B, A] target_quantiles_value = tf.reduce_sum(target_quantiles_value * _next_max_action, axis=-1, keepdims=True) # [N', B, A] => [N', B, 1] target_q = tf.reduce_sum(target_q * next_max_action, axis=-1, keepdims=True) # double DQN: evaluate the online-argmax action, [B, A] => [B, 1] q_target = tf.stop_gradient(r + self.gamma * (1 - done) * target_q) # [B, 1] td_error = q_eval - q_target # [B, 1] _r = tf.reshape(tf.tile(r, [self.target_quantiles, 1]), [self.target_quantiles, -1, 1]) # [B, 1] => [N'*B, 1] => [N', B, 1] _done = tf.reshape(tf.tile(done, [self.target_quantiles, 1]), [self.target_quantiles, -1, 1]) # [B, 1] => [N'*B, 1] => [N', B, 1] quantiles_value_target = tf.stop_gradient(_r + self.gamma * (1 - _done) * target_quantiles_value) # [N', B, 1] quantiles_value_target = tf.transpose(quantiles_value_target, [1, 2, 0]) # [B, 1, N'] quantiles_value_online = tf.transpose(quantiles_value, [1, 0, 2]) # [B, N, 1] quantile_error = quantiles_value_online - quantiles_value_target # [B, N, 1] - [B, 1, N'] => [B, N, N'] huber = huber_loss(quantile_error, delta=self.huber_delta) # [B, N, N'] huber_abs = tf.abs(quantiles - tf.where(quantile_error < 0, tf.ones_like(quantile_error), tf.zeros_like(quantile_error))) # [B, N, 1] - [B, N, N'] => [B, N, N'] loss = tf.reduce_mean(huber_abs * huber, axis=-1) # [B, N, N'] => [B, N] loss = tf.reduce_sum(loss, axis=-1) # [B, N] => [B, ] loss = tf.reduce_mean(loss * isw) + crsty_loss # [B, ] => 1 grads = tape.gradient(loss, self.critic_tv) self.optimizer.apply_gradients( zip(grads, self.critic_tv) ) self.global_step.assign_add(1) return td_error, dict([ ['LOSS/loss', loss],
['Statistics/q_max', tf.reduce_max(q_eval)], ['Statistics/q_min', tf.reduce_min(q_eval)], ['Statistics/q_mean', tf.reduce_mean(q_eval)] ])
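# ---------------------------------------------------------------------------
# Sketch: the cosine quantile embedding built by `_generate_quantiles` above.
# For a sampled tau ~ U(0, 1), the embedding fed to the network is
# phi_j(tau) = cos(pi * j * tau) for j = 0..quantiles_idx-1; the NetWork then
# mixes it with the state feature (per the IQN paper). Names illustrative.
# ---------------------------------------------------------------------------
def _iqn_embedding_sketch(batch_size, quantiles_num, quantiles_idx=64, rng=None):
    import numpy as np
    rng = rng or np.random.default_rng()
    tau = rng.uniform(0.0, 1.0, size=(batch_size * quantiles_num, 1))  # [N*B, 1]
    j = np.arange(quantiles_idx, dtype=np.float64)                     # [64,]
    return tau, np.cos(np.pi * j * tau)                                # [N*B, 64]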
class AC(make_off_policy_class(mode='share')): # off-policy actor-critic def __init__( self, s_dim, visual_sources, visual_resolution, a_dim, is_continuous, actor_lr=5.0e-4, critic_lr=1.0e-3, hidden_units={ 'actor_continuous': [32, 32], 'actor_discrete': [32, 32], 'critic': [32, 32] }, **kwargs): super().__init__(s_dim=s_dim, visual_sources=visual_sources, visual_resolution=visual_resolution, a_dim=a_dim, is_continuous=is_continuous, **kwargs) if self.is_continuous: self.actor_net = ActorCts(self.feat_dim, self.a_dim, hidden_units['actor_continuous']) self.log_std = tf.Variable(initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32), trainable=True) self.actor_tv = self.actor_net.trainable_variables + [self.log_std] else: self.actor_net = ActorDcs(self.feat_dim, self.a_dim, hidden_units['actor_discrete']) self.actor_tv = self.actor_net.trainable_variables self.critic_net = Critic(self.feat_dim, self.a_dim, hidden_units['critic']) self.critic_tv = self.critic_net.trainable_variables + self.other_tv self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr]) self.optimizer_actor, self.optimizer_critic = map( self.init_optimizer, [self.actor_lr, self.critic_lr]) self.model_recorder( dict(actor=self.actor_net, critic=self.critic_net, optimizer_actor=self.optimizer_actor, optimizer_critic=self.optimizer_critic)) def show_logo(self): self.logger.info(''' xx xxxxxx xxx xxx xx xxx xx xx x xx xx xx xx xxx xxxxxx xxx xx xx xx xx xx xx xxx xxx xxx xxxxx xxxxxx ''') def choose_action(self, s, visual_s, evaluation=False): a, _lp, self.cell_state = self._get_action(s, visual_s, self.cell_state) a = a.numpy() self._log_prob = _lp.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state): with tf.device(self.device): feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True) if self.is_continuous: mu = self.actor_net(feat) sample_op, _ = gaussian_clip_rsample(mu, self.log_std) log_prob = gaussian_likelihood_sum(sample_op, mu, self.log_std) else: logits = self.actor_net(feat) norm_dist = tfp.distributions.Categorical(logits) sample_op = norm_dist.sample() log_prob = norm_dist.log_prob(sample_op) return sample_op, log_prob, cell_state def store_data(self, s, visual_s, a, r, s_, visual_s_, done): assert isinstance( a, np.ndarray), "store_data need action type is np.ndarray" assert isinstance( r, np.ndarray), "store_data need reward type is np.ndarray" assert isinstance( done, np.ndarray), "store_data need done type is np.ndarray" self._running_average(s) old_log_prob = self._log_prob self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis], old_log_prob) def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done): assert isinstance( a, np.ndarray), "store_data need action type is np.ndarray" assert isinstance( r, np.ndarray), "store_data need reward type is np.ndarray" assert isinstance( done, np.ndarray), "store_data need done type is np.ndarray" self._running_average(s) old_log_prob = np.ones_like(r) self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis], old_log_prob[:, np.newaxis]) def learn(self, **kwargs): self.train_step = kwargs.get('train_step') for i in range(self.train_times_per_step): self._learn( function_dict={ 'train_function': self.train, 'update_function': lambda: None, 'summary_dict': dict([[ 'LEARNING_RATE/actor_lr', self.actor_lr(self.train_step) ], [ 'LEARNING_RATE/critic_lr', self.critic_lr(self.train_step) ]]), 'sample_data_list': [ 's', 'visual_s', 'a', 'r', 's_', 
                                     'visual_s_', 'done', 'old_log_prob'],
                'train_data_list': ['ss', 'vvss', 'a', 'r', 'done', 'old_log_prob']
            })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done, old_log_prob = memories
        with tf.device(self.device):
            # ---- critic update ----
            with tf.GradientTape() as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                if self.is_continuous:
                    next_mu = self.actor_net(feat_)
                    max_q_next = tf.stop_gradient(self.critic_net(feat_, next_mu))
                else:
                    logits = self.actor_net(feat_)
                    max_a = tf.argmax(logits, axis=1)
                    max_a_one_hot = tf.one_hot(max_a, self.a_dim, dtype=tf.float32)
                    max_q_next = tf.stop_gradient(self.critic_net(feat_, max_a_one_hot))
                q = self.critic_net(feat, a)
                td_error = q - (r + self.gamma * (1 - done) * max_q_next)
                critic_loss = tf.reduce_mean(tf.square(td_error) * isw) + crsty_loss
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            # ---- actor update ----
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    log_prob = tf.reduce_sum(tf.multiply(logp_all, a), axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                q = self.critic_net(feat, a)
                # Importance ratio between the current policy and the behavior policy;
                # both the ratio and the critic value are treated as constants.
                ratio = tf.stop_gradient(tf.exp(log_prob - old_log_prob))
                q_value = tf.stop_gradient(q)
                actor_loss = -tf.reduce_mean(ratio * log_prob * q_value)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/q_max', tf.reduce_max(q)],
                ['Statistics/q_min', tf.reduce_min(q)],
                ['Statistics/q_mean', tf.reduce_mean(q)],
                ['Statistics/ratio', tf.reduce_mean(ratio)],
                ['Statistics/entropy', entropy]
            ])

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, isw, crsty_loss, cell_state):
        ss, vvss, a, r, done, old_log_prob = memories
        with tf.device(self.device):
            # One persistent tape for both losses; gradients are taken twice below.
            with tf.GradientTape(persistent=True) as tape:
                feat, feat_ = self.get_feature(ss, vvss, cell_state=cell_state, s_and_s_=True)
                if self.is_continuous:
                    next_mu = self.actor_net(feat_)
                    max_q_next = tf.stop_gradient(self.critic_net(feat_, next_mu))
                    # actor_net returns the mean only; log_std is a separate variable.
                    mu = self.actor_net(feat)
                    log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.actor_net(feat_)
                    max_a = tf.argmax(logits, axis=1)
                    max_a_one_hot = tf.one_hot(max_a, self.a_dim, dtype=tf.float32)
                    max_q_next = tf.stop_gradient(self.critic_net(feat_, max_a_one_hot))
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    log_prob = tf.reduce_sum(tf.multiply(logp_all, a), axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(
                        tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                q = self.critic_net(feat, a)
                ratio = tf.stop_gradient(tf.exp(log_prob - old_log_prob))
                q_value = tf.stop_gradient(q)
                td_error = q - (r + self.gamma * (1 - done) * max_q_next)
                critic_loss = tf.reduce_mean(tf.square(td_error) * isw) + crsty_loss
                actor_loss = -tf.reduce_mean(ratio * log_prob * q_value)
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return td_error, dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/q_max', tf.reduce_max(q)],
                ['Statistics/q_min', tf.reduce_min(q)],
                ['Statistics/q_mean', tf.reduce_mean(q)],
                ['Statistics/ratio', tf.reduce_mean(ratio)],
                ['Statistics/entropy', entropy]
            ])
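# --- Illustrative sketch (not part of the original file) ---
# What `train` / `train_persistent` optimize for the actor, reduced to NumPy:
# an importance-weighted policy gradient. `ratio` corrects for the gap between
# the behavior policy that produced the replayed actions (old_log_prob) and the
# current policy; both the ratio and the critic value enter as constants, which
# is what the tf.stop_gradient calls above enforce. All numbers below are made
# up purely for illustration.
if __name__ == '__main__':
    import numpy as np

    log_prob = np.array([-1.2, -0.4, -2.0])      # log pi(a|s) under the current policy
    old_log_prob = np.array([-1.0, -0.5, -1.5])  # log-probs recorded at collection time
    q_value = np.array([0.8, 1.5, -0.3])         # critic estimates Q(s, a)
    ratio = np.exp(log_prob - old_log_prob)      # importance sampling ratios
    actor_loss = -np.mean(ratio * log_prob * q_value)
    print('actor_loss =', actor_loss)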