def __init__(self,
             envspec,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             discrete_tau=1.0,
             network_settings={
                 'actor_continuous': [32, 32],
                 'actor_discrete': [32, 32],
                 'q': [32, 32]
             },
             **kwargs):
    super().__init__(envspec=envspec, **kwargs)
    self.discrete_tau = discrete_tau
    if self.is_continuous:
        # self.action_noise = NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
        self.action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.a_dim), sigma=0.2 * np.ones(self.a_dim))
        self.net = ACNetwork(
            name='net',
            representation_net=self._representation_net,
            policy_net_type=OutputNetworkType.ACTOR_DPG,
            policy_net_kwargs=dict(
                output_shape=self.a_dim,
                network_settings=network_settings['actor_continuous']),
            value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
            value_net_kwargs=dict(action_dim=self.a_dim,
                                  network_settings=network_settings['q']))
    else:
        self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
        self.net = ACNetwork(
            name='net',
            representation_net=self._representation_net,
            policy_net_type=OutputNetworkType.ACTOR_DCT,
            policy_net_kwargs=dict(
                output_shape=self.a_dim,
                network_settings=network_settings['actor_discrete']),
            value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
            value_net_kwargs=dict(action_dim=self.a_dim,
                                  network_settings=network_settings['q']))
    self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
    self.optimizer_actor, self.optimizer_critic = map(
        self.init_optimizer, [self.actor_lr, self.critic_lr])
    self._worker_params_dict.update(self.net._policy_models)
    self._all_params_dict.update(self.net._all_models)
    self._all_params_dict.update(optimizer_actor=self.optimizer_actor,
                                 optimizer_critic=self.optimizer_critic)
    self._model_post_process()
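# --- Illustrative sketch (not part of the module above) ---
# A minimal Ornstein-Uhlenbeck process of the kind `OrnsteinUhlenbeckActionNoise`
# presumably implements: temporally correlated exploration noise added to the
# deterministic action. The class name, parameters and defaults below are
# assumptions for illustration, not the library's actual API.
import numpy as np


class _SketchOUNoise:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.reset()

    def reset(self):
        # Restart the process from its mean at the start of an episode.
        self.x = np.copy(self.mu)

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        self.x = self.x + self.theta * (self.mu - self.x) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        return self.x


# Usage sketch: perturb a deterministic action before stepping the environment.
# noise = _SketchOUNoise(mu=np.zeros(2), sigma=0.2 * np.ones(2))
# noisy_action = np.clip(deterministic_action + noise(), -1.0, 1.0)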
def _create_net(name, representation_net=None):
    return ACNetwork(
        name=name,
        representation_net=representation_net,
        policy_net_type=OutputNetworkType.ACTOR_DCT,
        policy_net_kwargs=dict(
            output_shape=self.a_dim,
            network_settings=network_settings['actor_discrete']),
        value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
        value_net_kwargs=dict(
            action_dim=self.a_dim,
            network_settings=network_settings['q']))
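# --- Illustrative sketch (not part of the module above) ---
# A factory like `_create_net` is commonly called twice, once for the online
# network and once for a target copy that is then tracked with a soft (Polyak)
# update. The helper below is a generic sketch of that update over two matching
# lists of tf.Variable; the usage shown is an assumption about intent, not code
# from this repository.
import tensorflow as tf


def _sketch_soft_update(target_vars, online_vars, tau=0.005):
    # target <- (1 - tau) * target + tau * online, element-wise per variable.
    for t, o in zip(target_vars, online_vars):
        t.assign((1.0 - tau) * t + tau * o)


# Usage sketch (assuming both nets expose .trainable_variables):
# net = _create_net('net', representation_net)
# target_net = _create_net('target_net', representation_net)
# _sketch_soft_update(target_net.trainable_variables, net.trainable_variables, tau=1.0)  # hard copy
# _sketch_soft_update(target_net.trainable_variables, net.trainable_variables)           # soft update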
class PPO(On_Policy): ''' Proximal Policy Optimization, https://arxiv.org/abs/1707.06347 Emergence of Locomotion Behaviours in Rich Environments, http://arxiv.org/abs/1707.02286, DPPO ''' def __init__( self, envspec, policy_epoch: int = 4, value_epoch: int = 4, ent_coef: float = 1.0e-2, vf_coef: float = 0.5, lr: float = 5.0e-4, lambda_: float = 0.95, epsilon: float = 0.2, use_duel_clip: bool = False, duel_epsilon: float = 0., use_vclip: bool = False, value_epsilon: float = 0.2, share_net: bool = True, actor_lr: float = 3e-4, critic_lr: float = 1e-3, max_grad_norm: float = 0.5, condition_sigma: bool = False, kl_reverse: bool = False, kl_target: float = 0.02, kl_target_cutoff: float = 2, kl_target_earlystop: float = 4, kl_beta: List[float] = [0.7, 1.3], kl_alpha: float = 1.5, kl_coef: float = 1.0, extra_coef: float = 1000.0, use_kl_loss: bool = False, use_extra_loss: bool = False, use_early_stop: bool = False, network_settings: Dict = { 'share': { 'continuous': { 'share': [32, 32], 'mu': [32, 32], 'v': [32, 32] }, 'discrete': { 'share': [32, 32], 'logits': [32, 32], 'v': [32, 32] } }, 'actor_continuous': [32, 32], 'actor_discrete': [32, 32], 'critic': [32, 32] }, **kwargs): super().__init__(envspec=envspec, **kwargs) self.ent_coef = ent_coef self.policy_epoch = policy_epoch self.value_epoch = value_epoch self.lambda_ = lambda_ assert 0.0 <= lambda_ <= 1.0, "GAE lambda should be in [0, 1]." self.epsilon = epsilon self.use_vclip = use_vclip self.value_epsilon = value_epsilon self.share_net = share_net self.kl_reverse = kl_reverse self.kl_target = kl_target self.kl_alpha = kl_alpha self.kl_coef = tf.constant(kl_coef, dtype=tf.float32) self.extra_coef = extra_coef self.vf_coef = vf_coef self.max_grad_norm = max_grad_norm self.use_duel_clip = use_duel_clip self.duel_epsilon = duel_epsilon if self.use_duel_clip: assert -self.epsilon < self.duel_epsilon < self.epsilon, "duel_epsilon should be set in the range of (-epsilon, epsilon)." 
self.kl_cutoff = kl_target * kl_target_cutoff self.kl_stop = kl_target * kl_target_earlystop self.kl_low = kl_target * kl_beta[0] self.kl_high = kl_target * kl_beta[-1] self.use_kl_loss = use_kl_loss self.use_extra_loss = use_extra_loss self.use_early_stop = use_early_stop if self.share_net: if self.is_continuous: self.net = ValueNetwork( name='net', representation_net=self._representation_net, value_net_type=OutputNetworkType.ACTOR_CRITIC_VALUE_CTS, value_net_kwargs=dict( output_shape=self.a_dim, condition_sigma=condition_sigma, network_settings=network_settings['share'] ['continuous'])) else: self.net = ValueNetwork( name='net', representation_net=self._representation_net, value_net_type=OutputNetworkType.ACTOR_CRITIC_VALUE_DCT, value_net_kwargs=dict( output_shape=self.a_dim, network_settings=network_settings['share'] ['discrete'])) self.lr = self.init_lr(lr) if self.max_grad_norm is not None: self.optimizer = self.init_optimizer( self.lr, clipnorm=self.max_grad_norm) else: self.optimizer = self.init_optimizer(self.lr) self._all_params_dict.update(optimizer=self.optimizer) else: if self.is_continuous: self.net = ACNetwork( name='net', representation_net=self._representation_net, policy_net_type=OutputNetworkType.ACTOR_MU_LOGSTD, policy_net_kwargs=dict( output_shape=self.a_dim, condition_sigma=condition_sigma, network_settings=network_settings['actor_continuous']), value_net_type=OutputNetworkType.CRITIC_VALUE, value_net_kwargs=dict( network_settings=network_settings['critic'])) else: self.net = ACNetwork( name='net', representation_net=self._representation_net, policy_net_type=OutputNetworkType.ACTOR_DCT, policy_net_kwargs=dict( output_shape=self.a_dim, network_settings=network_settings['actor_discrete']), value_net_type=OutputNetworkType.CRITIC_VALUE, value_net_kwargs=dict( network_settings=network_settings['critic'])) self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr]) if self.max_grad_norm is not None: self.optimizer_actor = self.init_optimizer( self.actor_lr, clipnorm=self.max_grad_norm) self.optimizer_critic = self.init_optimizer( self.critic_lr, clipnorm=self.max_grad_norm) else: self.optimizer_actor, self.optimizer_critic = map( self.init_optimizer, [self.actor_lr, self.critic_lr]) self._all_params_dict.update( optimizer_actor=self.optimizer_actor, optimizer_critic=self.optimizer_critic) self._worker_params_dict.update(self.net._policy_models) self._all_params_dict.update(self.net._all_models) self.initialize_data_buffer( store_data_type=PPO_Store_BatchExperiences, sample_data_type=PPO_Train_BatchExperiences) self._model_post_process() def choose_action(self, obs, evaluation: bool = False) -> np.ndarray: a, value, log_prob, self.next_cell_state = self._get_action( obs, self.cell_state) a = a.numpy() self._value = value.numpy() self._log_prob = log_prob.numpy() + 1e-10 return a @tf.function def _get_action(self, obs, cell_state): with tf.device(self.device): feat, cell_state = self._representation_net(obs, cell_state=cell_state) if self.is_continuous: if self.share_net: mu, log_std, value = self.net.value_net(feat) else: mu, log_std = self.net.policy_net(feat) value = self.net.value_net(feat) sample_op, _ = gaussian_clip_rsample(mu, log_std) log_prob = gaussian_likelihood_sum(sample_op, mu, log_std) else: if self.share_net: logits, value = self.net.value_net(feat) else: logits = self.net.policy_net(feat) value = self.net.value_net(feat) norm_dist = tfp.distributions.Categorical( logits=tf.nn.log_softmax(logits)) sample_op = norm_dist.sample() log_prob = 
norm_dist.log_prob(sample_op) return sample_op, value, log_prob, cell_state def store_data(self, exps: BatchExperiences) -> NoReturn: # self._running_average() self.data.add( PPO_Store_BatchExperiences(*exps, self._value, self._log_prob)) if self.use_rnn: self.data.add_cell_state( tuple(cs.numpy() for cs in self.cell_state)) self.cell_state = self.next_cell_state @tf.function def _get_value(self, obs, cell_state): with tf.device(self.device): feat, cell_state = self._representation_net(obs, cell_state=cell_state) output = self.net.value_net(feat) if self.is_continuous: if self.share_net: _, _, value = output else: value = output else: if self.share_net: _, value = output else: value = output return value, cell_state def calculate_statistics(self) -> NoReturn: init_value, self.cell_state = self._get_value( self.data.last_data('obs_'), cell_state=self.cell_state) init_value = init_value.numpy() self.data.cal_dc_r(self.gamma, init_value) self.data.cal_td_error(self.gamma, init_value) self.data.cal_gae_adv(self.lambda_, self.gamma, normalize=True) # @show_graph(name='ppo_net') def learn(self, **kwargs) -> NoReturn: self.train_step = kwargs.get('train_step') def _train(data, cell_state): early_step = 0 if self.share_net: for i in range(self.policy_epoch): actor_loss, critic_loss, entropy, kl = self.train_share( data, cell_state, self.kl_coef) if self.use_early_stop and kl > self.kl_stop: early_step = i break else: for i in range(self.policy_epoch): actor_loss, entropy, kl = self.train_actor( data, cell_state, self.kl_coef) if self.use_early_stop and kl > self.kl_stop: early_step = i break for _ in range(self.value_epoch): critic_loss = self.train_critic(data, cell_state) summaries = dict([['LOSS/actor_loss', actor_loss], ['LOSS/critic_loss', critic_loss], ['Statistics/kl', kl], ['Statistics/entropy', entropy]]) if self.use_early_stop: summaries.update(dict([['Statistics/early_step', early_step]])) if self.use_kl_loss: # ref: https://github.com/joschu/modular_rl/blob/6970cde3da265cf2a98537250fea5e0c0d9a7639/modular_rl/ppo.py#L93 if kl > self.kl_high: self.kl_coef *= self.kl_alpha elif kl < self.kl_low: self.kl_coef /= self.kl_alpha summaries.update(dict([['Statistics/kl_coef', self.kl_coef]])) return summaries if self.share_net: summary_dict = dict( [['LEARNING_RATE/lr', self.lr(self.train_step)]]) else: summary_dict = dict( [['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)], ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)]]) self._learn( function_dict={ 'calculate_statistics': self.calculate_statistics, 'train_function': _train, 'summary_dict': summary_dict, 'train_data_type': PPO_Train_BatchExperiences }) @tf.function def train_share(self, BATCH, cell_state, kl_coef): with tf.device(self.device): with tf.GradientTape() as tape: output, cell_state = self.net(BATCH.obs, cell_state=cell_state) if self.is_continuous: mu, log_std, value = output new_log_prob = gaussian_likelihood_sum( BATCH.action, mu, log_std) entropy = gaussian_entropy(log_std) else: logits, value = output logp_all = tf.nn.log_softmax(logits) new_log_prob = tf.reduce_sum(BATCH.action * logp_all, axis=1, keepdims=True) entropy = -tf.reduce_mean( tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True)) ratio = tf.exp(new_log_prob - BATCH.log_prob) surrogate = ratio * BATCH.gae_adv clipped_surrogate = tf.minimum( surrogate, tf.clip_by_value(ratio, 1.0 - self.epsilon, 1.0 + self.epsilon) * BATCH.gae_adv) # ref: 
https://github.com/thu-ml/tianshou/blob/c97aa4065ee8464bd5897bb86f1f81abd8e2cff9/tianshou/policy/modelfree/ppo.py#L159 if self.use_duel_clip: clipped_surrogate = tf.maximum(clipped_surrogate, (1.0 + self.duel_epsilon) * BATCH.gae_adv) actor_loss = -(tf.reduce_mean(clipped_surrogate) + self.ent_coef * entropy) # ref: https://github.com/joschu/modular_rl/blob/6970cde3da265cf2a98537250fea5e0c0d9a7639/modular_rl/ppo.py#L40 # ref: https://github.com/hill-a/stable-baselines/blob/b3f414f4f2900403107357a2206f80868af16da3/stable_baselines/ppo2/ppo2.py#L185 if self.kl_reverse: kl = .5 * tf.reduce_mean( tf.square(new_log_prob - BATCH.log_prob)) else: kl = .5 * tf.reduce_mean( tf.square(BATCH.log_prob - new_log_prob) ) # a sample estimate for KL-divergence, easy to compute td_error = BATCH.discounted_reward - value if self.use_vclip: # ref: https://github.com/llSourcell/OpenAI_Five_vs_Dota2_Explained/blob/c5def7e57aa70785c2394ea2eeb3e5f66ad59a53/train.py#L154 # ref: https://github.com/hill-a/stable-baselines/blob/b3f414f4f2900403107357a2206f80868af16da3/stable_baselines/ppo2/ppo2.py#L172 value_clip = BATCH.value + tf.clip_by_value( value - BATCH.value, -self.value_epsilon, self.value_epsilon) td_error_clip = BATCH.discounted_reward - value_clip td_square = tf.maximum(tf.square(td_error), tf.square(td_error_clip)) else: td_square = tf.square(td_error) if self.use_kl_loss: kl_loss = kl_coef * kl actor_loss += kl_loss if self.use_extra_loss: extra_loss = self.extra_coef * tf.square( tf.maximum(0., kl - self.kl_cutoff)) actor_loss += extra_loss value_loss = 0.5 * tf.reduce_mean(td_square) loss = actor_loss + self.vf_coef * value_loss loss_grads = tape.gradient(loss, self.net.trainable_variables) self.optimizer.apply_gradients( zip(loss_grads, self.net.trainable_variables)) self.global_step.assign_add(1) return actor_loss, value_loss, entropy, kl @tf.function def train_actor(self, BATCH, cell_state, kl_coef): with tf.device(self.device): with tf.GradientTape() as tape: output, _ = self.net(BATCH.obs, cell_state=cell_state) if self.is_continuous: mu, log_std = output new_log_prob = gaussian_likelihood_sum( BATCH.action, mu, log_std) entropy = gaussian_entropy(log_std) else: logits = output logp_all = tf.nn.log_softmax(logits) new_log_prob = tf.reduce_sum(BATCH.action * logp_all, axis=1, keepdims=True) entropy = -tf.reduce_mean( tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True)) ratio = tf.exp(new_log_prob - BATCH.log_prob) kl = tf.reduce_mean(BATCH.log_prob - new_log_prob) surrogate = ratio * BATCH.gae_adv clipped_surrogate = tf.minimum( surrogate, tf.where(BATCH.gae_adv > 0, (1 + self.epsilon) * BATCH.gae_adv, (1 - self.epsilon) * BATCH.gae_adv)) if self.use_duel_clip: clipped_surrogate = tf.maximum(clipped_surrogate, (1.0 + self.duel_epsilon) * BATCH.gae_adv) actor_loss = -(tf.reduce_mean(clipped_surrogate) + self.ent_coef * entropy) if self.use_kl_loss: kl_loss = kl_coef * kl actor_loss += kl_loss if self.use_extra_loss: extra_loss = self.extra_coef * tf.square( tf.maximum(0., kl - self.kl_cutoff)) actor_loss += extra_loss actor_grads = tape.gradient(actor_loss, self.net.actor_trainable_variables) self.optimizer_actor.apply_gradients( zip(actor_grads, self.net.actor_trainable_variables)) self.global_step.assign_add(1) return actor_loss, entropy, kl @tf.function def train_critic(self, BATCH, cell_state): with tf.device(self.device): with tf.GradientTape() as tape: feat, _ = self._representation_net(BATCH.obs, cell_state=cell_state) value = self.net.value_net(feat) td_error = 
BATCH.discounted_reward - value if self.use_vclip: value_clip = BATCH.value + tf.clip_by_value( value - BATCH.value, -self.value_epsilon, self.value_epsilon) td_error_clip = BATCH.discounted_reward - value_clip td_square = tf.maximum(tf.square(td_error), tf.square(td_error_clip)) else: td_square = tf.square(td_error) value_loss = 0.5 * tf.reduce_mean(td_square) critic_grads = tape.gradient(value_loss, self.net.critic_trainable_variables) self.optimizer_critic.apply_gradients( zip(critic_grads, self.net.critic_trainable_variables)) return value_loss
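# --- Illustrative sketch (not part of the class above) ---
# `calculate_statistics` relies on the buffer's `cal_gae_adv` to compute
# Generalized Advantage Estimation. A minimal standalone version of that
# computation (assuming flat numpy arrays and a scalar bootstrap value) looks
# roughly like this; the buffer's real implementation may differ in details
# such as normalization and episode masking.
import numpy as np


def _sketch_gae(rewards, values, dones, last_value, gamma=0.99, lambda_=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    # A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
    values_ext = np.append(values, last_value)
    adv = np.zeros_like(rewards, dtype=np.float64)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values_ext[t + 1] * (1.0 - dones[t]) - values_ext[t]
        gae = delta + gamma * lambda_ * (1.0 - dones[t]) * gae
        adv[t] = gae
    returns = adv + values              # value-function regression targets
    adv = (adv - adv.mean()) / (adv.std() + 1e-8)  # mirrors normalize=True above
    return adv, returns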
class AC(Off_Policy): # off-policy actor-critic def __init__( self, envspec, actor_lr=5.0e-4, critic_lr=1.0e-3, condition_sigma: bool = False, network_settings={ 'actor_continuous': [32, 32], 'actor_discrete': [32, 32], 'critic': [32, 32] }, **kwargs): super().__init__(envspec=envspec, **kwargs) if self.is_continuous: self.net = ACNetwork( name='net', representation_net=self._representation_net, policy_net_type=OutputNetworkType.ACTOR_MU_LOGSTD, policy_net_kwargs=dict( output_shape=self.a_dim, condition_sigma=condition_sigma, network_settings=network_settings['actor_continuous']), value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE, value_net_kwargs=dict( action_dim=self.a_dim, network_settings=network_settings['critic'])) else: self.net = ACNetwork( name='net', representation_net=self._representation_net, policy_net_type=OutputNetworkType.ACTOR_DCT, policy_net_kwargs=dict( output_shape=self.a_dim, network_settings=network_settings['actor_discrete']), value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE, value_net_kwargs=dict( action_dim=self.a_dim, network_settings=network_settings['critic'])) self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr]) self.optimizer_actor, self.optimizer_critic = map( self.init_optimizer, [self.actor_lr, self.critic_lr]) self._worker_params_dict.update(self.net._policy_models) self._all_params_dict.update(self.net._all_models) self._all_params_dict.update(optimizer_actor=self.optimizer_actor, optimizer_critic=self.optimizer_critic) self._model_post_process() def choose_action(self, s, visual_s, evaluation=False): a, _lp, self.cell_state = self._get_action(s, visual_s, self.cell_state) a = a.numpy() self._log_prob = _lp.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state): with tf.device(self.device): output, cell_state = self.net(s, visual_s, cell_state=cell_state) if self.is_continuous: mu, log_std = output sample_op, _ = gaussian_clip_rsample(mu, log_std) log_prob = gaussian_likelihood_sum(sample_op, mu, log_std) else: logits = output norm_dist = tfp.distributions.Categorical( logits=tf.nn.log_softmax(logits)) sample_op = norm_dist.sample() log_prob = norm_dist.log_prob(sample_op) return sample_op, log_prob, cell_state def store_data(self, s, visual_s, a, r, s_, visual_s_, done): assert isinstance( a, np.ndarray), "store_data need action type is np.ndarray" assert isinstance( r, np.ndarray), "store_data need reward type is np.ndarray" assert isinstance( done, np.ndarray), "store_data need done type is np.ndarray" self._running_average(s) old_log_prob = self._log_prob self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis], old_log_prob) def no_op_store(self, s, visual_s, a, r, s_, visual_s_, done): assert isinstance( a, np.ndarray), "store_data need action type is np.ndarray" assert isinstance( r, np.ndarray), "store_data need reward type is np.ndarray" assert isinstance( done, np.ndarray), "store_data need done type is np.ndarray" self._running_average(s) old_log_prob = np.ones_like(r) self.data.add(s, visual_s, a, r[:, np.newaxis], s_, visual_s_, done[:, np.newaxis], old_log_prob[:, np.newaxis]) def learn(self, **kwargs): self.train_step = kwargs.get('train_step') for i in range(self.train_times_per_step): self._learn( function_dict={ 'summary_dict': dict([[ 'LEARNING_RATE/actor_lr', self.actor_lr(self.train_step) ], [ 'LEARNING_RATE/critic_lr', self.critic_lr(self.train_step) ]]), 'sample_data_list': [ 's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'old_log_prob' ], 
'train_data_list': ['ss', 'vvss', 'a', 'r', 'done', 'old_log_prob'] }) @tf.function(experimental_relax_shapes=True) def _train(self, memories, isw, cell_state): ss, vvss, a, r, done, old_log_prob = memories with tf.device(self.device): with tf.GradientTape(persistent=True) as tape: (feat, feat_), _ = self._representation_net(ss, vvss, cell_state=cell_state, need_split=True) if self.is_continuous: mu, log_std = self.net.policy_net(feat) log_prob = gaussian_likelihood_sum(a, mu, log_std) entropy = gaussian_entropy(log_std) next_mu, _ = self.net.policy_net(feat_) max_q_next = tf.stop_gradient( self.net.value_net(feat_, next_mu)) else: logits = self.net.policy_net(feat) logp_all = tf.nn.log_softmax(logits) log_prob = tf.reduce_sum(tf.multiply(logp_all, a), axis=1, keepdims=True) entropy = -tf.reduce_mean( tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True)) logits = self.net.policy_net(feat_) max_a = tf.argmax(logits, axis=1) max_a_one_hot = tf.one_hot(max_a, self.a_dim) max_q_next = tf.stop_gradient( self.net.value_net(feat_, max_a_one_hot)) q = self.net.value_net(feat, a) ratio = tf.stop_gradient(tf.exp(log_prob - old_log_prob)) q_value = tf.stop_gradient(q) td_error = q - (r + self.gamma * (1 - done) * max_q_next) critic_loss = tf.reduce_mean(tf.square(td_error) * isw) actor_loss = -tf.reduce_mean(ratio * log_prob * q_value) critic_grads = tape.gradient(critic_loss, self.net.critic_trainable_variables) self.optimizer_critic.apply_gradients( zip(critic_grads, self.net.critic_trainable_variables)) actor_grads = tape.gradient(actor_loss, self.net.actor_trainable_variables) self.optimizer_actor.apply_gradients( zip(actor_grads, self.net.actor_trainable_variables)) self.global_step.assign_add(1) return td_error, dict([['LOSS/actor_loss', actor_loss], ['LOSS/critic_loss', critic_loss], ['Statistics/q_max', tf.reduce_max(q)], ['Statistics/q_min', tf.reduce_min(q)], ['Statistics/q_mean', tf.reduce_mean(q)], ['Statistics/ratio', tf.reduce_mean(ratio)], ['Statistics/entropy', entropy]])
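# --- Illustrative sketch (not part of the class above) ---
# The `_train` step above weights the policy-gradient term by an importance
# ratio between the current policy and the behaviour policy that generated the
# replayed data. A toy numpy version of the two losses (1-step Q target for the
# critic, ratio-weighted log-prob * Q for the actor) is shown below with
# made-up array names; it only mirrors the structure of the TensorFlow code.
import numpy as np


def _sketch_ac_losses(q, max_q_next, r, done, log_prob, old_log_prob, gamma=0.99):
    ratio = np.exp(log_prob - old_log_prob)            # pi(a|s) / pi_old(a|s)
    td_target = r + gamma * (1.0 - done) * max_q_next  # bootstrap from greedy next action
    td_error = q - td_target
    critic_loss = np.mean(np.square(td_error))
    actor_loss = -np.mean(ratio * log_prob * q)        # q is treated as a constant weight
    return actor_loss, critic_loss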
class A2C(On_Policy): def __init__( self, envspec, epoch=5, beta=1.0e-3, actor_lr=5.0e-4, critic_lr=1.0e-3, condition_sigma: bool = False, network_settings={ 'actor_continuous': [32, 32], 'actor_discrete': [32, 32], 'critic': [32, 32] }, **kwargs): super().__init__(envspec=envspec, **kwargs) self.beta = beta self.epoch = epoch if self.is_continuous: self.net = ACNetwork( name='net', representation_net=self._representation_net, policy_net_type=OutputNetworkType.ACTOR_MU_LOGSTD, policy_net_kwargs=dict( output_shape=self.a_dim, condition_sigma=condition_sigma, network_settings=network_settings['actor_continuous']), value_net_type=OutputNetworkType.CRITIC_VALUE, value_net_kwargs=dict( network_settings=network_settings['critic'])) else: self.net = ACNetwork( name='net', representation_net=self._representation_net, policy_net_type=OutputNetworkType.ACTOR_DCT, policy_net_kwargs=dict( output_shape=self.a_dim, network_settings=network_settings['actor_discrete']), value_net_type=OutputNetworkType.CRITIC_VALUE, value_net_kwargs=dict( network_settings=network_settings['critic'])) self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr]) self.optimizer_actor, self.optimizer_critic = map( self.init_optimizer, [self.actor_lr, self.critic_lr]) self.initialize_data_buffer( sample_data_type=A2C_Train_BatchExperiences) self._worker_params_dict.update(self.net._policy_models) self._all_params_dict.update(self.net._all_models) self._all_params_dict.update(optimizer_actor=self.optimizer_actor, optimizer_critic=self.optimizer_critic) self._model_post_process() def choose_action(self, obs, evaluation=False): a, self.next_cell_state = self._get_action(obs, self.cell_state) a = a.numpy() return a @tf.function def _get_action(self, obs, cell_state): with tf.device(self.device): output, cell_state = self.net(obs, cell_state=cell_state) if self.is_continuous: mu, log_std = output sample_op, _ = gaussian_clip_rsample(mu, log_std) else: logits = output norm_dist = tfp.distributions.Categorical( logits=tf.nn.log_softmax(logits)) sample_op = norm_dist.sample() return sample_op, cell_state @tf.function def _get_value(self, obs, cell_state): with tf.device(self.device): feat, cell_state = self._representation_net(obs, cell_state=cell_state) value = self.net.value_net(feat) return value, cell_state def calculate_statistics(self): init_value, self.cell_state = self._get_value( self.data.last_data('obs_'), cell_state=self.cell_state) self.data.cal_dc_r(self.gamma, init_value.numpy()) def learn(self, **kwargs): self.train_step = kwargs.get('train_step') def _train(data, cell_state): for _ in range(self.epoch): actor_loss, critic_loss, entropy = self.train(data, cell_state) summaries = dict([ ['LOSS/actor_loss', actor_loss], ['LOSS/critic_loss', critic_loss], ['Statistics/entropy', entropy], ]) return summaries self._learn( function_dict={ 'calculate_statistics': self.calculate_statistics, 'train_function': _train, 'summary_dict': dict([[ 'LEARNING_RATE/actor_lr', self.actor_lr(self.train_step) ], [ 'LEARNING_RATE/critic_lr', self.critic_lr(self.train_step) ]]) }) @tf.function def train(self, BATCH, cell_state): with tf.device(self.device): with tf.GradientTape(persistent=True) as tape: feat, _ = self._representation_net(BATCH.obs, cell_state=cell_state) if self.is_continuous: mu, log_std = self.net.policy_net(feat) log_act_prob = gaussian_likelihood_sum( BATCH.action, mu, log_std) entropy = gaussian_entropy(log_std) else: logits = self.net.policy_net(feat) logp_all = tf.nn.log_softmax(logits) log_act_prob = 
tf.reduce_sum(BATCH.action * logp_all, axis=1, keepdims=True) entropy = -tf.reduce_mean( tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True)) v = self.net.value_net(feat) advantage = tf.stop_gradient(BATCH.discounted_reward - v) td_error = BATCH.discounted_reward - v critic_loss = tf.reduce_mean(tf.square(td_error)) actor_loss = -(tf.reduce_mean(log_act_prob * advantage) + self.beta * entropy) critic_grads = tape.gradient(critic_loss, self.net.critic_trainable_variables) self.optimizer_critic.apply_gradients( zip(critic_grads, self.net.critic_trainable_variables)) if self.is_continuous: actor_grads = tape.gradient(actor_loss, self.net.actor_trainable_variables) self.optimizer_actor.apply_gradients( zip(actor_grads, self.net.actor_trainable_variables)) else: actor_grads = tape.gradient(actor_loss, self.net.actor_trainable_variables) self.optimizer_actor.apply_gradients( zip(actor_grads, self.net.actor_trainable_variables)) self.global_step.assign_add(1) return actor_loss, critic_loss, entropy
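# --- Illustrative sketch (not part of the class above) ---
# A2C's advantage is simply `discounted_reward - V(s)`, with the discounted
# return bootstrapped from the value of the last observation (see
# `calculate_statistics` above). A standalone numpy version of that backward
# pass, under the assumption of flat arrays, could look like this.
import numpy as np


def _sketch_discounted_returns(rewards, dones, last_value, gamma=0.99):
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = last_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * (1.0 - dones[t]) * running
        returns[t] = running
    return returns  # advantage = returns - value_estimates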
class DPG(Off_Policy):
    '''
    Deterministic Policy Gradient, https://hal.inria.fr/file/index/docid/938992/filename/dpg-icml2014.pdf
    '''

    # off-policy DPG

    def __init__(self,
                 envspec,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 use_target_action_noise=False,
                 gaussian_noise_sigma=0.2,
                 gaussian_noise_bound=0.2,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(envspec=envspec, **kwargs)
        self.discrete_tau = discrete_tau
        self.use_target_action_noise = use_target_action_noise
        self.gaussian_noise_sigma = gaussian_noise_sigma
        self.gaussian_noise_bound = gaussian_noise_bound
        if self.is_continuous:
            self.target_noised_action = ClippedNormalNoisedAction(
                sigma=self.gaussian_noise_sigma,
                noise_bound=self.gaussian_noise_bound)
            self.noised_action = OrnsteinUhlenbeckNoisedAction(sigma=0.2)
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DPG,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_continuous']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      network_settings=network_settings['q']))
        else:
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)
            self.net = ACNetwork(
                name='net',
                representation_net=self._representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(
                    output_shape=self.a_dim,
                    network_settings=network_settings['actor_discrete']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      network_settings=network_settings['q']))
        self.actor_lr, self.critic_lr = map(self.init_lr,
                                            [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(
            self.init_optimizer, [self.actor_lr, self.critic_lr])
        self._worker_params_dict.update(self.net._policy_models)
        self._all_params_dict.update(self.net._all_models)
        self._all_params_dict.update(optimizer_actor=self.optimizer_actor,
                                     optimizer_critic=self.optimizer_critic)
        self._model_post_process()

    def reset(self):
        super().reset()
        if self.is_continuous:
            self.noised_action.reset()

    def choose_action(self, obs, evaluation=False):
        mu, pi, self.cell_state = self._get_action(obs, self.cell_state)
        a = mu.numpy() if evaluation else pi.numpy()
        return a

    @tf.function
    def _get_action(self, obs, cell_state):
        with tf.device(self.device):
            output, cell_state = self.net(obs, cell_state=cell_state)
            if self.is_continuous:
                mu = output
                pi = self.noised_action(mu)
            else:
                logits = output
                mu = tf.argmax(logits, axis=1)
                cate_dist = tfp.distributions.Categorical(
                    logits=tf.nn.log_softmax(logits))
                pi = cate_dist.sample()
            return mu, pi, cell_state

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'summary_dict':
                    dict([['LEARNING_RATE/actor_lr',
                           self.actor_lr(self.train_step)],
                          ['LEARNING_RATE/critic_lr',
                           self.critic_lr(self.train_step)]]),
                    'use_stack': True
                })

    @tf.function
    def _train(self, BATCH, isw, cell_state):
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                (feat, feat_), _ = self._representation_net(
                    BATCH.obs, cell_state=cell_state, need_split=True)
                if self.is_continuous:
                    # DPG keeps no separate target network, so the online
                    # policy also produces the bootstrap action.
                    action_target = self.net.policy_net(feat_)
                    if self.use_target_action_noise:
                        action_target = self.target_noised_action(
                            action_target)
                    mu = self.net.policy_net(feat)
                else:
                    target_logits = self.net.policy_net(feat_)
                    target_cate_dist = tfp.distributions.Categorical(
                        logits=tf.nn.log_softmax(target_logits))
                    target_pi = target_cate_dist.sample()
                    target_log_pi = target_cate_dist.log_prob(target_pi)
                    action_target = tf.one_hot(target_pi,
                                               self.a_dim,
                                               dtype=tf.float32)
                    logits = self.net.policy_net(feat)
                    _pi = tf.nn.softmax(logits)
                    _pi_true_one_hot = tf.one_hot(tf.argmax(logits, axis=-1),
                                                  self.a_dim,
                                                  dtype=tf.float32)
                    _pi_diff = tf.stop_gradient(_pi_true_one_hot - _pi)
                    mu = _pi_diff + _pi
                q_target = self.net.value_net(feat_, action_target)
                dc_r = tf.stop_gradient(BATCH.reward + self.gamma * q_target *
                                        (1 - BATCH.done))
                q = self.net.value_net(feat, BATCH.action)
                td_error = dc_r - q
                q_loss = 0.5 * tf.reduce_mean(tf.square(td_error) * isw)
                q_actor = self.net.value_net(feat, mu)
                actor_loss = -tf.reduce_mean(q_actor)
            q_grads = tape.gradient(q_loss,
                                    self.net.critic_trainable_variables)
            self.optimizer_critic.apply_gradients(
                zip(q_grads, self.net.critic_trainable_variables))
            actor_grads = tape.gradient(actor_loss,
                                        self.net.actor_trainable_variables)
            self.optimizer_actor.apply_gradients(
                zip(actor_grads, self.net.actor_trainable_variables))
            self.global_step.assign_add(1)
            return td_error, dict([['LOSS/actor_loss', actor_loss],
                                   ['LOSS/critic_loss', q_loss],
                                   ['Statistics/q_min', tf.reduce_min(q)],
                                   ['Statistics/q_mean', tf.reduce_mean(q)],
                                   ['Statistics/q_max', tf.reduce_max(q)]])
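# --- Illustrative sketch (not part of the class above) ---
# The discrete branch of `_train` uses a straight-through trick: the critic is
# fed a hard one-hot action in the forward pass while gradients flow through
# the softmax in the backward pass. The function below isolates that idea on
# dummy logits and weights; it is for illustration only.
import tensorflow as tf


def _sketch_straight_through_demo():
    logits = tf.Variable([[1.0, 2.0, 0.5]])
    weights = tf.constant([[0.1, 0.2, 0.3]])
    with tf.GradientTape() as tape:
        probs = tf.nn.softmax(logits)
        hard = tf.one_hot(tf.argmax(logits, axis=-1),
                          logits.shape[-1], dtype=tf.float32)
        # Forward value equals the hard one-hot; gradient equals that of probs.
        straight_through = tf.stop_gradient(hard - probs) + probs
        objective = tf.reduce_sum(straight_through * weights)
    return tape.gradient(objective, logits)  # non-None: gradients pass through the soft path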
class TRPO(On_Policy): ''' Trust Region Policy Optimization, https://arxiv.org/abs/1502.05477 ''' def __init__( self, envspec, beta=1.0e-3, lr=5.0e-4, delta=0.01, lambda_=0.95, cg_iters=10, train_v_iters=10, damping_coeff=0.1, backtrack_iters=10, backtrack_coeff=0.8, epsilon=0.2, critic_lr=1e-3, condition_sigma: bool = False, network_settings={ 'actor_continuous': [32, 32], 'actor_discrete': [32, 32], 'critic': [32, 32] }, **kwargs): super().__init__(envspec=envspec, **kwargs) self.beta = beta self.delta = delta self.lambda_ = lambda_ self.epsilon = epsilon self.cg_iters = cg_iters self.damping_coeff = damping_coeff self.backtrack_iters = backtrack_iters self.backtrack_coeff = backtrack_coeff self.train_v_iters = train_v_iters if self.is_continuous: self.net = ACNetwork( name='net', representation_net=self._representation_net, policy_net_type=OutputNetworkType.ACTOR_MU_LOGSTD, policy_net_kwargs=dict( output_shape=self.a_dim, condition_sigma=condition_sigma, network_settings=network_settings['actor_continuous']), value_net_type=OutputNetworkType.CRITIC_VALUE, value_net_kwargs=dict( network_settings=network_settings['critic'])) else: self.net = ACNetwork( name='net', representation_net=self._representation_net, policy_net_type=OutputNetworkType.ACTOR_DCT, policy_net_kwargs=dict( output_shape=self.a_dim, network_settings=network_settings['actor_discrete']), value_net_type=OutputNetworkType.CRITIC_VALUE, value_net_kwargs=dict( network_settings=network_settings['critic'])) self.critic_lr = self.init_lr(critic_lr) self.optimizer_critic = self.init_optimizer(self.critic_lr) if self.is_continuous: data_name_list = [ 's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value', 'log_prob', 'old_mu', 'old_log_std' ] else: data_name_list = [ 's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value', 'log_prob', 'old_logp_all' ] self.initialize_data_buffer(data_name_list=data_name_list) self._worker_params_dict.update(self.net._policy_models) self._all_params_dict.update(self.net._all_models) self._all_params_dict.update(optimizer_critic=self.optimizer_critic) self._model_post_process() def choose_action(self, s, visual_s, evaluation=False): a, _v, _lp, _morlpa, self.next_cell_state = self._get_action( s, visual_s, self.cell_state) a = a.numpy() self._value = np.squeeze(_v.numpy()) self._log_prob = np.squeeze(_lp.numpy()) + 1e-10 if self.is_continuous: self._mu = _morlpa[0].numpy() self._log_std = _morlpa[1].numpy() else: self._logp_all = _morlpa.numpy() return a @tf.function def _get_action(self, s, visual_s, cell_state): with tf.device(self.device): feat, cell_state = self._representation_net(s, visual_s, cell_state=cell_state) value = self.net.value_net(feat) output = self.net.policy_net(feat) if self.is_continuous: mu, log_std = output sample_op, _ = gaussian_clip_rsample(mu, log_std) log_prob = gaussian_likelihood_sum(sample_op, mu, log_std) return sample_op, value, log_prob, (mu, log_std), cell_state else: logits = output logp_all = tf.nn.log_softmax(logits) norm_dist = tfp.distributions.Categorical(logits=logp_all) sample_op = norm_dist.sample() log_prob = norm_dist.log_prob(sample_op) return sample_op, value, log_prob, logp_all, cell_state def store_data(self, s, visual_s, a, r, s_, visual_s_, done): assert isinstance( a, np.ndarray), "store_data need action type is np.ndarray" assert isinstance( r, np.ndarray), "store_data need reward type is np.ndarray" assert isinstance( done, np.ndarray), "store_data need done type is np.ndarray" self._running_average(s) if self.is_continuous: data = 
(s, visual_s, a, r, s_, visual_s_, done, self._value, self._log_prob, self._mu, self._log_std) else: data = (s, visual_s, a, r, s_, visual_s_, done, self._value, self._log_prob, self._logp_all) if self.use_rnn: data += tuple(cs.numpy() for cs in self.cell_state) self.data.add(*data) self.cell_state = self.next_cell_state @tf.function def _get_value(self, s, visual_s, cell_state): with tf.device(self.device): feat, cell_state = self._representation_net(s, visual_s, cell_state=cell_state) value = self.net.value_net(feat) return value, cell_state def calculate_statistics(self): init_value, self.cell_state = self._get_value( self.data.last_s(), self.data.last_visual_s(), cell_state=self.cell_state) init_value = np.squeeze(init_value.numpy()) self.data.cal_dc_r(self.gamma, init_value) self.data.cal_td_error(self.gamma, init_value) self.data.cal_gae_adv(self.lambda_, self.gamma) def learn(self, **kwargs): self.train_step = kwargs.get('train_step') def _train(data): if self.is_continuous: s, visual_s, a, dc_r, old_log_prob, advantage, old_mu, old_log_std, cell_state = data Hx_args = (s, visual_s, old_mu, old_log_std, cell_state) else: s, visual_s, a, dc_r, old_log_prob, advantage, old_logp_all, cell_state = data Hx_args = (s, visual_s, old_logp_all, cell_state) actor_loss, entropy, gradients = self.train_actor( (s, visual_s, a, old_log_prob, advantage, cell_state)) x = self.cg(self.Hx, gradients.numpy(), Hx_args) x = tf.convert_to_tensor(x) alpha = np.sqrt(2 * self.delta / (np.dot(x, self.Hx(x, *Hx_args)) + 1e-8)) for i in range(self.backtrack_iters): assign_params_from_flat(alpha * x * (self.backtrack_coeff**i), self.net.actor_trainable_variables) for _ in range(self.train_v_iters): critic_loss = self.train_critic( (s, visual_s, dc_r, cell_state)) summaries = dict([['LOSS/actor_loss', actor_loss], ['LOSS/critic_loss', critic_loss], ['Statistics/entropy', entropy]]) return summaries if self.is_continuous: train_data_list = [ 's', 'visual_s', 'a', 'discounted_reward', 'log_prob', 'gae_adv', 'old_mu', 'old_log_std' ] else: train_data_list = [ 's', 'visual_s', 'a', 'discounted_reward', 'log_prob', 'gae_adv', 'old_logp_all' ] self._learn( function_dict={ 'calculate_statistics': self.calculate_statistics, 'train_function': _train, 'train_data_list': train_data_list, 'summary_dict': dict([[ 'LEARNING_RATE/critic_lr', self.critic_lr(self.train_step) ]]) }) @tf.function(experimental_relax_shapes=True) def train_actor(self, memories): s, visual_s, a, old_log_prob, advantage, cell_state = memories with tf.device(self.device): with tf.GradientTape() as tape: output, _ = self.net(s, visual_s, cell_state=cell_state) if self.is_continuous: mu, log_std = output new_log_prob = gaussian_likelihood_sum(a, mu, log_std) entropy = gaussian_entropy(log_std) else: logits = output logp_all = tf.nn.log_softmax(logits) new_log_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True) entropy = -tf.reduce_mean( tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True)) ratio = tf.exp(new_log_prob - old_log_prob) actor_loss = -tf.reduce_mean(ratio * advantage) actor_grads = tape.gradient(actor_loss, self.net.actor_trainable_variables) gradients = flat_concat(actor_grads) self.global_step.assign_add(1) return actor_loss, entropy, gradients @tf.function(experimental_relax_shapes=True) def Hx(self, x, *args): if self.is_continuous: s, visual_s, old_mu, old_log_std, cell_state = args else: s, visual_s, old_logp_all, cell_state = args with tf.device(self.device): with tf.GradientTape(persistent=True) as tape: output, _ = 
self.net(s, visual_s, cell_state=cell_state) if self.is_continuous: mu, log_std = output var0, var1 = tf.exp(2 * log_std), tf.exp(2 * old_log_std) pre_sum = 0.5 * (((old_mu - mu)**2 + var0) / (var1 + 1e-8) - 1) + old_log_std - log_std all_kls = tf.reduce_sum(pre_sum, axis=1) kl = tf.reduce_mean(all_kls) else: logits = output logp_all = tf.nn.log_softmax(logits) all_kls = tf.reduce_sum(tf.exp(old_logp_all) * (old_logp_all - logp_all), axis=1) kl = tf.reduce_mean(all_kls) g = flat_concat( tape.gradient(kl, self.net.actor_trainable_variables)) _g = tf.reduce_sum(g * x) hvp = flat_concat( tape.gradient(_g, self.net.actor_trainable_variables)) if self.damping_coeff > 0: hvp += self.damping_coeff * x return hvp @tf.function(experimental_relax_shapes=True) def train_critic(self, memories): s, visual_s, dc_r, cell_state = memories with tf.device(self.device): with tf.GradientTape() as tape: feat, _ = self._representation_net(s, visual_s, cell_state=cell_state) value = self.net.value_net(feat) td_error = dc_r - value value_loss = tf.reduce_mean(tf.square(td_error)) critic_grads = tape.gradient(value_loss, self.net.critic_trainable_variables) self.optimizer_critic.apply_gradients( zip(critic_grads, self.net.critic_trainable_variables)) return value_loss def cg(self, Ax, b, args): """ Conjugate gradient algorithm (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) """ x = np.zeros_like(b) r = b.copy( ) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. p = r.copy() r_dot_old = np.dot(r, r) for _ in range(self.cg_iters): z = Ax(tf.convert_to_tensor(p), *args) alpha = r_dot_old / (np.dot(p, z) + 1e-8) x += alpha * p r -= alpha * z r_dot_new = np.dot(r, r) p = r + (r_dot_new / r_dot_old) * p r_dot_old = r_dot_new return x
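# --- Illustrative sketch (not part of the class above) ---
# `cg` solves H x = g for the natural-gradient step without forming the Hessian
# H explicitly; `Hx` supplies Hessian-vector products instead. The standalone
# check below runs the same conjugate-gradient recursion against an explicit
# symmetric positive-definite matrix so the result can be compared with a
# direct solve; names here are illustrative only.
import numpy as np


def _sketch_cg(matvec, b, iters=10, eps=1e-8):
    x = np.zeros_like(b)
    r = b.copy()            # residual; equals b because x starts at zero
    p = r.copy()
    r_dot_old = r @ r
    for _ in range(iters):
        z = matvec(p)
        alpha = r_dot_old / (p @ z + eps)
        x += alpha * p
        r -= alpha * z
        r_dot_new = r @ r
        if r_dot_new < 1e-10:  # converged; avoids dividing by a vanishing r_dot_old
            break
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x


_A = np.array([[4.0, 1.0], [1.0, 3.0]])   # symmetric positive-definite test matrix
_b = np.array([1.0, 2.0])
assert np.allclose(_sketch_cg(lambda v: _A @ v, _b), np.linalg.solve(_A, _b), atol=1e-6)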