class PG(make_on_policy_class(mode='share')):
    '''
    Policy Gradient (vanilla policy gradient with Monte-Carlo returns).
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 lr=5.0e-4,
                 epoch=5,
                 hidden_units={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.epoch = epoch
        # self.TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim], [1])
        if self.is_continuous:
            self.net = rls.actor_mu(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            self.log_std = tf.Variable(initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32), trainable=True)
            self.net_tv = self.net.trainable_variables + [self.log_std] + self.other_tv
        else:
            self.net = rls.actor_discrete(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.net_tv = self.net.trainable_variables + self.other_tv
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        self.model_recorder(dict(model=self.net, optimizer=self.optimizer))
        self.initialize_data_buffer()

    def show_logo(self):
        self.recorder.logger.info('''
    PG (Policy Gradient)
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            if self.is_continuous:
                mu = self.net(feat)
                sample_op, _ = gaussian_clip_rsample(mu, self.log_std)
            else:
                logits = self.net(feat)
                norm_dist = tfp.distributions.Categorical(logits)
                sample_op = norm_dist.sample()
        return sample_op, cell_state

    def calculate_statistics(self):
        # Monte-Carlo discounted return with no bootstrap value, normalized over the collected batch.
        self.data.cal_dc_r(self.gamma, 0., normalize=True)

    def learn(self, **kwargs):
        self.episode = kwargs['episode']

        def _train(data, crsty_loss, cell_state):
            for _ in range(self.epoch):
                loss, entropy = self.train(data, crsty_loss, cell_state)
            summaries = dict([
                ['LOSS/loss', loss],
                ['Statistics/entropy', entropy]
            ])
            return summaries

        self._learn(function_dict={
            'calculate_statistics': self.calculate_statistics,
            'train_function': _train,
            'train_data_list': ['s', 'visual_s', 'a', 'discounted_reward'],
            'summary_dict': dict([['LEARNING_RATE/lr', self.lr(self.episode)]])
        })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, crsty_loss, cell_state):
        s, visual_s, a, dc_r = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                if self.is_continuous:
                    mu = self.net(feat)
                    log_act_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    log_act_prob = tf.reduce_sum(tf.multiply(logp_all, a), axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                loss = -tf.reduce_mean(log_act_prob * dc_r) + crsty_loss
            loss_grads = tape.gradient(loss, self.net_tv)
            self.optimizer.apply_gradients(zip(loss_grads, self.net_tv))
            self.global_step.assign_add(1)
            return loss, entropy
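
# The PG update above weights each action's log-probability by the normalized
# discounted return produced by `data.cal_dc_r(self.gamma, 0., normalize=True)`.
# Below is a minimal, framework-free sketch of that return computation; the
# helper name and normalization details are illustrative assumptions, not the
# buffer's actual implementation.
import numpy as np


def discounted_returns_sketch(rewards, gamma=0.99, normalize=True, eps=1e-8):
    """Backward-accumulate discounted returns for a single trajectory."""
    returns = np.zeros_like(rewards, dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    if normalize:
        returns = (returns - returns.mean()) / (returns.std() + eps)
    return returns

# Example: a short 4-step trajectory with a single terminal reward.
# discounted_returns_sketch(np.array([0., 0., 0., 1.]), gamma=0.9)
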
class CEM(make_on_policy_class(mode='share')):
    '''
    Cross-Entropy Method
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 hidden_units=[32, 32],
                 frac=0.2,
                 init_var=1,
                 extra_std=1,
                 extra_decay_eps=200,
                 extra_var_last_multiplier=0.2,
                 envs_per_popu=5,    # environments per population member; the environment count must be divisible by this
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.frac = frac
        self.hidden_units = hidden_units
        self.init_var = init_var
        self.extra_std = extra_std
        self.extra_decay_eps = extra_decay_eps
        self.envs_per_popu = envs_per_popu
        self.extra_var_last_multiplier = extra_var_last_multiplier

    def show_logo(self):
        self.recorder.logger.info('''
    CEM (Cross-Entropy Method)
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        self._check_agents(s)
        a = [
            model(s_).numpy()
            for model, s_ in zip(self.cem_models, np.split(s, self.populations, axis=0))
        ]
        if self.is_continuous:
            a = np.vstack(a)
        else:
            a = np.hstack(a)
        return a

    @tf.function
    def _get_action(self, s, visual_s):
        s, visual_s = self.cast(s, visual_s)
        with tf.device(self.device):
            pass

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        self.returns += r * (1 - self.dones)
        self.dones += done

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        rets = self.returns.reshape(-1, self.envs_per_popu).mean(axis=-1)
        elites_idxs = rets.argsort()[-self.n_elite:]
        elites_weights = np.array(self.models_weights)[elites_idxs, :]
        self.mu = np.mean(elites_weights, axis=0)
        self.sigma = np.var(elites_weights, axis=0)
        self._update_models_weights()
        self._reset_variables()
        self.write_training_summaries(
            self.train_step,
            dict([['Statistics/mu', self.mu.mean()],
                  ['Statistics/sigma', self.sigma.mean()],
                  ['Statistics/sample_std', self.sample_std.mean()]]))

    def _check_agents(self, s):
        '''
        Lazily set the population size for this instance and initialize the related variables.
        params: the batch of states S, one state vector per agent in the environment.
        '''
        if not hasattr(self, 'populations'):
            assert s.shape[0] % self.envs_per_popu == 0, 'the number of environments must be divisible by envs_per_popu'
            self.populations = int(s.shape[0] / self.envs_per_popu)
            self._build()

    def _build(self):
        '''
        Build the population of models and initialize the CEM distribution parameters.
        '''
        self.n_elite = max(int(np.round(self.populations * self.frac)), 1)
        self.cem_models = [
            Model(self.s_dim, self.a_dim, self.hidden_units, self.is_continuous)
            for i in range(self.populations)
        ]
        self.mu = np.random.randn(self.cem_models[0].weights_total_nums)
        self.sigma = np.ones(self.cem_models[0].weights_total_nums) * self.init_var
        self._update_models_weights()
        self._reset_variables()

    def _reset_variables(self):
        '''
        Reset the per-iteration return accumulators and done flags.
        '''
        self.returns = np.zeros(self.populations * self.envs_per_popu, dtype=np.float32)
        self.dones = np.full(self.populations * self.envs_per_popu, False)

    def _update_models_weights(self):
        '''
        Resample weight vectors from the current (mu, sigma) distribution and assign them to the models.
        '''
        extra_var_multiplier = max((1.0 - self.train_step / self.extra_decay_eps), self.extra_var_last_multiplier)
        self.sample_std = np.sqrt(self.sigma + np.square(self.extra_std) * extra_var_multiplier)
        self.models_weights = [
            self.mu + self.sample_std * np.random.randn(self.mu.shape[0])
            for i in range(self.populations)
        ]
        [m.set_wb(wb) for m, wb in zip(self.cem_models, self.models_weights)]
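
# The CEM agent above treats each model's flattened weight vector as a sample
# from a diagonal Gaussian and refits (mu, sigma) to the elite fraction each
# iteration. A self-contained numpy sketch of that loop on a toy objective
# follows; the objective, sizes, and function name are illustrative assumptions.
import numpy as np


def cem_optimize_sketch(dim=8, populations=32, frac=0.2, iters=50, init_var=1.0, seed=0):
    rng = np.random.default_rng(seed)
    target = np.ones(dim)                                    # toy optimum: the all-ones vector
    mu, sigma = np.zeros(dim), np.ones(dim) * init_var
    n_elite = max(int(round(populations * frac)), 1)
    for _ in range(iters):
        std = np.sqrt(sigma)
        samples = mu + std * rng.standard_normal((populations, dim))
        returns = -np.square(samples - target).sum(axis=1)   # higher is better
        elite = samples[returns.argsort()[-n_elite:]]
        mu, sigma = elite.mean(axis=0), elite.var(axis=0)    # refit the sampling distribution to the elites
    return mu

# cem_optimize_sketch() should end close to the all-ones vector.
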
class AOC(make_on_policy_class(mode='share')):
    '''
    Asynchronous Advantage Option-Critic with Deliberation Cost (A2OC).
    When Waiting is not an Option: Learning Options with a Deliberation Cost, http://arxiv.org/abs/1709.04571
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 options_num=4,
                 dc=0.01,
                 terminal_mask=False,
                 eps=0.1,
                 epoch=4,
                 pi_beta=1.0e-3,
                 lr=5.0e-4,
                 lambda_=0.95,
                 epsilon=0.2,
                 value_epsilon=0.2,
                 kl_reverse=False,
                 kl_target=0.02,
                 kl_target_cutoff=2,
                 kl_target_earlystop=4,
                 kl_beta=[0.7, 1.3],
                 kl_alpha=1.5,
                 kl_coef=1.0,
                 hidden_units={
                     'share': [32, 32],
                     'q': [32, 32],
                     'intra_option': [32, 32],
                     'termination': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.pi_beta = pi_beta
        self.epoch = epoch
        self.lambda_ = lambda_
        self.epsilon = epsilon
        self.value_epsilon = value_epsilon
        self.kl_reverse = kl_reverse
        self.kl_target = kl_target
        self.kl_alpha = kl_alpha
        self.kl_coef = tf.constant(kl_coef, dtype=tf.float32)
        self.kl_cutoff = kl_target * kl_target_cutoff
        self.kl_stop = kl_target * kl_target_earlystop
        self.kl_low = kl_target * kl_beta[0]
        self.kl_high = kl_target * kl_beta[-1]
        self.options_num = options_num
        self.dc = dc
        self.terminal_mask = terminal_mask
        self.eps = eps
        self.net = rls.aoc_share(self.feat_dim, self.a_dim, self.options_num, hidden_units, self.is_continuous)
        if self.is_continuous:
            self.log_std = tf.Variable(
                initial_value=-0.5 * np.ones((self.options_num, self.a_dim), dtype=np.float32),
                trainable=True)   # [P, A]
            self.net_tv = self.net.trainable_variables + [self.log_std] + self.other_tv
        else:
            self.net_tv = self.net.trainable_variables + self.other_tv
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        self.model_recorder(dict(model=self.net, optimizer=self.optimizer))
        self.initialize_data_buffer(data_name_list=[
            's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value',
            'log_prob', 'beta_adv', 'last_options', 'options'
        ])

    def show_logo(self):
        self.recorder.logger.info('''
    AOC (Option-Critic with Deliberation Cost)
        ''')

    def reset(self):
        super().reset()
        self._done_mask = np.full(self.n_agents, True)

    def partial_reset(self, done):
        super().partial_reset(done)
        self._done_mask = done

    def _generate_random_options(self):
        return tf.constant(np.random.randint(0, self.options_num, self.n_agents), dtype=tf.int32)

    def choose_action(self, s, visual_s, evaluation=False):
        if not hasattr(self, 'options'):
            self.options = self._generate_random_options()
        self.last_options = self.options
        if not hasattr(self, 'oc_mask'):
            self.oc_mask = tf.constant(np.zeros(self.n_agents), dtype=tf.int32)

        a, value, log_prob, beta_adv, new_options, max_options, self.cell_state = self._get_action(
            s, visual_s, self.cell_state, self.options)
        a = a.numpy()
        new_options = tf.where(self._done_mask, max_options, new_options)
        self._done_mask = np.full(self.n_agents, False)
        self._value = np.squeeze(value.numpy())
        self._log_prob = np.squeeze(log_prob.numpy()) + 1e-10
        self._beta_adv = np.squeeze(beta_adv.numpy()) + self.dc
        self.oc_mask = (new_options == self.options).numpy()   # equal means the option did not change
        self.options = new_options
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state, options):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            q, pi, beta = self.net(feat)   # q: [B, P], pi: [B, P, A], beta: [B, P]
            options_onehot = tf.one_hot(options, self.options_num, dtype=tf.float32)   # [B, P]
            options_onehot_expanded = tf.expand_dims(options_onehot, axis=-1)          # [B, P, 1]
            pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1)                   # [B, A]
            if self.is_continuous:
                log_std = tf.gather(self.log_std, options)
                mu = pi
                sample_op, _ = gaussian_clip_rsample(mu, log_std)
                log_prob = gaussian_likelihood_sum(sample_op, mu, log_std)
            else:
                logits = pi
                norm_dist = tfp.distributions.Categorical(logits)
                sample_op = norm_dist.sample()
                log_prob = norm_dist.log_prob(sample_op)
            q_o = tf.reduce_sum(q * options_onehot, axis=-1)   # [B, ]
            beta_adv = q_o - ((1 - self.eps) * tf.reduce_max(q, axis=-1) + self.eps * tf.reduce_mean(q, axis=-1))   # [B, ]
            max_options = tf.cast(tf.argmax(q, axis=-1), dtype=tf.int32)   # [B, P] => [B, ]
            beta_probs = tf.reduce_sum(beta * options_onehot, axis=1)      # [B, P] => [B, ]
            beta_dist = tfp.distributions.Bernoulli(probs=beta_probs)
            new_options = tf.where(beta_dist.sample() < 1, options, max_options)   # sample < 1 keeps the current option; sample == 1 switches to the greedy option
        return sample_op, q_o, log_prob, beta_adv, new_options, max_options, cell_state

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "store_data need action type is np.ndarray"
        assert isinstance(r, np.ndarray), "store_data need reward type is np.ndarray"
        assert isinstance(done, np.ndarray), "store_data need done type is np.ndarray"
        self._running_average(s)
        r -= (1 - self.oc_mask) * self.dc   # subtract the deliberation cost whenever the option switched
        self.data.add(s, visual_s, a, r, s_, visual_s_, done, self._value, self._log_prob,
                      self._beta_adv, self.last_options, self.options)
        self.oc_mask = tf.zeros_like(self.oc_mask)

    @tf.function
    def _get_value(self, feat, options):
        options = tf.cast(options, tf.int32)
        with tf.device(self.device):
            options_onehot = tf.one_hot(options, self.options_num, dtype=tf.float32)   # [B, P]
            q, _, _ = self.net(feat)
            q_o = tf.reduce_sum(q * options_onehot, axis=-1)   # [B, ]
            return q_o

    def calculate_statistics(self):
        feat, self.cell_state = self.get_feature(self.data.last_s(),
                                                 self.data.last_visual_s(),
                                                 cell_state=self.cell_state,
                                                 record_cs=True)
        init_value = np.squeeze(self._get_value(feat, self.data.buffer['options'][-1]).numpy())
        self.data.cal_dc_r(self.gamma, init_value)
        self.data.cal_td_error(self.gamma, init_value)
        self.data.cal_gae_adv(self.lambda_, self.gamma)

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(data, crsty_loss, cell_state):
            early_step = 0
            for i in range(self.epoch):
                loss, pi_loss, q_loss, beta_loss, entropy, kl = self.train_share(
                    data, self.kl_coef, crsty_loss, cell_state)
                if kl > self.kl_stop:
                    early_step = i
                    break

            if kl > self.kl_high:
                self.kl_coef *= self.kl_alpha
            elif kl < self.kl_low:
                self.kl_coef /= self.kl_alpha

            summaries = dict([
                ['LOSS/loss', loss],
                ['LOSS/pi_loss', pi_loss],
                ['LOSS/q_loss', q_loss],
                ['LOSS/beta_loss', beta_loss],
                ['Statistics/kl', kl],
                ['Statistics/entropy', entropy],
                ['Statistics/kl_coef', self.kl_coef],
                ['Statistics/early_step', early_step],
            ])
            return summaries

        summary_dict = dict([['LEARNING_RATE/lr', self.lr(self.train_step)]])

        self._learn(function_dict={
            'calculate_statistics': self.calculate_statistics,
            'train_function': _train,
            'train_data_list': [
                's', 'visual_s', 'a', 'discounted_reward', 'log_prob', 'gae_adv',
                'value', 'beta_adv', 'last_options', 'options', 'done'
            ],
            'summary_dict': summary_dict
        })

    @tf.function(experimental_relax_shapes=True)
    def train_share(self, memories, kl_coef, crsty_loss, cell_state):
        s, visual_s, a, dc_r, old_log_prob, advantage, old_value, beta_advantage, last_options, options, done = memories
        last_options = tf.reshape(tf.cast(last_options, tf.int32), (-1,))   # [B, 1] => [B, ]
        options = tf.reshape(tf.cast(options, tf.int32), (-1,))
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                q, pi, beta = self.net(feat)   # q: [B, P], pi: [B, P, A], beta: [B, P]
                options_onehot = tf.one_hot(options, self.options_num, dtype=tf.float32)   # [B, P]
                options_onehot_expanded = tf.expand_dims(options_onehot, axis=-1)          # [B, P, 1]
                last_options_onehot = tf.one_hot(last_options, self.options_num, dtype=tf.float32)   # [B, ] => [B, P]
                pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1)           # [B, P, A] => [B, A]
                value = tf.reduce_sum(q * options_onehot, axis=1, keepdims=True)   # [B, 1]
                if self.is_continuous:
                    log_std = tf.gather(self.log_std, options)
                    mu = pi   # [B, A]
                    new_log_prob = gaussian_likelihood_sum(a, mu, log_std)
                    entropy = gaussian_entropy(log_std)
                else:
                    logits = pi   # [B, A]
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                ratio = tf.exp(new_log_prob - old_log_prob)
                if self.kl_reverse:
                    kl = tf.reduce_mean(new_log_prob - old_log_prob)
                else:
                    kl = tf.reduce_mean(old_log_prob - new_log_prob)   # a sample estimate of the KL-divergence, easy to compute
                surrogate = ratio * advantage

                value_clip = old_value + tf.clip_by_value(value - old_value, -self.value_epsilon, self.value_epsilon)
                td_error = dc_r - value
                td_error_clip = dc_r - value_clip
                td_square = tf.maximum(tf.square(td_error), tf.square(td_error_clip))

                pi_loss = -tf.reduce_mean(
                    tf.minimum(
                        surrogate,
                        tf.clip_by_value(ratio, 1.0 - self.epsilon, 1.0 + self.epsilon) * advantage))
                kl_loss = kl_coef * kl
                extra_loss = 1000.0 * tf.square(tf.maximum(0., kl - self.kl_cutoff))
                pi_loss = pi_loss + kl_loss + extra_loss
                q_loss = 0.5 * tf.reduce_mean(td_square)

                beta_s = tf.reduce_sum(beta * last_options_onehot, axis=-1, keepdims=True)   # [B, 1]
                beta_loss = beta_s * beta_advantage
                if self.terminal_mask:
                    # mask out the termination loss on terminal transitions
                    beta_loss *= (1 - tf.cast(done, tf.float32))
                beta_loss = tf.reduce_mean(beta_loss)

                loss = pi_loss + 1.0 * q_loss + beta_loss - self.pi_beta * entropy + crsty_loss
            loss_grads = tape.gradient(loss, self.net_tv)
            self.optimizer.apply_gradients(zip(loss_grads, self.net_tv))
            self.global_step.assign_add(1)
            return loss, pi_loss, q_loss, beta_loss, entropy, kl
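
# In AOC above, the termination ("beta") head is trained against an advantage
# A_beta = Q(s, o) - V_eps(s), where V_eps is the epsilon-greedy value over
# options, and the deliberation cost `dc` is added so terminating only pays off
# when another option is clearly better. A small numpy sketch of that quantity
# (function name, shapes, and numbers are illustrative, not the agent's tensors):
import numpy as np


def beta_advantage_sketch(q_row, option, eps=0.1, dc=0.01):
    """q_row: Q-values over options for one state, shape [options_num]."""
    q_o = q_row[option]
    v_eps = (1 - eps) * q_row.max() + eps * q_row.mean()   # epsilon-greedy value over options
    return (q_o - v_eps) + dc   # positive values push the termination probability down (keep the option)

# Example: option 1 is clearly dominated, so its advantage is negative and the
# gradient pushes its termination probability up.
# beta_advantage_sketch(np.array([1.0, 0.2, 0.5, 0.4]), option=1)
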
class A2C(make_on_policy_class(mode='share')):
    '''
    Advantage Actor-Critic (A2C).
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 epoch=5,
                 beta=1.0e-3,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 hidden_units={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'critic': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.beta = beta
        self.epoch = epoch
        # self.TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim], [1])
        if self.is_continuous:
            self.actor_net = rls.actor_mu(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            self.log_std = tf.Variable(initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32), trainable=True)
            self.actor_tv = self.actor_net.trainable_variables + [self.log_std]
        else:
            self.actor_net = rls.actor_discrete(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.actor_tv = self.actor_net.trainable_variables
        self.critic_net = rls.critic_v(self.feat_dim, hidden_units['critic'])
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(self.init_optimizer, [self.actor_lr, self.critic_lr])
        self.model_recorder(dict(actor=self.actor_net,
                                 critic=self.critic_net,
                                 optimizer_actor=self.optimizer_actor,
                                 optimizer_critic=self.optimizer_critic))
        self.initialize_data_buffer()

    def show_logo(self):
        self.recorder.logger.info('''
    A2C (Advantage Actor-Critic)
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        a, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = a.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            if self.is_continuous:
                mu = self.actor_net(feat)
                sample_op, _ = gaussian_clip_rsample(mu, self.log_std)
            else:
                logits = self.actor_net(feat)
                norm_dist = tfp.distributions.Categorical(logits)
                sample_op = norm_dist.sample()
        return sample_op, cell_state

    @tf.function
    def _get_value(self, feat):
        with tf.device(self.device):
            value = self.critic_net(feat)
            return value

    def calculate_statistics(self):
        feat, self.cell_state = self.get_feature(self.data.last_s(),
                                                 self.data.last_visual_s(),
                                                 cell_state=self.cell_state,
                                                 record_cs=True)
        init_value = np.squeeze(self._get_value(feat).numpy())
        self.data.cal_dc_r(self.gamma, init_value)

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(data, crsty_loss, cell_state):
            for _ in range(self.epoch):
                actor_loss, critic_loss, entropy = self.train(data, crsty_loss, cell_state)
            summaries = dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/entropy', entropy],
            ])
            return summaries

        self._learn(function_dict={
            'calculate_statistics': self.calculate_statistics,
            'train_function': _train,
            'train_data_list': ['s', 'visual_s', 'a', 'discounted_reward'],
            'summary_dict': dict([
                ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                ['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)]
            ])
        })

    @tf.function(experimental_relax_shapes=True)
    def train(self, memories, crsty_loss, cell_state):
        s, visual_s, a, dc_r = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                v = self.critic_net(feat)
                td_error = dc_r - v
                critic_loss = tf.reduce_mean(tf.square(td_error)) + crsty_loss
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    log_act_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    log_act_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                v = self.critic_net(feat)
                advantage = tf.stop_gradient(dc_r - v)
                actor_loss = -(tf.reduce_mean(log_act_prob * advantage) + self.beta * entropy)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return actor_loss, critic_loss, entropy

    @tf.function(experimental_relax_shapes=True)
    def train_persistent(self, memories, crsty_loss, cell_state):
        s, visual_s, a, dc_r = memories
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    log_act_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    log_act_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                v = self.critic_net(feat)
                advantage = tf.stop_gradient(dc_r - v)
                td_error = dc_r - v
                critic_loss = tf.reduce_mean(tf.square(td_error)) + crsty_loss
                actor_loss = -(tf.reduce_mean(log_act_prob * advantage) + self.beta * entropy)
            critic_grads = tape.gradient(critic_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_tv))
            self.global_step.assign_add(1)
            return actor_loss, critic_loss, entropy
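
# The A2C update above bootstraps the critic toward the discounted return and
# scales the policy gradient by the stop-gradient advantage dc_r - V(s). A tiny
# numpy walk-through of those two losses on a fabricated discrete batch
# (all numbers and the function name are made up for illustration):
import numpy as np


def a2c_losses_sketch(logits, actions_onehot, values, returns, beta=1e-3):
    logp_all = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))   # log-softmax
    log_act_prob = (logp_all * actions_onehot).sum(axis=1, keepdims=True)
    entropy = -(np.exp(logp_all) * logp_all).sum(axis=1).mean()
    advantage = returns - values                                            # treated as a constant for the actor
    critic_loss = np.mean((returns - values) ** 2)
    actor_loss = -(np.mean(log_act_prob * advantage) + beta * entropy)
    return actor_loss, critic_loss, entropy

# a2c_losses_sketch(logits=np.array([[0.2, -0.1], [0.5, 0.3]]),
#                   actions_onehot=np.array([[1., 0.], [0., 1.]]),
#                   values=np.array([[0.4], [0.1]]),
#                   returns=np.array([[1.0], [0.0]]))
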
class TRPO(make_on_policy_class(mode='share')):
    '''
    Trust Region Policy Optimization, https://arxiv.org/abs/1502.05477
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 beta=1.0e-3,
                 lr=5.0e-4,
                 delta=0.01,
                 lambda_=0.95,
                 cg_iters=10,
                 train_v_iters=10,
                 damping_coeff=0.1,
                 backtrack_iters=10,
                 backtrack_coeff=0.8,
                 epsilon=0.2,
                 critic_lr=1e-3,
                 hidden_units={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'critic': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.beta = beta
        self.delta = delta
        self.lambda_ = lambda_
        self.epsilon = epsilon
        self.cg_iters = cg_iters
        self.damping_coeff = damping_coeff
        self.backtrack_iters = backtrack_iters
        self.backtrack_coeff = backtrack_coeff
        self.train_v_iters = train_v_iters
        # self.actor_TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim], [1], [1])
        # self.critic_TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [1])
        if self.is_continuous:
            self.actor_net = rls.actor_mu(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
            self.log_std = tf.Variable(initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32), trainable=True)
            self.actor_tv = self.actor_net.trainable_variables + [self.log_std]
            # self.Hx_TensorSpecs = [tf.TensorSpec(shape=flat_concat(self.actor_tv).shape, dtype=tf.float32)] \
            #     + get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim], [self.a_dim])
        else:
            self.actor_net = rls.actor_discrete(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
            self.actor_tv = self.actor_net.trainable_variables
            # self.Hx_TensorSpecs = [tf.TensorSpec(shape=flat_concat(self.actor_tv).shape, dtype=tf.float32)] \
            #     + get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim])
        self.critic_net = rls.critic_v(self.feat_dim, hidden_units['critic'])
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        self.critic_lr = self.init_lr(critic_lr)
        self.optimizer_critic = self.init_optimizer(self.critic_lr)
        self.model_recorder(dict(actor=self.actor_net,
                                 critic=self.critic_net,
                                 optimizer_critic=self.optimizer_critic))
        if self.is_continuous:
            data_name_list = [
                's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value',
                'log_prob', 'old_mu', 'old_log_std'
            ]
        else:
            data_name_list = [
                's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value',
                'log_prob', 'old_logp_all'
            ]
        self.initialize_data_buffer(data_name_list=data_name_list)

    def show_logo(self):
        self.recorder.logger.info('''
    TRPO (Trust Region Policy Optimization)
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        a, _v, _lp, _morlpa, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = a.numpy()
        self._value = np.squeeze(_v.numpy())
        self._log_prob = np.squeeze(_lp.numpy()) + 1e-10
        if self.is_continuous:
            self._mu = _morlpa.numpy()
        else:
            self._logp_all = _morlpa.numpy()
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            value = self.critic_net(feat)
            if self.is_continuous:
                mu = self.actor_net(feat)
                sample_op, _ = gaussian_clip_rsample(mu, self.log_std)
                log_prob = gaussian_likelihood_sum(sample_op, mu, self.log_std)
                return sample_op, value, log_prob, mu, cell_state
            else:
                logits = self.actor_net(feat)
                logp_all = tf.nn.log_softmax(logits)
                norm_dist = tfp.distributions.Categorical(logits)
                sample_op = norm_dist.sample()
                log_prob = norm_dist.log_prob(sample_op)
                return sample_op, value, log_prob, logp_all, cell_state

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "store_data need action type is np.ndarray"
        assert isinstance(r, np.ndarray), "store_data need reward type is np.ndarray"
        assert isinstance(done, np.ndarray), "store_data need done type is np.ndarray"
        self._running_average(s)
        if self.is_continuous:
            self.data.add(s, visual_s, a, r, s_, visual_s_, done, self._value,
                          self._log_prob, self._mu, self.log_std.numpy())
        else:
            self.data.add(s, visual_s, a, r, s_, visual_s_, done, self._value,
                          self._log_prob, self._logp_all)

    @tf.function
    def _get_value(self, feat):
        with tf.device(self.device):
            value = self.critic_net(feat)
            return value

    def calculate_statistics(self):
        feat, self.cell_state = self.get_feature(self.data.last_s(),
                                                 self.data.last_visual_s(),
                                                 cell_state=self.cell_state,
                                                 record_cs=True)
        init_value = np.squeeze(self._get_value(feat).numpy())
        self.data.cal_dc_r(self.gamma, init_value)
        self.data.cal_td_error(self.gamma, init_value)
        self.data.cal_gae_adv(self.lambda_, self.gamma)

    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(data, crsty_loss, cell_state):
            if self.is_continuous:
                s, visual_s, a, dc_r, old_log_prob, advantage, old_mu, old_log_std = data
                Hx_args = (s, visual_s, old_mu, old_log_std)
            else:
                s, visual_s, a, dc_r, old_log_prob, advantage, old_logp_all = data
                Hx_args = (s, visual_s, old_logp_all)
            actor_loss, entropy, gradients = self.train_actor(
                (s, visual_s, a, old_log_prob, advantage), cell_state)

            x = self.cg(self.Hx, gradients.numpy(), Hx_args)
            x = tf.convert_to_tensor(x)
            alpha = np.sqrt(2 * self.delta / (np.dot(x, self.Hx(x, *Hx_args)) + 1e-8))
            for i in range(self.backtrack_iters):
                assign_params_from_flat(alpha * x * (self.backtrack_coeff**i), self.actor_tv)

            for _ in range(self.train_v_iters):
                critic_loss = self.train_critic((s, visual_s, dc_r), crsty_loss, cell_state)

            summaries = dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/entropy', entropy]
            ])
            return summaries

        if self.is_continuous:
            train_data_list = [
                's', 'visual_s', 'a', 'discounted_reward', 'log_prob', 'gae_adv',
                'old_mu', 'old_log_std'
            ]
        else:
            train_data_list = [
                's', 'visual_s', 'a', 'discounted_reward', 'log_prob', 'gae_adv',
                'old_logp_all'
            ]

        self._learn(function_dict={
            'calculate_statistics': self.calculate_statistics,
            'train_function': _train,
            'train_data_list': train_data_list,
            'summary_dict': dict([['LEARNING_RATE/critic_lr', self.critic_lr(self.train_step)]])
        })

    @tf.function(experimental_relax_shapes=True)
    def train_actor(self, memories, cell_state):
        s, visual_s, a, old_log_prob, advantage = memories
        with tf.device(self.device):
            feat = self.get_feature(s, visual_s, cell_state=cell_state)
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    new_log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                ratio = tf.exp(new_log_prob - old_log_prob)
                actor_loss = -tf.reduce_mean(ratio * advantage)
            actor_grads = tape.gradient(actor_loss, self.actor_tv)
            gradients = flat_concat(actor_grads)
            self.global_step.assign_add(1)
            return actor_loss, entropy, gradients

    @tf.function(experimental_relax_shapes=True)
    def Hx(self, x, *args):
        if self.is_continuous:
            s, visual_s, old_mu, old_log_std = args
        else:
            s, visual_s, old_logp_all = args
        with tf.device(self.device):
            with tf.GradientTape(persistent=True) as tape:
                feat = self.get_feature(s, visual_s)
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    var0, var1 = tf.exp(2 * self.log_std), tf.exp(2 * old_log_std)
                    pre_sum = 0.5 * (((old_mu - mu)**2 + var0) / (var1 + 1e-8) - 1) + old_log_std - self.log_std
                    all_kls = tf.reduce_sum(pre_sum, axis=1)
                    kl = tf.reduce_mean(all_kls)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    all_kls = tf.reduce_sum(tf.exp(old_logp_all) * (old_logp_all - logp_all), axis=1)
                    kl = tf.reduce_mean(all_kls)
                g = flat_concat(tape.gradient(kl, self.actor_tv))
                _g = tf.reduce_sum(g * x)
            hvp = flat_concat(tape.gradient(_g, self.actor_tv))
            if self.damping_coeff > 0:
                hvp += self.damping_coeff * x
            return hvp

    @tf.function(experimental_relax_shapes=True)
    def train_critic(self, memories, crsty_loss, cell_state):
        s, visual_s, dc_r = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                value = self.critic_net(feat)
                td_error = dc_r - value
                value_loss = tf.reduce_mean(tf.square(td_error)) + crsty_loss
            critic_grads = tape.gradient(value_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            return value_loss

    def cg(self, Ax, b, args):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """
        x = np.zeros_like(b)
        r = b.copy()  # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start.
        p = r.copy()
        r_dot_old = np.dot(r, r)
        for _ in range(self.cg_iters):
            z = Ax(tf.convert_to_tensor(p), *args)
            alpha = r_dot_old / (np.dot(p, z) + 1e-8)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x
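
# The `cg` routine above is a plain conjugate-gradient solver; TRPO uses it to
# solve H x = g where H is only available through Hessian-vector products (the
# `Hx` method). A quick self-contained numpy check of the same iteration on a
# small symmetric positive-definite matrix; the matrix, sizes, and function
# name are illustrative assumptions.
import numpy as np


def cg_sketch(A, b, iters=10, eps=1e-8):
    x = np.zeros_like(b)
    r = b.copy()                 # residual b - A @ x, with x = 0
    p = r.copy()
    r_dot_old = r @ r
    for _ in range(iters):
        z = A @ p                # in TRPO this is the Hessian-vector product Hx(p)
        alpha = r_dot_old / (p @ z + eps)
        x += alpha * p
        r -= alpha * z
        r_dot_new = r @ r
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x

# A = np.array([[4., 1.], [1., 3.]]); b = np.array([1., 2.])
# cg_sketch(A, b) is close to np.linalg.solve(A, b).
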
class PPO(make_on_policy_class(mode='share')):
    '''
    Proximal Policy Optimization, https://arxiv.org/abs/1707.06347
    Emergence of Locomotion Behaviours in Rich Environments, http://arxiv.org/abs/1707.02286, DPPO
    '''

    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 policy_epoch=4,
                 value_epoch=4,
                 beta=1.0e-3,
                 lr=5.0e-4,
                 lambda_=0.95,
                 epsilon=0.2,
                 value_epsilon=0.2,
                 share_net=True,
                 actor_lr=3e-4,
                 critic_lr=1e-3,
                 kl_reverse=False,
                 kl_target=0.02,
                 kl_target_cutoff=2,
                 kl_target_earlystop=4,
                 kl_beta=[0.7, 1.3],
                 kl_alpha=1.5,
                 kl_coef=1.0,
                 hidden_units={
                     'share': {
                         'continuous': {
                             'share': [32, 32],
                             'mu': [32, 32],
                             'v': [32, 32]
                         },
                         'discrete': {
                             'share': [32, 32],
                             'logits': [32, 32],
                             'v': [32, 32]
                         }
                     },
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'critic': [32, 32]
                 },
                 **kwargs):
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.beta = beta
        self.policy_epoch = policy_epoch
        self.value_epoch = value_epoch
        self.lambda_ = lambda_
        self.epsilon = epsilon
        self.value_epsilon = value_epsilon
        self.share_net = share_net
        self.kl_reverse = kl_reverse
        self.kl_target = kl_target
        self.kl_alpha = kl_alpha
        self.kl_coef = tf.constant(kl_coef, dtype=tf.float32)
        self.kl_cutoff = kl_target * kl_target_cutoff
        self.kl_stop = kl_target * kl_target_earlystop
        self.kl_low = kl_target * kl_beta[0]
        self.kl_high = kl_target * kl_beta[-1]

        if self.is_continuous:
            self.log_std = tf.Variable(initial_value=-0.5 * np.ones(self.a_dim, dtype=np.float32), trainable=True)
        if self.share_net:
            # self.TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim], [1], [1], [1])
            if self.is_continuous:
                self.net = rls.a_c_v_continuous(self.feat_dim, self.a_dim, hidden_units['share']['continuous'])
                self.net_tv = self.net.trainable_variables + [self.log_std] + self.other_tv
            else:
                self.net = rls.a_c_v_discrete(self.feat_dim, self.a_dim, hidden_units['share']['discrete'])
                self.net_tv = self.net.trainable_variables + self.other_tv
            self.lr = self.init_lr(lr)
            self.optimizer = self.init_optimizer(self.lr)
            self.model_recorder(dict(model=self.net, optimizer=self.optimizer))
        else:
            # self.actor_TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [self.a_dim], [1], [1])
            # self.critic_TensorSpecs = get_TensorSpecs([self.s_dim], self.visual_dim, [1])
            if self.is_continuous:
                self.actor_net = rls.actor_mu(self.feat_dim, self.a_dim, hidden_units['actor_continuous'])
                self.actor_net_tv = self.actor_net.trainable_variables + [self.log_std]
            else:
                self.actor_net = rls.actor_discrete(self.feat_dim, self.a_dim, hidden_units['actor_discrete'])
                self.actor_net_tv = self.actor_net.trainable_variables
            self.critic_net = rls.critic_v(self.feat_dim, hidden_units['critic'])
            self.critic_tv = self.critic_net.trainable_variables + self.other_tv
            self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
            self.optimizer_actor, self.optimizer_critic = map(self.init_optimizer, [self.actor_lr, self.critic_lr])
            self.model_recorder(dict(actor=self.actor_net,
                                     critic=self.critic_net,
                                     optimizer_actor=self.optimizer_actor,
                                     optimizer_critic=self.optimizer_critic))

        self.initialize_data_buffer(
            data_name_list=['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'value', 'log_prob'])

    def show_logo(self):
        self.recorder.logger.info('''
    PPO (Proximal Policy Optimization)
        ''')

    def choose_action(self, s, visual_s, evaluation=False):
        a, value, log_prob, self.cell_state = self._get_action(s, visual_s, self.cell_state)
        a = a.numpy()
        self._value = np.squeeze(value.numpy())
        self._log_prob = np.squeeze(log_prob.numpy()) + 1e-10
        return a

    @tf.function
    def _get_action(self, s, visual_s, cell_state):
        with tf.device(self.device):
            feat, cell_state = self.get_feature(s, visual_s, cell_state=cell_state, record_cs=True)
            if self.is_continuous:
                if self.share_net:
                    mu, value = self.net(feat)
                else:
                    mu = self.actor_net(feat)
                    value = self.critic_net(feat)
                sample_op, _ = gaussian_clip_rsample(mu, self.log_std)
                log_prob = gaussian_likelihood_sum(sample_op, mu, self.log_std)
            else:
                if self.share_net:
                    logits, value = self.net(feat)
                else:
                    logits = self.actor_net(feat)
                    value = self.critic_net(feat)
                norm_dist = tfp.distributions.Categorical(logits)
                sample_op = norm_dist.sample()
                log_prob = norm_dist.log_prob(sample_op)
        return sample_op, value, log_prob, cell_state

    def store_data(self, s, visual_s, a, r, s_, visual_s_, done):
        assert isinstance(a, np.ndarray), "store_data need action type is np.ndarray"
        assert isinstance(r, np.ndarray), "store_data need reward type is np.ndarray"
        assert isinstance(done, np.ndarray), "store_data need done type is np.ndarray"
        self.data.add(s, visual_s, a, r, s_, visual_s_, done, self._value, self._log_prob)

    @tf.function
    def _get_value(self, feat):
        with tf.device(self.device):
            if self.share_net:
                _, value = self.net(feat)
            else:
                value = self.critic_net(feat)
            return value

    def calculate_statistics(self):
        feat, self.cell_state = self.get_feature(self.data.last_s(),
                                                 self.data.last_visual_s(),
                                                 cell_state=self.cell_state,
                                                 record_cs=True)
        init_value = np.squeeze(self._get_value(feat).numpy())
        self.data.cal_dc_r(self.gamma, init_value)
        self.data.cal_td_error(self.gamma, init_value)
        self.data.cal_gae_adv(self.lambda_, self.gamma)

    # @show_graph(name='ppo_net')
    def learn(self, **kwargs):
        self.episode = kwargs['episode']

        def _train(data, crsty_loss, cell_state):
            early_step = 0
            if self.share_net:
                for i in range(self.policy_epoch):
                    actor_loss, critic_loss, entropy, kl = self.train_share(
                        data, self.kl_coef, crsty_loss, cell_state)
                    if kl > self.kl_stop:
                        early_step = i
                        break
            else:
                for i in range(self.policy_epoch):
                    s, visual_s, a, dc_r, old_log_prob, advantage, old_value = data
                    actor_loss, entropy, kl = self.train_actor(
                        (s, visual_s, a, old_log_prob, advantage), self.kl_coef, cell_state)
                    if kl > self.kl_stop:
                        early_step = i
                        break
                for _ in range(self.value_epoch):
                    critic_loss = self.train_critic((s, visual_s, dc_r, old_value), crsty_loss, cell_state)

            # https://github.com/joschu/modular_rl/blob/6970cde3da265cf2a98537250fea5e0c0d9a7639/modular_rl/ppo.py#L93
            if kl > self.kl_high:
                self.kl_coef *= self.kl_alpha
            elif kl < self.kl_low:
                self.kl_coef /= self.kl_alpha

            summaries = dict([
                ['LOSS/actor_loss', actor_loss],
                ['LOSS/critic_loss', critic_loss],
                ['Statistics/kl', kl],
                ['Statistics/kl_coef', self.kl_coef],
                ['Statistics/early_step', early_step],
                ['Statistics/entropy', entropy]
            ])
            return summaries

        if self.share_net:
            summary_dict = dict([['LEARNING_RATE/lr', self.lr(self.episode)]])
        else:
            summary_dict = dict([
                ['LEARNING_RATE/actor_lr', self.actor_lr(self.episode)],
                ['LEARNING_RATE/critic_lr', self.critic_lr(self.episode)]
            ])

        self._learn(function_dict={
            'calculate_statistics': self.calculate_statistics,
            'train_function': _train,
            'train_data_list': ['s', 'visual_s', 'a', 'discounted_reward', 'log_prob', 'gae_adv', 'value'],
            'summary_dict': summary_dict
        })

    @tf.function(experimental_relax_shapes=True)
    def train_share(self, memories, kl_coef, crsty_loss, cell_state):
        s, visual_s, a, dc_r, old_log_prob, advantage, old_value = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                if self.is_continuous:
                    mu, value = self.net(feat)
                    new_log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits, value = self.net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                ratio = tf.exp(new_log_prob - old_log_prob)

                # https://github.com/joschu/modular_rl/blob/6970cde3da265cf2a98537250fea5e0c0d9a7639/modular_rl/ppo.py#L40
                if self.kl_reverse:
                    kl = tf.reduce_mean(new_log_prob - old_log_prob)
                else:
                    kl = tf.reduce_mean(old_log_prob - new_log_prob)   # a sample estimate of the KL-divergence, easy to compute
                surrogate = ratio * advantage

                # https://github.com/llSourcell/OpenAI_Five_vs_Dota2_Explained/blob/c5def7e57aa70785c2394ea2eeb3e5f66ad59a53/train.py#L154
                value_clip = old_value + tf.clip_by_value(value - old_value, -self.value_epsilon, self.value_epsilon)
                td_error = dc_r - value
                td_error_clip = dc_r - value_clip
                td_square = tf.maximum(tf.square(td_error), tf.square(td_error_clip))

                pi_loss = -tf.reduce_mean(
                    tf.minimum(
                        surrogate,
                        tf.clip_by_value(ratio, 1.0 - self.epsilon, 1.0 + self.epsilon) * advantage))
                kl_loss = kl_coef * kl
                extra_loss = 1000.0 * tf.square(tf.maximum(0., kl - self.kl_cutoff))
                actor_loss = pi_loss + kl_loss + extra_loss
                value_loss = 0.5 * tf.reduce_mean(td_square)
                loss = actor_loss + 1.0 * value_loss - self.beta * entropy + crsty_loss
            loss_grads = tape.gradient(loss, self.net_tv)
            self.optimizer.apply_gradients(zip(loss_grads, self.net_tv))
            self.global_step.assign_add(1)
            return actor_loss, value_loss, entropy, kl

    @tf.function(experimental_relax_shapes=True)
    def train_actor(self, memories, kl_coef, cell_state):
        s, visual_s, a, old_log_prob, advantage = memories
        with tf.device(self.device):
            feat = self.get_feature(s, visual_s, cell_state=cell_state)
            with tf.GradientTape() as tape:
                if self.is_continuous:
                    mu = self.actor_net(feat)
                    new_log_prob = gaussian_likelihood_sum(a, mu, self.log_std)
                    entropy = gaussian_entropy(self.log_std)
                else:
                    logits = self.actor_net(feat)
                    logp_all = tf.nn.log_softmax(logits)
                    new_log_prob = tf.reduce_sum(a * logp_all, axis=1, keepdims=True)
                    entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(logp_all) * logp_all, axis=1, keepdims=True))
                ratio = tf.exp(new_log_prob - old_log_prob)
                kl = tf.reduce_mean(old_log_prob - new_log_prob)
                surrogate = ratio * advantage
                min_adv = tf.where(advantage > 0, (1 + self.epsilon) * advantage, (1 - self.epsilon) * advantage)
                pi_loss = -(tf.reduce_mean(tf.minimum(surrogate, min_adv)) + self.beta * entropy)
                kl_loss = kl_coef * kl
                extra_loss = 1000.0 * tf.square(tf.maximum(0., kl - self.kl_cutoff))
                actor_loss = pi_loss + kl_loss + extra_loss
            actor_grads = tape.gradient(actor_loss, self.actor_net_tv)
            self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor_net_tv))
            self.global_step.assign_add(1)
            return actor_loss, entropy, kl

    @tf.function(experimental_relax_shapes=True)
    def train_critic(self, memories, crsty_loss, cell_state):
        s, visual_s, dc_r, old_value = memories
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                feat = self.get_feature(s, visual_s, cell_state=cell_state)
                value = self.critic_net(feat)
                value_clip = old_value + tf.clip_by_value(value - old_value, -self.value_epsilon, self.value_epsilon)
                td_error = dc_r - value
                td_error_clip = dc_r - value_clip
                td_square = tf.maximum(tf.square(td_error), tf.square(td_error_clip))
                value_loss = 0.5 * tf.reduce_mean(td_square) + crsty_loss
            critic_grads = tape.gradient(value_loss, self.critic_tv)
            self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic_tv))
            return value_loss
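
# PPO's two clipped objectives above (policy-ratio clipping and value clipping)
# in a compact numpy form, handy for sanity-checking signs and shapes. Inputs
# and the function name are fabricated for illustration; epsilon values mirror
# the defaults used above.
import numpy as np


def ppo_losses_sketch(new_logp, old_logp, advantage, value, old_value, returns,
                      epsilon=0.2, value_epsilon=0.2):
    ratio = np.exp(new_logp - old_logp)
    surrogate = ratio * advantage
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
    pi_loss = -np.mean(np.minimum(surrogate, clipped))

    value_clip = old_value + np.clip(value - old_value, -value_epsilon, value_epsilon)
    td_square = np.maximum(np.square(returns - value), np.square(returns - value_clip))
    value_loss = 0.5 * np.mean(td_square)
    return pi_loss, value_loss

# ppo_losses_sketch(new_logp=np.array([-0.9, -1.2]), old_logp=np.array([-1.0, -1.0]),
#                   advantage=np.array([0.5, -0.3]), value=np.array([0.6, 0.2]),
#                   old_value=np.array([0.5, 0.4]), returns=np.array([1.0, 0.0]))
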