def apply(
    self,
    x,
    action_dim,
    max_action,
    key=None,
    MPO=False,
    sample=False,
    log_sig_min=-20,
    log_sig_max=2,
):
    x = nn.Dense(x, features=200)
    x = nn.LayerNorm(x)
    x = nn.tanh(x)
    x = nn.Dense(x, features=200)
    x = nn.elu(x)
    x = nn.Dense(x, features=2 * action_dim)
    mu, log_sig = jnp.split(x, 2, axis=-1)
    log_sig = nn.softplus(log_sig)
    log_sig = jnp.clip(log_sig, log_sig_min, log_sig_max)

    if MPO:
        return mu, log_sig

    if not sample:
        return max_action * nn.tanh(mu), log_sig
    else:
        pi = mu + random.normal(key, mu.shape) * jnp.exp(log_sig)
        log_pi = gaussian_likelihood(pi, mu, log_sig)
        pi = nn.tanh(pi)
        log_pi -= jnp.sum(jnp.log(nn.relu(1 - pi ** 2) + 1e-6), axis=1)
        return max_action * pi, log_pi
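# gaussian_likelihood is used above but not defined in the snippet. A minimal
# sketch of the assumed helper: the log-density of a diagonal Gaussian with
# mean mu and log standard deviation log_sig, summed over the last axis so it
# returns one value per sample. The exact signature and epsilon are assumptions.
import jax.numpy as jnp


def gaussian_likelihood(x, mu, log_sig):
    pre_sum = -0.5 * (((x - mu) / (jnp.exp(log_sig) + 1e-6)) ** 2
                      + 2.0 * log_sig + jnp.log(2.0 * jnp.pi))
    return jnp.sum(pre_sum, axis=-1)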
def _create_continuous_trainer(self, optimizer=tf.train.AdamOptimizer()):
    """ Creates a function for vanilla policy training with a continuous action space """
    self.act_holders = tf.placeholder(
        tf.float32, shape=[None, self.out_op.shape[1].value])
    self.reward_holders = tf.placeholder(tf.float32, shape=[None])
    self.std = tf.Variable(0.5 * np.ones(shape=self.out_op.shape[1].value),
                           dtype=tf.float32)
    self.out_act = self.out_op + tf.random_normal(
        tf.shape(self.out_op), dtype=tf.float32) * self.std
    self.log_probs = gaussian_likelihood(self.act_holders, self.out_op,
                                         self.std)
    self.loss = -tf.reduce_mean(self.log_probs * self.reward_holders)
    self.optimizer = optimizer
    self.update = self.optimizer.minimize(self.loss)

    update_func = lambda train_data: self.sess.run(
        self.update,
        feed_dict={
            self.in_op: reshape_train_var(train_data[:, 0]),
            self.act_holders: reshape_train_var(train_data[:, 1]),
            self.reward_holders: train_data[:, 2]
        })

    self.sess.run(tf.global_variables_initializer())
    return update_func
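# The TensorFlow snippets rely on a graph-building version of
# gaussian_likelihood. A sketch of the commonly used TF1 form follows; note
# that the call above passes self.std (a plain standard deviation initialized
# at 0.5), while other snippets pass a log standard deviation, so which
# parameterization the real helper expects is an assumption.
import numpy as np
import tensorflow as tf


def gaussian_likelihood(x, mu, log_std):
    pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + 1e-8)) ** 2
                      + 2.0 * log_std + np.log(2.0 * np.pi))
    return tf.reduce_sum(pre_sum, axis=1)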
def train(self, state, action, reward, next_state, done):
    next_mu, next_log_std = self.actor(next_state)
    next_action = tf.random.normal(tf.shape(next_mu), next_mu,
                                   tf.math.exp(next_log_std))
    next_log_prob = gaussian_likelihood(next_action, next_mu, next_log_std)

    next_target_q1 = self.target_critic1(next_state, next_action)
    next_target_q2 = self.target_critic2(next_state, next_action)
    min_next_target_q = tf.math.minimum(
        next_target_q1, next_target_q2) - self.alpha * next_log_prob
    target_q = reward + (1. - done) * self.gamma * min_next_target_q

    with tf.GradientTape(persistent=True) as tape:
        # Critic
        q1 = self.critic1(state, action)
        q2 = self.critic2(state, action)
        critic1_loss = tf.reduce_mean(tf.keras.losses.mse(target_q, q1))
        critic2_loss = tf.reduce_mean(tf.keras.losses.mse(target_q, q2))

        # Actor
        min_q = tf.math.minimum(q1, q2)
        mu, log_std = self.actor(state)
        log_prob = gaussian_likelihood(action, mu, log_std)
        actor_loss = tf.reduce_mean(self.alpha * log_prob - min_q)

        # Alpha
        alpha_loss = -tf.reduce_mean(self.log_alpha * log_prob)

    critic1_grads = tape.gradient(critic1_loss, self.critic1.trainable_weights)
    self.critic1_optimizer.apply_gradients(
        zip(critic1_grads, self.critic1.trainable_weights))
    critic2_grads = tape.gradient(critic2_loss, self.critic2.trainable_weights)
    self.critic2_optimizer.apply_gradients(
        zip(critic2_grads, self.critic2.trainable_weights))

    actor_grads = tape.gradient(actor_loss, self.actor.trainable_weights)
    self.actor_optimizer.apply_gradients(
        zip(actor_grads, self.actor.trainable_weights))

    if self.use_dynamic_alpha:
        alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
        self.alpha_optimizer.apply_gradients([(alpha_grad, self.log_alpha)])

    return actor_loss, critic1_loss, critic2_loss, alpha_loss
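# The train step above updates only the online networks; SAC implementations
# usually follow it with a Polyak (exponential moving average) refresh of the
# target critics. A minimal sketch assuming Keras models and a smoothing
# coefficient self.tau (tau is an assumed attribute, not shown above):
def update_targets(self):
    for online, target in ((self.critic1, self.target_critic1),
                           (self.critic2, self.target_critic2)):
        target.set_weights([
            self.tau * w_online + (1.0 - self.tau) * w_target
            for w_online, w_target in zip(online.get_weights(),
                                          target.get_weights())
        ])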
def loss_fn(mlo, slo, actor):
    mu, log_sig = actor(state, MPO=True)
    sig = jnp.exp(log_sig)
    target_mu, target_log_sig = actor_target(state, MPO=True)
    target_sig = jnp.exp(target_log_sig)

    actor_log_prob = gaussian_likelihood(sampled_actions, target_mu, sig)
    actor_log_prob += gaussian_likelihood(sampled_actions, mu, target_sig)
    actor_log_prob = actor_log_prob.transpose((0, 1))

    # squash both the online and the target mean before the KL regularizers
    mu, target_mu = nn.tanh(mu), nn.tanh(target_mu)

    reg_mu = eps_mu - kl_mvg_diag(target_mu, target_sig, mu, target_sig).mean()
    reg_sig = eps_sig - kl_mvg_diag(target_mu, target_sig, target_mu,
                                    sig).mean()

    mlo = lagrange_step(mlo, reg_mu)
    slo = lagrange_step(slo, reg_sig)

    actor_loss = -(actor_log_prob[:, None] * weights).sum(axis=1).mean()
    actor_loss -= mu_lagrange_optimizer.target() * reg_mu
    actor_loss -= sig_lagrange_optimizer.target() * reg_sig

    return actor_loss.mean(), (mlo, slo)
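# kl_mvg_diag is used above but not defined in the snippet. A sketch of the
# assumed helper: the closed-form KL divergence between two multivariate
# Gaussians with diagonal covariance, KL(N(mu_1, diag(sig_1^2)) ||
# N(mu_2, diag(sig_2^2))), summed over the action dimension. The argument
# order (mu_1, sig_1, mu_2, sig_2) is inferred from the calls above.
import jax.numpy as jnp


def kl_mvg_diag(mu_1, sig_1, mu_2, sig_2):
    var_1, var_2 = sig_1 ** 2, sig_2 ** 2
    kl = (jnp.log(sig_2 / sig_1)
          + (var_1 + (mu_1 - mu_2) ** 2) / (2.0 * var_2)
          - 0.5)
    return jnp.sum(kl, axis=-1)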
        kernel_initializer=initializationHidden,
        name="fc{}".format(i + 2))(curNode)

actionMeanOp = tf.layers.Dense(
    outputLength,
    kernel_initializer=initializationFinalPolicy,
    name="outputA")(curNode)
actionLogStdOp = tf.get_variable(
    name="ActionsLogStdDetachedTrainable",
    initializer=-0.3 * np.ones((1, outputLength), dtype=np.float32),
    trainable=True)
actionStdOp = tf.math.exp(actionLogStdOp)
actionFinalOp = actionMeanOp + tf.random_normal(
    tf.shape(actionMeanOp)) * actionStdOp

sampledLogProbsOp = utils.gaussian_likelihood(actionFinalOp, actionMeanOp,
                                              actionLogStdOp)
logProbWithCurrParamsOp = utils.gaussian_likelihood(
    aPh, actionMeanOp, actionLogStdOp)

# definition of losses to optimize
ratio = tf.exp(logProbWithCurrParamsOp - logProbSampPh)
Lloss = -tf.reduce_mean(
    ratio * advPh)  # - sign because we want to maximize our objective

if args.val_eps > 0:
    vLossUncliped = (vfOutputOp - totalEstimatedDiscountedRewardPh)**2
    vClipped = VPrevPh + tf.clip_by_value(vfOutputOp - VPrevPh,
                                          -args.val_eps, args.val_eps)
    vLossClipped = (vClipped - totalEstimatedDiscountedRewardPh)**2
    vLossMax = tf.maximum(vLossClipped, vLossUncliped)
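    # A typical continuation of the clipped value loss above (an assumption,
    # not copied from this code base): reduce the element-wise losses to a
    # scalar objective.
    vLoss = tf.reduce_mean(vLossMax)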
def _create_continuous_trainer(self):
    """ Creates a function for PPO-style policy training with a continuous action space """
    # First pass
    self.act_holders = tf.placeholder(
        tf.float32, shape=[None, self.out_op.shape[1].value])
    self.reward_holders = tf.placeholder(tf.float32, shape=[None])
    self.std = tf.Variable(0.5 * np.ones(shape=self.out_op.shape[1].value),
                           dtype=tf.float32)
    self.out_act = self.out_op + tf.random_normal(
        tf.shape(self.out_op), dtype=tf.float32) * self.std
    self.log_probs = gaussian_likelihood(self.act_holders, self.out_op,
                                         self.std)
    self.advantages = self.reward_holders - tf.squeeze(self.value_out_op)

    # Second pass
    self.advatange_holders = tf.placeholder(dtype=tf.float32,
                                            shape=self.advantages.shape)
    self.old_prob_holders = tf.placeholder(dtype=tf.float32,
                                           shape=self.log_probs.shape)
    self.policy_ratio = tf.exp(self.log_probs - self.old_prob_holders)
    self.clipped_ratio = tf.clip_by_value(self.policy_ratio,
                                          1 - self.clip_val,
                                          1 + self.clip_val)
    self.min_loss = tf.minimum(self.policy_ratio * self.advatange_holders,
                               self.clipped_ratio * self.advatange_holders)
    self.optimizer = tf.train.AdamOptimizer()

    # Actor update
    self.kl_divergence = tf.reduce_mean(self.old_prob_holders -
                                        self.log_probs)
    self.actor_loss = -tf.reduce_mean(self.min_loss)
    self.actor_update = self.optimizer.minimize(self.actor_loss)

    # Value update
    self.value_loss = tf.reduce_mean(
        tf.square(self.reward_holders - tf.squeeze(self.value_out_op)))
    self.value_update = self.optimizer.minimize(self.value_loss)

    # Combined update
    self.entropy = -0.5 * tf.reduce_mean(
        tf.log(2 * np.pi * np.e * self.std))
    self.combined_loss = (self.actor_loss + self.v_coef * self.value_loss +
                          self.entropy_coef * self.entropy)
    self.combined_update = self.optimizer.minimize(self.combined_loss)

    def update_func(train_data):
        self.old_probs, self.old_advantages = self.sess.run(
            [self.log_probs, self.advantages],
            feed_dict={
                self.in_op: reshape_train_var(train_data[:, 0]),
                self.act_holders: reshape_train_var(train_data[:, 1]),
                self.reward_holders: train_data[:, 2]
            })
        for i in range(self.ppo_iters):
            kl_div, _ = self.sess.run(
                [self.kl_divergence, self.combined_update],
                feed_dict={
                    self.in_op: reshape_train_var(train_data[:, 0]),
                    self.act_holders: reshape_train_var(train_data[:, 1]),
                    self.reward_holders: train_data[:, 2],
                    self.old_prob_holders: self.old_probs,
                    self.advatange_holders: self.old_advantages
                })
            if kl_div > 1.5 * self.target_kl:
                break
        return kl_div, self.sess.run(self.entropy)

    self.sess.run(tf.global_variables_initializer())
    return update_func
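# reshape_train_var is used in the feed dicts above but not defined here. A
# sketch of the assumed helper: it stacks a column sliced out of an
# object-dtype batch array into a regular float array that TensorFlow accepts.
import numpy as np


def reshape_train_var(column):
    return np.stack([np.asarray(v, dtype=np.float32) for v in column])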
def _createDefault(self):
    with tf.variable_scope("PolicyNetworkContinuous{}".format(self.suffix)):
        if not self.orthogonalInitializtion:
            curNode = tf.layers.Dense(
                self.hiddenLayers[0],
                self.hiddenLayerActivations[0],
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name="fc1")(self.input)
            #curNode = tf.contrib.layers.layer_norm(curNode)

            for i, l in enumerate(self.hiddenLayers[1:]):
                curNode = tf.layers.Dense(
                    l,
                    self.hiddenLayerActivations[i + 1],
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    name="fc{}".format(i + 2))(curNode)
                #curNode = tf.contrib.layers.layer_norm(curNode)

            self.actionMean = tf.layers.Dense(
                self.outputLength,
                self.hiddenLayerActivations[-1],
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name="ActionsMean")(curNode)
        else:
            curNode = tf.layers.Dense(
                self.hiddenLayers[0],
                self.hiddenLayerActivations[0],
                kernel_initializer=tf.orthogonal_initializer(
                    self.orthogonalInitializtion[0]),
                name="fc1")(self.input)
            #curNode = tf.contrib.layers.layer_norm(curNode)

            for i, l in enumerate(self.hiddenLayers[1:]):
                curNode = tf.layers.Dense(
                    l,
                    self.hiddenLayerActivations[i + 1],
                    kernel_initializer=tf.orthogonal_initializer(
                        self.orthogonalInitializtion[i + 1]),
                    name="fc{}".format(i + 2))(curNode)
                #curNode = tf.contrib.layers.layer_norm(curNode)

            self.actionMean = tf.layers.Dense(
                self.outputLength,
                self.hiddenLayerActivations[-1],
                kernel_initializer=tf.orthogonal_initializer(
                    self.orthogonalInitializtion[-1]),
                name="ActionsMean")(curNode)

        if self.actionMeanScale is not None:
            assert (self.actionMeanScale.shape == (1, self.outputLength))
            self.actionMean = self.actionMean * self.actionMeanScale

        # logic for the noise that is added to the action mean
        if self.logStdInit is not None:
            assert (self.logStdInit.shape == (1, self.outputLength))
            self.actionLogStd = tf.get_variable(
                name="ActionsLogStdDetached{}Trainable".format(
                    "" if self.logStdTrainable else "Non"),
                initializer=self.logStdInit,
                trainable=self.logStdTrainable)
        else:
            if not self.orthogonalInitializtion:
                self.actionLogStd = tf.layers.Dense(
                    self.outputLength,
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    name="ActionsLogStd")(curNode)
            else:
                self.actionLogStd = tf.layers.Dense(
                    self.outputLength,
                    kernel_initializer=tf.orthogonal_initializer(
                        self.orthogonalInitializtion[-1]),
                    name="ActionsLogStd")(curNode)

        if self.clipLogStd is not None:
            self.actionLogStd = tf.clip_by_value(self.actionLogStd,
                                                 self.clipLogStd[0],
                                                 self.clipLogStd[1],
                                                 name="ClipedActionsLogStd")

        # here we actually add the noise
        if self.actionLogStd is not None:
            self.actionStd = tf.math.exp(self.actionLogStd)
            self.actionRaw = self.actionMean + tf.random_normal(
                tf.shape(self.actionMean)) * self.actionStd
        else:
            self.actionRaw = self.actionMean

        # action clip
        if self.actionClip is not None:
            assert (self.actionClip.shape == (2, self.outputLength))
            # clip the raw (noisy) action into the allowed range
            self.actionFinal = tf.clip_by_value(self.actionRaw,
                                                self.actionClip[0, :],
                                                self.actionClip[1, :])
        else:
            self.actionFinal = self.actionRaw

        # if adding std to the action mean, operations for action probabilities
        if self.actionLogStd is not None:
            self.sampledLogProbs = utils.gaussian_likelihood(
                self.actionFinal, self.actionMean, self.actionLogStd)
            # joint log prob (all action components are Gaussian) of an action
            # given the observation (both fed via placeholders)
            self.logProbWithCurrParams = utils.gaussian_likelihood(
                self.actions, self.actionMean, self.actionLogStd)