    vLossUncliped = (vfOutputOp - totalEstimatedDiscountedRewardPh)**2
    # keep the new value prediction within val_eps of the previous prediction
    vClipped = VPrevPh + tf.clip_by_value(vfOutputOp - VPrevPh,
                                          -args.val_eps, args.val_eps)
    vLossClipped = (vClipped - totalEstimatedDiscountedRewardPh)**2
    # pessimistic element-wise maximum of clipped and unclipped losses
    vLossMax = tf.maximum(vLossClipped, vLossUncliped)
    stateValueLoss = tf.reduce_mean(0.5 * vLossMax)
else:
    # plain mean squared error between value prediction and empirical return
    stateValueLoss = tf.reduce_mean(
        (vfOutputOp - totalEstimatedDiscountedRewardPh)**2)

# KL divergence between the old and the current policy
if discreteActionsSpace:
    KLcontraint = utils.categorical_kl(logProbWithCurrParamsOp, logProbsAllPh)
else:
    KLcontraint = utils.diagonal_gaussian_kl(actionMeanOp, actionLogStdOp,
                                             oldActionMeanPh, oldActionLogStdPh)

# state-value optimizer, optionally with an annealed learning rate
if args.lr_annealing:
    optimizer = tf.train.AdamOptimizer(learning_rate=learningRatePh,
                                       epsilon=args.adam_eps)
else:
    optimizer = tf.train.AdamOptimizer(
        learning_rate=args.learning_rate_state_value, epsilon=args.adam_eps)

if args.grad_clip > 0:
    # clip value-function gradients by global norm before applying them
    valGradients, valVaribales = zip(
        *optimizer.compute_gradients(stateValueLoss))
    valGradients, _ = tf.clip_by_global_norm(valGradients, args.grad_clip)
    svfOptimizationStep = optimizer.apply_gradients(
        zip(valGradients, valVaribales))
else:
    svfOptimizationStep = optimizer.minimize(stateValueLoss)
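# Illustration only (not part of the training graph): a minimal NumPy sketch of
# the clipped value loss constructed above, assuming 1-D arrays of new value
# predictions, old (pre-update) predictions, and empirical returns; the function
# and argument names below are illustrative and do not appear in the original code.
import numpy as np

def _clipped_value_loss_sketch(v_pred, v_old, returns, eps):
    # unclipped squared error between the new prediction and the return
    loss_unclipped = (v_pred - returns) ** 2
    # keep the new prediction within +/- eps of the old prediction
    v_clipped = v_old + np.clip(v_pred - v_old, -eps, eps)
    loss_clipped = (v_clipped - returns) ** 2
    # pessimistic element-wise maximum, halved and averaged, as in the graph above
    return 0.5 * np.mean(np.maximum(loss_unclipped, loss_clipped))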
if args.plus and args.plus_eps > 0:
    vLossUncliped = (V.output - totalEstimatedDiscountedRewardPh)**2
    # keep the new value prediction within plus_eps of the previous prediction
    vClipped = VPrevPh + tf.clip_by_value(V.output - VPrevPh,
                                          -args.plus_eps, args.plus_eps)
    vLossClipped = (vClipped - totalEstimatedDiscountedRewardPh)**2
    # pessimistic element-wise maximum of clipped and unclipped losses
    vLossMax = tf.maximum(vLossClipped, vLossUncliped)
    stateValueLoss = tf.reduce_mean(0.5 * vLossMax)
else:
    # plain mean squared error between value prediction and empirical return
    stateValueLoss = tf.reduce_mean(
        (V.output - totalEstimatedDiscountedRewardPh)**2)

# KL divergence between the old and the current policy
if discreteActionsSpace:
    KLcontraint = utils.categorical_kl(policy.logProbs, logProbsAllPh)
else:
    KLcontraint = utils.diagonal_gaussian_kl(policy.actionMean, policy.actionLogStd,
                                             oldActionMeanPh, oldActionLogStdPh)

if args.plus_plus and args.plus_plus_grad_clip >= 0:
    # clip value-function gradients by global norm before applying them
    optimizer = tf.train.AdamOptimizer(learning_rate=learningRatePh)
    valGradients, valVaribales = zip(
        *optimizer.compute_gradients(stateValueLoss))
    valGradients, _ = tf.clip_by_global_norm(valGradients,
                                             args.plus_plus_grad_clip)
    svfOptimizationStep = optimizer.apply_gradients(
        zip(valGradients, valVaribales))
else:
    svfOptimizationStep = tf.train.AdamOptimizer(
        args.learning_rate_state_value).minimize(stateValueLoss)

# other ops