Example 1
        # PPO-style clipped value loss: penalize the worse of the unclipped and
        # clipped squared errors against the discounted-return target.
        vLossUncliped = (vfOutputOp - totalEstimatedDiscountedRewardPh)**2
        vClipped = VPrevPh + tf.clip_by_value(vfOutputOp - VPrevPh,
                                              -args.val_eps, args.val_eps)
        vLossClipped = (vClipped - totalEstimatedDiscountedRewardPh)**2
        vLossMax = tf.maximum(vLossClipped, vLossUncliped)
        stateValueLoss = tf.reduce_mean(0.5 * vLossMax)
    else:
        # Plain mean-squared-error value loss when clipping is disabled.
        stateValueLoss = tf.reduce_mean(
            (vfOutputOp - totalEstimatedDiscountedRewardPh)**2)

    # Analytic KL divergence between the updated policy and the old policy,
    # for categorical or diagonal-Gaussian action distributions.
    if discreteActionsSpace:
        KLcontraint = utils.categorical_kl(logProbWithCurrParamsOp,
                                           logProbsAllPh)
    else:
        KLcontraint = utils.diagonal_gaussian_kl(actionMeanOp, actionLogStdOp,
                                                 oldActionMeanPh,
                                                 oldActionLogStdPh)

    # Adam optimizer for the value function; with lr_annealing the learning
    # rate is fed through a placeholder so it can be decayed during training.
    if args.lr_annealing:
        optimizer = tf.train.AdamOptimizer(learning_rate=learningRatePh,
                                           epsilon=args.adam_eps)
    else:
        optimizer = tf.train.AdamOptimizer(
            learning_rate=args.learning_rate_state_value,
            epsilon=args.adam_eps)

    # Optionally clip the value-function gradients by global norm before
    # applying the update.
    if args.grad_clip > 0:
        valGradients, valVaribales = zip(
            *optimizer.compute_gradients(stateValueLoss))
        valGradients, _ = tf.clip_by_global_norm(valGradients, args.grad_clip)
        svfOptimizationStep = optimizer.apply_gradients(
            zip(valGradients, valVaribales))
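The block above builds a PPO-style clipped value-function loss: the squared error is computed both from the raw value prediction and from a prediction clipped to stay within val_eps of the previous value estimate, and the larger of the two is minimized. Below is a minimal NumPy sketch of that computation; the function and array names are illustrative and not part of the original code.

import numpy as np

def clipped_value_loss(v_new, v_prev, returns, clip_eps):
    # Squared error of the raw (unclipped) value prediction.
    loss_unclipped = (v_new - returns) ** 2
    # Keep the prediction within clip_eps of the previous value estimate.
    v_clipped = v_prev + np.clip(v_new - v_prev, -clip_eps, clip_eps)
    loss_clipped = (v_clipped - returns) ** 2
    # Pessimistic bound: take the element-wise maximum, then average.
    return np.mean(0.5 * np.maximum(loss_unclipped, loss_clipped))

# Dummy usage:
v_new = np.array([1.0, 2.5, 0.3])
v_prev = np.array([0.8, 2.0, 0.5])
returns = np.array([1.2, 2.2, 0.1])
print(clipped_value_loss(v_new, v_prev, returns, clip_eps=0.2))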
Example 2
    # Same clipped value-loss construction as in Example 1, here gated by the
    # plus/plus_eps options.
    if args.plus and args.plus_eps > 0:
        vLossUncliped = (V.output - totalEstimatedDiscountedRewardPh)**2
        vClipped = VPrevPh + tf.clip_by_value(V.output - VPrevPh,
                                              -args.plus_eps, args.plus_eps)
        vLossClipped = (vClipped - totalEstimatedDiscountedRewardPh)**2
        vLossMax = tf.maximum(vLossClipped, vLossUncliped)
        stateValueLoss = tf.reduce_mean(0.5 * vLossMax)
    else:
        stateValueLoss = tf.reduce_mean(
            (V.output - totalEstimatedDiscountedRewardPh)**2)

    if discreteActionsSpace:
        KLcontraint = utils.categorical_kl(policy.logProbs, logProbsAllPh)
    else:
        KLcontraint = utils.diagonal_gaussian_kl(policy.actionMean,
                                                 policy.actionLogStd,
                                                 oldActionMeanPh,
                                                 oldActionLogStdPh)

    # Global-norm gradient clipping controlled by plus_plus_grad_clip; otherwise
    # a plain Adam minimize step on the value loss.
    if args.plus_plus and args.plus_plus_grad_clip >= 0:
        optimizer = tf.train.AdamOptimizer(learning_rate=learningRatePh)
        valGradients, valVaribales = zip(
            *optimizer.compute_gradients(stateValueLoss))
        valGradients, _ = tf.clip_by_global_norm(valGradients,
                                                 args.plus_plus_grad_clip)
        svfOptimizationStep = optimizer.apply_gradients(
            zip(valGradients, valVaribales))
    else:
        svfOptimizationStep = tf.train.AdamOptimizer(
            args.learning_rate_state_value).minimize(stateValueLoss)

    # other ops
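Both examples call utils.categorical_kl and utils.diagonal_gaussian_kl, whose definitions are not shown on this page. The sketch below gives the standard closed-form expressions such helpers typically implement; the argument order and the KL direction (old policy versus new policy) are assumptions on my part, so check the project's utils module before relying on it.

import numpy as np

def categorical_kl(logp_new, logp_old):
    # KL(old || new) for categorical distributions given log-probabilities of
    # shape (batch, num_actions); the direction is an assumption here.
    return np.mean(np.sum(np.exp(logp_old) * (logp_old - logp_new), axis=-1))

def diagonal_gaussian_kl(mu_new, log_std_new, mu_old, log_std_old):
    # Closed-form KL(old || new) between diagonal Gaussians, summed over
    # action dimensions and averaged over the batch.
    var_new = np.exp(2.0 * log_std_new)
    var_old = np.exp(2.0 * log_std_old)
    per_dim = (log_std_new - log_std_old
               + (var_old + (mu_old - mu_new) ** 2) / (2.0 * var_new)
               - 0.5)
    return np.mean(np.sum(per_dim, axis=-1))

# Dummy usage:
mu_old = np.zeros((4, 2))
mu_new = mu_old + 0.1
log_std_old = np.zeros((4, 2))
log_std_new = log_std_old - 0.05
print(diagonal_gaussian_kl(mu_new, log_std_new, mu_old, log_std_old))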