Example 1
        optimizer = tf.train.AdamOptimizer(
            learning_rate=args.learning_rate_state_value,
            epsilon=args.adam_eps)

    if args.grad_clip > 0:
        # clip value-function gradients by global norm before applying them
        valGradients, valVariables = zip(
            *optimizer.compute_gradients(stateValueLoss))
        valGradients, _ = tf.clip_by_global_norm(valGradients, args.grad_clip)
        svfOptimizationStep = optimizer.apply_gradients(
            zip(valGradients, valVariables))
    else:
        svfOptimizationStep = optimizer.minimize(stateValueLoss)
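For reference, tf.clip_by_global_norm rescales the whole list of gradients by min(1, clip_norm / global_norm), so the direction of the update is preserved while its overall magnitude is bounded. A minimal, self-contained sketch of the same compute/clip/apply pattern; the toy variable and loss below are made up purely for illustration:

import tensorflow as tf

# toy setup, purely illustrative
w = tf.Variable([3.0, 4.0])
loss = tf.reduce_sum(w * w)      # dloss/dw = 2*w = [6, 8], global norm 10
opt = tf.train.AdamOptimizer(learning_rate=0.01)

grads, variables = zip(*opt.compute_gradients(loss))
clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=5.0)
train_step = opt.apply_gradients(zip(clipped, variables))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run([clipped, global_norm]))  # [[3., 4.]], 10.0: scaled by 5/10
    sess.run(train_step)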

    # other ops: flat get/set views of the policy parameters
    policyParams = utils.get_vars(policyParamsScope)
    getPolicyParams = utils.flat_concat(policyParams)
    setPolicyParams = utils.assign_params_from_flat(policyParamsFlatten,
                                                    policyParams)
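The utils module is not shown here. A minimal sketch of what flat_concat and assign_params_from_flat plausibly look like; this mirrors the well-known Spinning Up TF1 helpers and should be treated as an assumption about this repo's utils rather than its actual code:

import numpy as np
import tensorflow as tf

def flat_concat(xs):
    # flatten each tensor to 1-D and concatenate into a single vector
    return tf.concat([tf.reshape(x, (-1,)) for x in xs], axis=0)

def assign_params_from_flat(x, params):
    # split the flat vector into per-variable slices, reshape, and assign
    flat_size = lambda p: int(np.prod(p.shape.as_list()))
    splits = tf.split(x, [flat_size(p) for p in params])
    new_params = [tf.reshape(v, p.shape) for v, p in zip(splits, params)]
    return tf.group([tf.assign(p, v) for p, v in zip(params, new_params)])

Getting and setting parameters through one flat vector is what lets a TRPO-style update overwrite the whole policy in a single sess.run call.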

    # Hessian-vector product of the KL constraint (d is the input vector placeholder)
    d, HxOp = utils.hesian_vector_product(KLcontraint, policyParams)
    # flat gradient of the surrogate policy loss
    surrogateFlatLoss = utils.flat_grad(Lloss, policyParams)

    if args.damping_coef > 0:
        HxOp += args.damping_coef * d
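utils.hesian_vector_product is not shown either. The standard trick is double backprop: differentiate the KL once, dot the resulting gradient with an input vector, and differentiate again, which yields H·x without ever forming the Hessian. A sketch under that assumption (flat_concat as sketched above):

import tensorflow as tf

def hessian_vector_product(f, params):
    # g = grad f;  grad of (g . x) w.r.t. params is the Hessian-vector product
    g = flat_concat(tf.gradients(ys=f, xs=params))
    x = tf.placeholder(tf.float32, shape=g.shape)
    return x, flat_concat(tf.gradients(ys=tf.reduce_sum(g * x), xs=params))

The damping term then turns H·x into (H + damping_coef·I)·x, which keeps the conjugate-gradient solve well conditioned when the KL Hessian is nearly singular.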

    # TF session initialization (the initialize_* functions are deprecated)
    init = tf.local_variables_initializer()
    init2 = tf.global_variables_initializer()
    sess.run([init, init2])

    nextObs = env.reset()
Example 2
            # clip both value-function and policy gradients by global norm
            valGradients, valVariables = zip(*optimizatierVf.compute_gradients(stateValueLoss))
            valGradients, _ = tf.clip_by_global_norm(valGradients, args.grad_clipping)
            optimizationStepVf = optimizatierVf.apply_gradients(zip(valGradients, valVariables))

            polGradients, polVariables = zip(*optimizatierPolicy.compute_gradients(Lloss))
            polGradients, _ = tf.clip_by_global_norm(polGradients, args.grad_clipping)
            optimizationStepPolicy = optimizatierPolicy.apply_gradients(zip(polGradients, polVariables))
        else:
            optimizationStepPolicy = optimizatierPolicy.minimize(Lloss)
            optimizationStepVf = optimizatierVf.minimize(stateValueLoss)
    else:
        optimizationStepPolicy = tf.train.AdamOptimizer(
            learning_rate=args.learning_rate_policy,
            epsilon=args.adam_eps).minimize(Lloss)
        optimizationStepVf = tf.train.AdamOptimizer(
            learning_rate=args.learning_rate_state_value,
            epsilon=args.adam_eps).minimize(stateValueLoss)
       
    trainableParams = utils.get_vars("AllTrainableParams")
    getTrainableParams = utils.flat_concat(trainableParams)
    setTrainableParams = utils.assign_params_from_flat(trainableParamsFlatten, trainableParams)
    
    # TF session initialization (the initialize_* functions are deprecated)
    init = tf.local_variables_initializer()
    init2 = tf.global_variables_initializer()
    sess.run([init, init2])
    
    nextObs = env.reset()
    nextDone = 0
    epLen = 0
    epTotalRew = 0
    # ring buffer of recent training-episode returns (deque from collections)
    epTotalTrainRews = deque(maxlen=args.test_episodes_with_noise)
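Because epTotalTrainRews is a bounded deque, appending more than test_episodes_with_noise returns silently drops the oldest one, so averaging it gives a moving statistic over the most recent training episodes. A tiny illustration of that behavior:

from collections import deque

rews = deque(maxlen=3)
for r in [1.0, 2.0, 3.0, 4.0]:
    rews.append(r)            # once full, the oldest entry is evicted
print(sum(rews) / len(rews))  # 3.0: the mean of [2.0, 3.0, 4.0]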

    # main training loop over epochs
    for e in range(args.epochs):