optimizer = tf.train.AdamOptimizer(
    learning_rate=args.learning_rate_state_value, epsilon=args.adam_eps)

if args.grad_clip > 0:
    # Clip the state-value gradients by global norm before applying them.
    valGradients, valVariables = zip(
        *optimizer.compute_gradients(stateValueLoss))
    valGradients, _ = tf.clip_by_global_norm(valGradients, args.grad_clip)
    svfOptimizationStep = optimizer.apply_gradients(
        zip(valGradients, valVariables))
else:
    svfOptimizationStep = optimizer.minimize(stateValueLoss)

# Other ops: flat views of the policy parameters plus the TRPO
# Hessian-vector product of the KL constraint.
policyParams = utils.get_vars(policyParamsScope)
getPolicyParams = utils.flat_concat(policyParams)
setPolicyParams = utils.assign_params_from_flat(policyParamsFlatten, policyParams)
d, HxOp = utils.hesian_vector_product(KLcontraint, policyParams)
surrogateFlatLoss = utils.flat_grad(Lloss, policyParams)
if args.damping_coef > 0:
    # Damping keeps the product well conditioned: (H + c*I)x = Hx + c*x.
    HxOp += args.damping_coef * d

# TF session initialization
init = tf.local_variables_initializer()
init2 = tf.global_variables_initializer()
sess.run([init, init2])

nextObs = env.reset()
nextDone = 0
epLen = 0
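# The TRPO ops above get H*x through utils.hesian_vector_product without
# ever forming the Hessian of the KL constraint. A minimal sketch of that
# standard TF1 pattern (an assumption: the repo's actual utils module is
# not shown here). It exploits that H*x is the gradient of (grad f . x),
# and returns a (placeholder, op) pair matching the (d, HxOp) usage above.
import tensorflow as tf

def hessianVectorProductSketch(f, params):
    grads = tf.gradients(f, params)
    flatGrad = tf.concat([tf.reshape(g, (-1,)) for g in grads], axis=0)
    # Placeholder for the vector x being multiplied by the Hessian;
    # TRPO's conjugate-gradient loop feeds its search direction here.
    x = tf.placeholder(dtype=tf.float32, shape=flatGrad.shape)
    hvpGrads = tf.gradients(tf.reduce_sum(flatGrad * x), params)
    return x, tf.concat([tf.reshape(g, (-1,)) for g in hvpGrads], axis=0)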
        valGradients, _ = tf.clip_by_global_norm(valGradients, args.grad_clipping)
        optimizationStepVf = optimizatierVf.apply_gradients(
            zip(valGradients, valVaribales))

        # Same global-norm clipping for the policy gradients.
        polGradients, polVaribales = zip(
            *optimizatierPolicy.compute_gradients(Lloss))
        polGradients, _ = tf.clip_by_global_norm(polGradients, args.grad_clipping)
        optimizationStepPolicy = optimizatierPolicy.apply_gradients(
            zip(polGradients, polVaribales))
    else:
        optimizationStepPolicy = optimizatierPolicy.minimize(Lloss)
        optimizationStepVf = optimizatierVf.minimize(stateValueLoss)
else:
    optimizationStepPolicy = tf.train.AdamOptimizer(
        learning_rate=args.learning_rate_policy,
        epsilon=args.adam_eps).minimize(Lloss)
    optimizationStepVf = tf.train.AdamOptimizer(
        learning_rate=args.learning_rate_state_value,
        epsilon=args.adam_eps).minimize(stateValueLoss)

# Flat views of every trainable parameter (policy and value function);
# a sketch of these helpers follows this excerpt.
trainableParams = utils.get_vars("AllTrainableParams")
getTrainableParams = utils.flat_concat(trainableParams)
setTrainableParams = utils.assign_params_from_flat(trainableParamsFlatten, trainableParams)

# TF session initialization
init = tf.local_variables_initializer()
init2 = tf.global_variables_initializer()
sess.run([init, init2])

nextObs = env.reset()
nextDone = 0
epLen = 0
epTotalRew = 0
epTotalTrainRews = deque(maxlen=args.test_episodes_with_noise)

# algorithm
for e in range(args.epochs):
    print("Epoch {} started".format(e))
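# Both code paths above rely on utils.flat_concat and
# utils.assign_params_from_flat to read and write all parameters as a
# single flat vector (e.g. for TRPO's line search and parameter
# snapshots). A sketch of what those helpers presumably look like,
# following the common TF1 pattern (an assumption; the actual utils
# module is not shown in this section):
import numpy as np

def flatConcatSketch(xs):
    # Flatten every tensor and concatenate into one 1-D tensor.
    return tf.concat([tf.reshape(x, (-1,)) for x in xs], axis=0)

def assignParamsFromFlatSketch(flatParams, params):
    # Split the flat vector into per-variable pieces, reshape each piece
    # to its variable's shape, and group the assignments into one op.
    sizes = [int(np.prod(p.shape.as_list())) for p in params]
    newParams = [tf.reshape(s, p.shape)
                 for s, p in zip(tf.split(flatParams, sizes), params)]
    return tf.group(*[p.assign(n) for p, n in zip(params, newParams)])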