Code Example #1
    # Imports needed by this snippet; the enclosing test is assumed to provide
    # s, a, nexts, r, theta, gamma as well as LQRRegressor,
    # EmpiricalBellmanResidualMinimization, check_v, lqr_reg and empirical_bop.
    import numpy as np
    import theano
    from time import time
    from scipy import optimize

    # Discretization of the actions, used for the maximum estimate over actions.
    discrete_actions = np.array([1., 2., 3.9],
                                dtype=theano.config.floatX).reshape(-1, 1)
    # print(s,a,nexts,r,discrete_actions)

    q_model = LQRRegressor(theta)  # q-function

    pfpo = EmpiricalBellmanResidualMinimization(q_model=q_model,
                                                discrete_actions=discrete_actions,
                                                gamma=gamma, optimizer="adam",
                                                state_dim=1, action_dim=1)
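    # Compile the auxiliary Theano functions (F_q, F_bellman_err,
    # F_grad_bellman_berr) used by the checks below, timing the compilation.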
    start = time()
    pfpo._make_additional_functions()
    print('compilation time: {}'.format(time() - start))

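    # The Q-values from the compiled Theano function should match the reference
    # implementation lqr_reg evaluated with the same parameters.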
    check_v(pfpo.F_q(s, a), lqr_reg(s, a, [q_model.theta.eval()]))

    print('\n--- checking bellman error')
    berr = pfpo.F_bellman_err(s, a, nexts, r, discrete_actions)
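    # Reference value of the empirical Bellman residual, computed outside Theano
    # by empirical_bop with the current parameters.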
    tv = empirical_bop(s, a, r, nexts, discrete_actions, gamma, lqr_reg, [q_model.theta.eval()])
    check_v(berr, tv, 1)

    print('\n--- checking gradient of the bellman error')
    berr_grad = pfpo.F_grad_bellman_berr(s, a, nexts, r, discrete_actions)
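    # Approximate the same gradient by forward finite differences of the
    # reference objective (empirical_bop) and compare it with the analytic
    # gradient computed above.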
    eps = np.sqrt(np.finfo(float).eps)
    f = lambda x: empirical_bop(s, a, r, nexts, discrete_actions, gamma, lqr_reg, [x])
    approx_grad = optimize.approx_fprime(q_model.theta.eval().ravel(), f,
                                         eps).reshape(berr_grad[0].shape)
    check_v(berr_grad, approx_grad, 1)

    print()
    print('--' * 30)
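
The gradient check above follows the usual finite-difference pattern: scipy's
optimize.approx_fprime perturbs each parameter by a small step eps and the
resulting numerical slope is compared with the analytic gradient. A minimal,
self-contained sketch of that pattern, using a toy quadratic objective in place
of the empirical Bellman error (f, grad_f and theta0 below are illustrative,
not part of the library), might look like:

    import numpy as np
    from scipy import optimize

    # Toy objective standing in for the empirical Bellman error.
    def f(theta):
        return 0.5 * np.sum(theta ** 2) + theta[0] * theta[1]

    # Analytic gradient of the toy objective.
    def grad_f(theta):
        g = theta.copy()
        g[0] += theta[1]
        g[1] += theta[0]
        return g

    theta0 = np.array([1.0, -2.0, 0.5])
    eps = np.sqrt(np.finfo(float).eps)  # same step size as in the test above
    approx = optimize.approx_fprime(theta0, f, eps)
    assert np.allclose(approx, grad_f(theta0), atol=1e-5)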