Example #1
        states, actions, rewards, done = [env.reset()], [], [], False

        while not done:
            a = behavior_policy.action(states[-1])
            s, r, done = env.step(a)

            states.append(s)
            actions.append(a)
            rewards.append(r)

        traj = list(zip(states[:-1],actions,rewards,states[1:]))
        trajs.append(traj)

    # On-policy evaluation test
    Q_est_ois = mc_ois(env.spec,trajs,behavior_policy,behavior_policy,np.zeros((env.spec.nS,env.spec.nA)))
    Q_est_wis = mc_wis(env.spec,trajs,behavior_policy,behavior_policy,np.zeros((env.spec.nS,env.spec.nA)))
    V_est_td = ntd(env.spec,trajs,1,0.005,np.zeros((env.spec.nS)))

    assert np.allclose(Q_est_ois,np.array([[0.19,0.],[0.,0.]]),1e-5,1e-1), 'due to stochasticity, this test might fail'
    assert np.allclose(Q_est_wis,np.array([[0.19,0.],[0.,0.]]),1e-5,1e-1), 'due to stochasticity, this test might fail'
    assert np.allclose(Q_est_ois,Q_est_wis), 'Both implementations should be equal in the on-policy case'
    assert np.allclose(V_est_td,np.array([0.1,0.]),1e-5,1e-1), 'due to stochasticity, this test might fail'

    # Off-policy evaluation test
    Q_est_ois = mc_ois(env.spec,trajs,behavior_policy,eval_policy,np.zeros((env.spec.nS,env.spec.nA)))
    Q_est_wis = mc_wis(env.spec,trajs,behavior_policy,eval_policy,np.zeros((env.spec.nS,env.spec.nA)))

    # Don't panic if Q_est_ois shows a high estimation error; it's expected!
    print(Q_est_ois)
    print(Q_est_wis)
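
A possible shape for the mc_ois helper called above is sketched below: every-visit ordinary importance sampling, where each return is weighted by the product of target-over-behavior action probabilities from the step after the visited state-action pair to the end of the episode. The env_spec.gamma field and the action_prob(state, action) method on policies are assumptions about the surrounding code, not taken from it; this is an illustrative sketch, not the implementation used in the tests.

import numpy as np

def mc_ois_sketch(env_spec, trajs, bpi, pi, initQ):
    """Every-visit ordinary importance sampling Q evaluation (illustrative sketch)."""
    Q = initQ.copy()
    visit_count = np.zeros_like(Q)   # visits per (state, action)
    weighted_sum = np.zeros_like(Q)  # running sum of rho-weighted returns

    for traj in trajs:
        G = 0.0     # return from step t to the end of the episode
        rho = 1.0   # importance ratio over steps t+1 .. T-1
        # Walk backwards so G and rho can be built incrementally.
        for (s, a, r, _) in reversed(traj):
            G = r + env_spec.gamma * G
            visit_count[s, a] += 1
            weighted_sum[s, a] += rho * G
            # The ratio for Q(s, a) excludes the action taken at s itself,
            # so rho is extended only after the update.
            rho *= pi.action_prob(s, a) / bpi.action_prob(s, a)

    visited = visit_count > 0
    Q[visited] = weighted_sum[visited] / visit_count[visited]
    return Q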
Example #2
    V, Q = value_prediction(grid_world, behavior_policy, initV, 1e-12)
    print(V.reshape((4, 4)))

    print("DP value iteration optimal value and policy")
    V, pi = value_iteration(grid_world, initV, 1e-12)
    print(V.reshape((4, 4)))
    print(visualize(pi).reshape((4, 4)))

    # On-policy evaluation tests for random policy
    # OIS
    Q_est_ois = mc_ois(grid_world.spec, trajs, behavior_policy,
                       behavior_policy,
                       np.zeros((grid_world.spec.nS, grid_world.spec.nA)))
    # WIS
    Q_est_wis = mc_wis(grid_world.spec, trajs, behavior_policy,
                       behavior_policy,
                       np.zeros((grid_world.spec.nS, grid_world.spec.nA)))
    # 3-step TD with alpha = 0.005
    V_est_td = ntd(grid_world.spec, trajs, 3, 0.005,
                   np.zeros((grid_world.spec.nS)))

    print("On random policy value OIS: ")
    print(Q2V(Q_est_ois, behavior_policy).reshape((4, 4)))
    print("On random policy value WIS: ")
    print(Q2V(Q_est_wis, behavior_policy).reshape((4, 4)))
    print("3-step TD value estimation on random policy: ")
    print(V_est_td.reshape((4, 4)))

    # Off-policy evaluation test with optimal policy
    Q_est_ois = mc_ois(grid_world.spec, trajs, behavior_policy, pi,
                       np.zeros((grid_world.spec.nS, grid_world.spec.nA)))
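
The Q2V helper used in the prints above presumably collapses a state-action value table into a state value table under a given policy, V(s) = sum_a pi(a|s) * Q(s, a). A minimal sketch, again assuming an action_prob(state, action) interface on the policy rather than the project's actual helper:

import numpy as np

def q_to_v_sketch(Q, policy):
    """Collapse Q(s, a) into V(s) by averaging over the policy's action probabilities."""
    nS, nA = Q.shape
    V = np.zeros(nS)
    for s in range(nS):
        for a in range(nA):
            V[s] += policy.action_prob(s, a) * Q[s, a]
    return V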