states, actions, rewards, done = [env.reset()], [], [], False
while not done:
    a = behavior_policy.action(states[-1])
    s, r, done = env.step(a)

    states.append(s)
    actions.append(a)
    rewards.append(r)

traj = list(zip(states[:-1], actions, rewards, states[1:]))
trajs.append(traj)

# On-policy evaluation test
Q_est_ois = mc_ois(env.spec, trajs, behavior_policy, behavior_policy, np.zeros((env.spec.nS, env.spec.nA)))
Q_est_wis = mc_wis(env.spec, trajs, behavior_policy, behavior_policy, np.zeros((env.spec.nS, env.spec.nA)))
V_est_td = ntd(env.spec, trajs, 1, 0.005, np.zeros(env.spec.nS))

assert np.allclose(Q_est_ois, np.array([[0.19, 0.], [0., 0.]]), 1e-5, 1e-1), 'due to stochasticity, this test might fail'
assert np.allclose(Q_est_wis, np.array([[0.19, 0.], [0., 0.]]), 1e-5, 1e-1), 'due to stochasticity, this test might fail'
assert np.allclose(Q_est_ois, Q_est_wis), 'both implementations should agree in the on-policy case'
assert np.allclose(V_est_td, np.array([0.1, 0.]), 1e-5, 1e-1), 'due to stochasticity, this test might fail'

# Off-policy evaluation test
Q_est_ois = mc_ois(env.spec, trajs, behavior_policy, eval_policy, np.zeros((env.spec.nS, env.spec.nA)))
Q_est_wis = mc_wis(env.spec, trajs, behavior_policy, eval_policy, np.zeros((env.spec.nS, env.spec.nA)))

# Don't panic if Q_est_ois shows high estimation error; that is expected:
# ordinary importance sampling is unbiased but its variance can blow up off-policy.
print(Q_est_ois)
print(Q_est_wis)
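# For reference, a minimal sketch of what the two Monte Carlo estimators above
# might compute: every-visit Q estimation with ordinary importance sampling
# (normalize by visit counts) versus weighted importance sampling (normalize by
# the accumulated ratios). The spec.gamma attribute and the
# action_prob(state, action) method on the policies are assumed interfaces,
# not confirmed by these tests; the actual mc_ois/mc_wis may differ in detail.

def mc_ois_sketch(spec, trajs, bpi, pi, initQ):
    Q = initQ.astype(float)
    n_visits = np.zeros_like(Q)                 # visit counts per (s, a)
    for traj in trajs:
        G, rho = 0.0, 1.0
        # walk the episode backwards so the return and the importance
        # ratio can be accumulated incrementally
        for s, a, r, _ in reversed(traj):
            G = r + spec.gamma * G
            n_visits[s, a] += 1
            # for Q(s, a) the ratio covers only the actions taken *after*
            # (s, a), which is exactly what rho holds before this update
            Q[s, a] += (rho * G - Q[s, a]) / n_visits[s, a]
            rho *= pi.action_prob(s, a) / bpi.action_prob(s, a)
    return Q

def mc_wis_sketch(spec, trajs, bpi, pi, initQ):
    Q = initQ.astype(float)
    C = np.zeros_like(Q)                        # cumulative importance weights
    for traj in trajs:
        G, rho = 0.0, 1.0
        for s, a, r, _ in reversed(traj):
            G = r + spec.gamma * G
            if rho > 0:
                C[s, a] += rho
                Q[s, a] += (rho / C[s, a]) * (G - Q[s, a])
            rho *= pi.action_prob(s, a) / bpi.action_prob(s, a)
    return Q

# WIS normalizes by the importance ratios themselves, which keeps the estimate
# bounded by the observed returns; that is why it is typically much less noisy
# than OIS off-policy, at the cost of a small bias.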
V, Q = value_prediction(grid_world, behavior_policy, initV, 1e-12)
print(V.reshape((4, 4)))

print("DP value iteration optimal value and policy")
V, pi = value_iteration(grid_world, initV, 1e-12)
print(V.reshape((4, 4)))
print(visualize(pi).reshape((4, 4)))

# On-policy evaluation tests for the random policy
# OIS
Q_est_ois = mc_ois(grid_world.spec, trajs, behavior_policy, behavior_policy,
                   np.zeros((grid_world.spec.nS, grid_world.spec.nA)))
# WIS
Q_est_wis = mc_wis(grid_world.spec, trajs, behavior_policy, behavior_policy,
                   np.zeros((grid_world.spec.nS, grid_world.spec.nA)))
# 3-step TD with alpha = 0.005
V_est_td = ntd(grid_world.spec, trajs, 3, 0.005, np.zeros(grid_world.spec.nS))

print("Random policy value estimate (OIS):")
print(Q2V(Q_est_ois, behavior_policy).reshape((4, 4)))
print("Random policy value estimate (WIS):")
print(Q2V(Q_est_wis, behavior_policy).reshape((4, 4)))
print("3-step TD value estimate for the random policy:")
print(V_est_td.reshape((4, 4)))

# Off-policy evaluation test with the optimal policy
Q_est_ois = mc_ois(grid_world.spec, trajs, behavior_policy, pi,
                   np.zeros((grid_world.spec.nS, grid_world.spec.nA)))
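# A sketch of the Q2V helper used above, assuming it simply collapses an
# (nS, nA) action-value table into state values under the given policy,
# V(s) = sum_a pi(a|s) * Q(s, a). The action_prob(state, action) interface on
# the policy is an assumption here, not something these tests confirm.

def q2v_sketch(Q, pi):
    nS, nA = Q.shape
    V = np.zeros(nS)
    for s in range(nS):
        # expected action value under the policy's action distribution
        V[s] = sum(pi.action_prob(s, a) * Q[s, a] for a in range(nA))
    return V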