import numpy as np

from rlplan.agents.planning import DynProgAgent
from rlplan.envs.gridworld import GridWorld
from rlplan.envs.toy import ToyEnv2  # path inferred from ToyEnv1, which lives in rlplan.envs.toy


def test_value_and_policy_iteration_gridworld(sx, sy, gamma):
    # Tolerance
    tol = 1e-8
    # Environment
    env = GridWorld(nrows=sx, ncols=sy)
    dp_agent_val = DynProgAgent(env, gamma=gamma, method='value-iteration')
    dp_agent_pol = DynProgAgent(env, gamma=gamma, method='policy-iteration')
    V_value_it, _ = dp_agent_val.train(val_it_tol=tol)
    V_pol_it, _ = dp_agent_pol.train()
    assert np.allclose(V_value_it, V_pol_it, atol=tol, rtol=1e2 * tol)
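# A minimal sketch of the value-iteration loop that a tolerance such as
# `val_it_tol` typically controls (illustrative only, not DynProgAgent's
# actual implementation; assumes a transition tensor P[s, a, s'] and a
# reward array R[s, a]):
def value_iteration_sketch(P, R, gamma, tol=1e-8):
    Ns, Na, _ = P.shape
    V = np.zeros(Ns)
    while True:
        # Bellman optimality backup: Q(s, a) = R(s, a) + gamma * sum_s' P(s, a, s') V(s')
        Q = R + gamma * (P @ V)            # shape (Ns, Na)
        V_new = Q.max(axis=1)
        if np.abs(V_new - V).max() < tol:  # sup-norm stopping rule
            return V_new
        V = V_new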
def test_value_and_policy_iteration(gamma, seed, Ns, Na):
    # Tolerance
    tol = 1e-8
    # Environment
    env = ToyEnv2(Ns, Na, seed)
    dp_agent_val = DynProgAgent(env, gamma=gamma, method='value-iteration')
    dp_agent_pol = DynProgAgent(env, gamma=gamma, method='policy-iteration')
    V_value_it, _ = dp_agent_val.train(val_it_tol=tol)
    V_pol_it, _ = dp_agent_pol.train()
    assert dp_agent_val.policy == dp_agent_pol.policy
    assert np.allclose(V_value_it, V_pol_it, atol=tol, rtol=1e1 * tol)
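# Both tests receive their arguments from the test harness; with pytest they
# could be driven by stacked parametrize decorators. A sketch (the parameter
# grids below are illustrative, not the project's actual values):
import pytest

@pytest.mark.parametrize("gamma", [0.5, 0.9])
@pytest.mark.parametrize("seed", [42, 123])
@pytest.mark.parametrize("Ns", [5, 10])
@pytest.mark.parametrize("Na", [2, 4])
def test_value_and_policy_iteration_parametrized(gamma, seed, Ns, Na):
    ...  # body as in test_value_and_policy_iteration above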
assert nrows >= 3
assert ncols >= 3

# defining walls
middle_col = ncols // 2
middle_row = nrows // 2
walls = ()
for row in range(nrows):
    if row != middle_row:
        walls += ((row, middle_col), )
# e.g. with nrows = ncols = 9 (as below), middle_row = middle_col = 4 and the
# wall fills column 4 in every row except row 4, leaving a single opening
# between the two rooms at (4, 4)

super().__init__(seed_val, nrows, ncols, start_coord, terminal_states,
                 success_probability, reward_at, walls, default_reward,
                 enable_render)


if __name__ == '__main__':
    gw = TwoRoomDense(9, 9, success_probability=1.0)
    from rlplan.agents.planning import DynProgAgent

    dynprog = DynProgAgent(gw, method='policy-iteration', gamma=0.9)
    V, _ = dynprog.train()
    gw.display_values(V)

    # run
    gw.render(mode='auto', policy=dynprog.policy)

    # reset
    gw.reset()
from rlplan.envs.toy import ToyEnv1
from rlplan.envs.gridworld import GridWorld
from rlplan.envs import Chain
from rlplan.prediction import TabularTD
from rlplan.agents.planning import DynProgAgent
from rlplan.agents import QLearningAgent  # import path assumed for this excerpt

# Discount factor
gamma = 0.9

# Create environment
# env = Chain(10)
# env = ToyEnv1()
env = GridWorld(success_probability=0.9, nrows=4, ncols=4, walls=((1, 1), ))

# Initialize and train dynamic programming agent
dp_agent = DynProgAgent(env, method='policy-iteration', gamma=gamma)
V_dp, _ = dp_agent.train()

# Initialize and train q-learning agent
ql_agent = QLearningAgent(env, gamma=gamma, learning_rate=None,
                          min_learning_rate=0.1, epsilon=0.2)
training_info = ql_agent.train(n_steps=1e5)
V_ql = training_info['V']

# Use tabular TD
tab_td = TabularTD(env, dp_agent.policy, gamma, lambd=0.9,
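# TabularTD with lambd=0.9 runs TD(lambda) policy evaluation. A minimal
# self-contained sketch of the accumulating-traces update it is built on
# (illustrative only, not rlplan's implementation; assumes a gym-style
# env.step(action) -> (next_state, reward, done, info)):
import numpy as np

def td_lambda_sketch(env, policy, gamma, lambd, alpha=0.1, n_episodes=500):
    V = np.zeros(env.Ns)
    for _ in range(n_episodes):
        state = env.reset()
        traces = np.zeros(env.Ns)  # eligibility traces
        done = False
        while not done:
            next_state, reward, done, _ = env.step(policy.sample(state))
            # TD error; the bootstrap term vanishes at terminal states
            delta = reward + gamma * V[next_state] * (not done) - V[state]
            traces[state] += 1.0        # accumulating traces
            V += alpha * delta * traces
            traces *= gamma * lambd     # decay all traces each step
            state = next_state
    return V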
""" action_freq = np.zeros((env.Ns, env.Na)) H = len(env.history) for ii in range(H): state, action, reward, next_state, done = env.history[ii] action_freq[state, action] += 1.0 for state in range(env.Ns): action_freq[ state, :] = action_freq[state, :] / action_freq[state, :].sum() return action_freq if __name__ == '__main__': from rlplan.envs import GridWorld from rlplan.agents.planning import DynProgAgent env = GridWorld() dp_agent = DynProgAgent(env, method='policy-iteration', gamma=0.9) dp_agent.train() env.track = True for step in range(15): env.step(dp_agent.policy.sample(env.state)) draw_gridworld_history(env) env.render('manual')