def run_rl():
    walls = [5, 6, 13]
    height = 4
    width = 5
    m = build_maze(width, height, walls, hit=True)
    q = policy_iteration_q(m, render=False)
    pol = get_policy_from_q(q)
    print("TD-learning")
    temporal_difference(m, pol, render=True)
    print("Q-learning")
    q_learning(m, tau=6)
    print("Sarsa")
    sarsa(m, tau=6)
    input("press enter")
def run_rl():
    # Variant of run_rl: compare Q-learning and Sarsa through plot_ql_sarsa_para
    walls = [5, 6, 13]
    height = 4
    width = 5
    m = build_maze(width, height, walls, hit=True)
    # m = create_maze(8, 8, 0.2)
    q, _, _, _ = policy_iteration_q(m, render=False)
    pol = get_policy_from_q(q)
    # print("TD-learning")
    # temporal_difference(m, pol, render=True)
    # input("press enter")
    # print("Q-learning")
    # q_learning_eps(m, tau=6)
    plot_ql_sarsa_para(m, 0.001, 6, 1000, 50, 0.5, False)
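# The learning functions below return a q_list of per-episode Q-norm values.
# The sketch below is a hypothetical plotting helper (matplotlib assumed, not
# part of the original code) showing one way such curves could be compared;
# plot_ql_sarsa_para above presumably serves a similar purpose.
import matplotlib.pyplot as plt


def plot_learning_curves(mdp, tau, epsilon):
    # Compare the convergence traces of soft-max Q-learning and
    # epsilon-greedy Sarsa on the same maze (sketch, not the original helper).
    _, ql_list = q_learning_soft(mdp, tau, mdp.gamma, render=False)
    _, sarsa_list = sarsa_eps(mdp, epsilon, render=False)
    plt.plot(ql_list, label="Q-learning (soft-max)")
    plt.plot(sarsa_list, label="Sarsa (epsilon-greedy)")
    plt.xlabel("episode")
    plt.ylabel("||Q||")
    plt.legend()
    plt.show()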
def q_learning_soft(mdp, tau, gamma, nb_episodes=20, timeout=50, alpha=0.5, render=True):
    # Initialize the state-action value function
    # alpha is the learning rate
    q = np.zeros((mdp.nb_states, mdp.action_space.size))
    q_min = np.zeros((mdp.nb_states, mdp.action_space.size))
    q_list = []

    # Run the learning cycle
    mdp.timeout = timeout  # episode length
    if render:
        mdp.new_render()

    for _ in range(nb_episodes):
        # Draw the first state of the episode using a uniform distribution over all the states
        x = mdp.reset(uniform=True)
        done = mdp.done()
        while not done:
            if render:
                # Show the agent in the maze
                mdp.render(q, q.argmax(axis=1))

            # Draw an action using a soft-max policy
            u = mdp.action_space.sample(prob_list=softmax(q, x, tau))

            # Perform a step of the MDP
            [y, r, done, _] = mdp.step(u)

            # Update the state-action value function with the Q-learning rule
            if x in mdp.terminal_states:
                q[x, u] = r
            else:
                delta = r + gamma * np.max(q[y]) - q[x, u]
                q[x, u] = q[x, u] + alpha * delta

            # Update the agent position
            x = y
        q_list.append(np.linalg.norm(np.maximum(q, q_min)))

    if render:
        # Show the final policy
        mdp.current_state = 0
        mdp.render(q, get_policy_from_q(q))
    return q, q_list
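# q_learning_soft and sarsa rely on a softmax(q, x, tau) helper defined
# elsewhere in the repository. As an assumption, a minimal Boltzmann
# action-selection sketch compatible with the calls above (numpy already
# imported as np) could look like:
def softmax(q, x, tau):
    # Boltzmann probabilities over the actions of state x, with temperature tau;
    # subtracting the max before exponentiating keeps the exponentials stable.
    scaled = (q[x] - np.max(q[x])) / tau
    probs = np.exp(scaled)
    return probs / np.sum(probs)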
def sarsa(mdp, tau, nb_episodes=20, timeout=50, alpha=0.5, render=True):
    # Initialize the state-action value function
    # alpha is the learning rate
    q = np.zeros((mdp.nb_states, mdp.action_space.size))

    # Run the learning cycle
    mdp.timeout = timeout  # episode length
    if render:
        mdp.new_render()

    for _ in range(nb_episodes):
        # Draw the first state of the episode using a uniform distribution over all the states
        x = mdp.reset(uniform=True)
        done = mdp.done()

        # Draw the first action using a soft-max policy
        u = mdp.action_space.sample(prob_list=softmax(q, x, tau))

        while not done:
            if render:
                # Show the agent in the maze
                mdp.render(q, q.argmax(axis=1))

            # Perform a step of the MDP
            [y, r, done, _] = mdp.step(u)

            # Update the state-action value function with the Sarsa rule
            if x in mdp.terminal_states:
                q[x, u] = r
            else:
                # Draw the next action using a soft-max policy
                u2 = mdp.action_space.sample(prob_list=softmax(q, y, tau))
                delta = r + mdp.gamma * q[y, u2] - q[x, u]
                q[x, u] = q[x, u] + alpha * delta
                u = u2

            # Update the agent position
            x = y

    if render:
        # Show the final policy
        mdp.current_state = 0
        mdp.render(q, get_policy_from_q(q))
    return q
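# get_policy_from_q is used above for rendering but defined elsewhere in the
# repository. Given the (nb_states, nb_actions) layout of q used here, it is
# assumed to be the usual greedy extraction; a minimal sketch (numpy already
# imported as np):
def get_policy_from_q(q):
    # Greedy policy: for each state, select the action with the largest Q-value.
    return np.argmax(q, axis=1)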
def sarsa_eps(mdp, epsilon, nb_episodes=20, timeout=50, alpha=0.5, render=True):
    # Initialize the state-action value function
    # alpha is the learning rate
    q = np.zeros((mdp.nb_states, mdp.action_space.size))
    q_min = np.zeros((mdp.nb_states, mdp.action_space.size))
    q_list = []

    # Run the learning cycle
    mdp.timeout = timeout  # episode length
    if render:
        mdp.new_render()

    for _ in range(nb_episodes):
        # Draw the first state of the episode using a uniform distribution over all the states
        x = mdp.reset(uniform=True)
        done = mdp.done()
        while not done:
            if render:
                # Show the agent in the maze
                mdp.render(q, q.argmax(axis=1))

            # Draw an action using an epsilon-greedy policy
            u = egreedy(q, x, epsilon)

            # Perform a step of the MDP
            [y, r, done, _] = mdp.step(u)

            # Update the state-action value function with the Sarsa rule
            if x in mdp.terminal_states:
                q[x, u] = r
            else:
                uy = egreedy(q, y, epsilon)
                delta = r + mdp.gamma * q[y, uy] - q[x, u]
                q[x, u] = q[x, u] + alpha * delta

            # Update the agent position
            x = y
        q_list.append(np.linalg.norm(np.maximum(q, q_min)))

    if render:
        # Show the final policy
        mdp.current_state = 0
        mdp.render(q, get_policy_from_q(q))
    return q, q_list
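# sarsa_eps calls an egreedy(q, x, epsilon) helper that is defined elsewhere.
# Assuming the standard epsilon-greedy rule, a minimal sketch (numpy already
# imported as np) could be:
def egreedy(q, x, epsilon):
    # With probability epsilon pick a uniformly random action,
    # otherwise pick the greedy action for state x.
    if np.random.random() < epsilon:
        return np.random.randint(q.shape[1])
    return np.argmax(q[x])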