def run_rl():
    walls = [5, 6, 13]
    height = 4
    width = 5
    m = build_maze(width, height, walls, hit=True)

    q = policy_iteration_q(m, render=False)
    pol = get_policy_from_q(q)
    print("TD-learning")
    temporal_difference(m, pol, render=True)
    print("Q-learning")
    q_learning(m, tau=6)
    print("Sarsa")
    sarsa(m, tau=6)
    input("press enter")
Example 2
def run_rl():
    walls = [5, 6, 13]
    height = 4
    width = 5
    m = build_maze(width, height, walls, hit=True)
    # m = create_maze(8, 8, 0.2)
    print("1")
    q,_,_,_ = policy_iteration_q(m, render=0)
    print("1")
    pol = get_policy_from_q(q)
    # print("TD-learning")
    # temporal_difference(m, pol, render=True)
    # input("press enter")
    # print("Q-learning")
    # q_learning_eps(m, tau=6)
    # compare Q-learning and SARSA learning curves (a hypothetical plotting sketch follows below)
    plot_ql_sarsa_para(m, 0.001, 6, 1000, 50, 0.5, False)
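# plot_ql_sarsa_para() is not shown in these examples; judging from its name it
# compares Q-learning and SARSA learning curves. The sketch below is a hypothetical
# version of such a comparison built from the q_list values returned by the
# learning functions shown later (matplotlib is assumed to be available).
import matplotlib.pyplot as plt

def plot_ql_sarsa_sketch(ql_list, sarsa_list):
    # Plot the norm of the Q-table after each episode for both algorithms
    plt.plot(ql_list, label="Q-learning")
    plt.plot(sarsa_list, label="SARSA")
    plt.xlabel("episode")
    plt.ylabel("||Q||")
    plt.legend()
    plt.title("Q-learning vs SARSA")
    plt.show()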
Example 3
import numpy as np


def q_learning_soft(mdp, tau, gamma, nb_episodes=20, timeout=50, alpha=0.5, render=True):
    # Initialize the state-action value function
    # alpha is the learning rate
    q = np.zeros((mdp.nb_states, mdp.action_space.size))
    q_min = np.zeros((mdp.nb_states, mdp.action_space.size))
    q_list = []

    # Run learning cycle
    mdp.timeout = timeout  # episode length

    if render:
        mdp.new_render()

    for _ in range(nb_episodes):
        # print(i)
        # Draw the first state of episode i using a uniform distribution over all the states
        x = mdp.reset(uniform=True)
        done = mdp.done()
        while not done:
            if render:
                # Show the agent in the maze
                mdp.render(q, q.argmax(axis=1))

            # Draw an action using a soft-max policy
            u = mdp.action_space.sample(prob_list=softmax(q, x, tau))

            # Perform a step of the MDP
            [y, r, done, _] = mdp.step(u)

            # Update the state-action value function with the Q-learning rule
            if x in mdp.terminal_states:
                q[x, u] = r
            else:
                delta = r + gamma * np.max(q[y]) - q[x, u]
                q[x, u] = q[x, u] + alpha * delta

            # Update the agent position
            x = y
        q_list.append(np.linalg.norm(np.maximum(q, q_min)))

    if render:
        # Show the final policy
        mdp.current_state = 0
        mdp.render(q, get_policy_from_q(q))
    return q, q_list
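# q_learning_soft() and sarsa() both draw actions with softmax(q, x, tau), which is
# not defined in these snippets. A minimal hypothetical sketch of such a helper,
# assuming q is the full Q-table, x the current state and tau a Boltzmann
# temperature (a higher tau means more exploration):
def softmax_sketch(q, x, tau):
    prefs = q[x] / tau
    prefs = prefs - np.max(prefs)  # subtract the max for numerical stability
    probs = np.exp(prefs)
    return probs / np.sum(probs)   # probability of each action in state x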
def sarsa(mdp, tau, nb_episodes=20, timeout=50, alpha=0.5, render=True):
    # Initialize the state-action value function
    # alpha is the learning rate
    q = np.zeros((mdp.nb_states, mdp.action_space.size))

    # Run learning cycle
    mdp.timeout = timeout  # episode length

    if render:
        mdp.new_render()

    for _ in range(nb_episodes):
        # Draw the first state of episode i using a uniform distribution over all the states
        x = mdp.reset(uniform=True)
        done = mdp.done()

        # Draw an action using a soft-max policy
        u = mdp.action_space.sample(prob_list=softmax(q, x, tau))
        while not done:
            if render:
                # Show the agent in the maze
                mdp.render(q, q.argmax(axis=1))

            # Perform a step of the MDP
            [y, r, done, _] = mdp.step(u)

            # Update the state-action value function with the SARSA rule
            if x in mdp.terminal_states:
                q[x, u] = r
            else:
                # Draw the next action using a soft-max policy
                u2 = mdp.action_space.sample(prob_list=softmax(q, y, tau))
                delta = r + mdp.gamma * q[y, u2] - q[x, u]
                q[x, u] = q[x, u] + alpha * delta
                u = u2

            # Update the agent position
            x = y

    if render:
        # Show the final policy
        mdp.current_state = 0
        mdp.render(q, get_policy_from_q(q))
    return q
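# get_policy_from_q() is used above to display the final policy but is not shown
# in these snippets. A plausible one-line sketch: the greedy policy picks, in
# every state, the action with the highest Q-value.
def get_policy_from_q_sketch(q):
    return np.argmax(q, axis=1)  # one greedy action per state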
Example 5
import numpy as np


def sarsa_eps(mdp, epsilon, nb_episodes=20, timeout=50, alpha=0.5, render=True):
    # Initialize the state-action value function
    # alpha is the learning rate
    q = np.zeros((mdp.nb_states, mdp.action_space.size))
    q_min = np.zeros((mdp.nb_states, mdp.action_space.size))
    q_list = []
    # Run learning cycle
    mdp.timeout = timeout  # episode length
    if render:
        mdp.new_render()
    for i in range(nb_episodes):
        print(i)  # show the current episode index to track progress
        # Draw the first state of episode i using a uniform distribution over all the states
        x = mdp.reset(uniform=True)
        done = mdp.done()
        while not done:
            if render:
                # Show the agent in the maze
                mdp.render(q, q.argmax(axis=1))
            # Draw an action using an epsilon-greedy policy
            u = egreedy(q, x, epsilon)
            # Perform a step of the MDP
            [y, r, done, _] = mdp.step(u)
            # Update the state-action value function with the SARSA rule
            if x in mdp.terminal_states:
                q[x, u] = r
            else:
                uy = egreedy(q, y, epsilon)
                delta = r + mdp.gamma * q[y, uy] - q[x, u]
                q[x, u] = q[x, u] + alpha * delta
            # Update the agent position
            x = y
        q_list.append(np.linalg.norm(np.maximum(q, q_min)))
    if render:
        # Show the final policy
        mdp.current_state = 0
        mdp.render(q, get_policy_from_q(q))
    return q, q_list
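# sarsa_eps() relies on an egreedy(q, x, epsilon) helper that is not shown in these
# snippets. A minimal hypothetical sketch: with probability epsilon take a uniformly
# random action, otherwise take the greedy one.
def egreedy_sketch(q, x, epsilon):
    if np.random.random() < epsilon:
        return np.random.randint(q.shape[1])  # explore: random action index
    return int(np.argmax(q[x]))               # exploit: greedy action in state x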