Example #1
def run_Agent(param):
    '''Run the agent for up to param transitions, choosing each
    action according to the policy computed by VI.
    Return True if more turns can still be taken.'''
    global Agent_state, Terminal_state
    for i in range(param):
        if Agent_state == Terminal_state:
            print("Terminal state reached!")
            return False
        a = VI.apply_policy(Agent_state)
        Agent_turn(a)
    return True
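VI.apply_policy is defined in a separate module and is not shown in this listing; conceptually it returns the action that the current VI policy recommends for a state. A minimal sketch of such a greedy lookup, assuming a Q-value table that maps (state, action) pairs to floats (the function name and table layout are assumptions, not part of the original code):

def greedy_action_sketch(state, q_table, actions):
    # Pick the action with the highest Q-value in this state,
    # i.e., pi(s) = argmax_a Q(s, a).
    return max(actions, key=lambda a: q_table[(state, a)])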
Example #2
def run_QL_agent(param, action=None):
    '''Run the Q-learning agent for up to param transitions.  If action is
    given, use it on every transition; otherwise fall back to the VI policy.
    Return True if more turns can still be taken.'''
    global TERMINATED
    #print("In run_QL_agent, action = "+action)
    global Agent_state, Terminal_state
    for i in range(param):
        if Agent_state == Terminal_state:
            print("Terminal state reached!")
            TERMINATED = True
            return False
        if action:
            a = action
        else:
            # Should prob. use a different policy method.
            a = VI.apply_policy(Agent_state)
        Agent_turn(a)
        #print("Need to perform a Q update here.")
    return True
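The commented-out reminder above ("Need to perform a Q update here.") refers to the standard one-step tabular Q-learning update. A minimal sketch, assuming a q_table that maps (state, action) pairs to floats; the function name and table layout are assumptions, not taken from this codebase:

def q_update_sketch(q_table, s, a, reward, s_next, actions, alpha=0.1, gamma=0.9):
    # One-step Q-learning backup:
    #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    best_next = max(q_table[(s_next, a2)] for a2 in actions)
    q_table[(s, a)] += alpha * (reward + gamma * best_next - q_table[(s, a)])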
Example #3
def MDP_command(cmd, param):
    global GAMMA, ALL_STATES, CLOSED
    global ACTIONS, NOISE, LIVING_REWARD, NGOALS, SILVER_PATH, N_disks
    global V_from_VI, Q_from_VI, V_from_QL, Q_from_QL, POLICY_from_VI, POLICY_from_QL
    global Agent_state, n_iterations, NEED_Q_LEARN_SETUP, LAST_REWARD, TERMINATED, Terminal_state
    global ALPHA, EPSILON, QUIET_MODE
    #print("In MDP_command, cmd = "+cmd+"; param = "+str(param))
    if cmd == "NDISKS":
        N_disks = param
        TowersOfHanoi.N_disks = param
        try:
            Vis.unhighlight(Agent_state)
        except Exception:
            pass  # Ignore errors from the visualization, e.g., if nothing is highlighted yet.
        set_up_state_space()
        return
    if cmd == "noise":
        NOISE = param
    if cmd == "ngoals":
        NGOALS = param
        if NGOALS == 2: SILVER_PATH = make_solution_path(path_type="silver")
        else: SILVER_PATH = []

    if cmd == "living_reward":
        LIVING_REWARD = param
    if cmd == "set_gamma":
        GAMMA = param
        update_qlearn_params()
        return

    if cmd == "show_values":
        if param == 1:
            Vis.display_values(V_from_VI)
            #for s in V_from_VI.keys(): Vis.reshow_state(s,V_from_VI[s])

        if param == 2:
            Vis.show_q_values(Q_from_VI, CLOSED)
            #Vis.reshow_all_q_values(Q_from_VI, CLOSED)

        if param == 3:
            compute_V_from_QL()
            Vis.display_values(V_from_QL)
            #for s in V_from_QL.keys(): Vis.reshow_state(s,V_from_QL[s])

        if param == 4:
            Vis.show_q_values(Q_from_QL, CLOSED)
            #Vis.reshow_all_q_values(Q_from_QL, CLOSED)
        return
    if cmd == "Value_Iteration":
        if param == 0:  # Reset VI state values to 0.
            n_iterations = 0
            initialize_V_from_VI(0)
            init_q_values(Q_from_VI)
            if Vis.DISPLAY_VALS_VAR.get() == 1:
                Vis.display_values(V_from_VI)
            elif Vis.DISPLAY_VALS_VAR.get() == 2:
                Vis.show_q_values(Q_from_VI, CLOSED)
            Vis.enable_value_iteration(True)
            Vis.enable_vi_action_menu_items(False)
            update_policy_displays(which="VI")
            return
        if param == 1:
            (V_from_VI, max_delta) = VI.one_step_of_VI(ALL_STATES, ACTIONS, T,
                                                       R, GAMMA,
                                                       V_from_VI.copy())
            n_iterations += 1
            print("After " + str(n_iterations) + " iterations, max_delta = " +
                  str(max_delta))
            Vis.enable_policy_extraction(True)
            Q_from_VI = VI.return_Q_values(CLOSED, ACTIONS)
            update_policy_displays(which="VI")
        if param > 1:
            for i in range(param):
                (V_from_VI,
                 max_delta) = VI.one_step_of_VI(ALL_STATES, ACTIONS, T, R,
                                                GAMMA, V_from_VI.copy())
                n_iterations += 1
                print("After " + str(n_iterations) +
                      " iterations, max_delta = " + str(max_delta))
                if max_delta < 0.00000001:
                    print("VI has converged after iteration " +
                          str(n_iterations) + ".")
                    break
            Vis.enable_policy_extraction(True)
            Q_from_VI = VI.return_Q_values(CLOSED, ACTIONS)
            update_policy_displays(which="VI")
        # Update the display of values or q-values, whichever is enabled currently.
        mode = Vis.DISPLAY_VALS_VAR.get()
        if mode == 1:
            for s in V_from_VI.keys():
                Vis.reshow_state(s, V_from_VI[s])
        if mode == 2:
            Vis.show_q_values(Q_from_VI, CLOSED)
            return
    if cmd == "Show_Policy_from_VI":  # THIS CMD SHOULD ACTUALLY BE UNNECESSARY NOW.
        update_policy_displays(which="VI")
    if cmd == "Show_Policy_from_QL":  # THIS CMD SHOULD ACTUALLY BE UNNECESSARY NOW.
        update_policy_displays(which="QL")
    if cmd == "Agent":
        if param == 0 or Agent_state == Terminal_state:
            Vis.unhighlight(Agent_state)
            Agent_turn(ACTIONS[0], reset=True)
            initialize_episode()
        elif param == 1:
            a = VI.apply_policy(Agent_state)
            Agent_turn(a)
        else:
            Vis.TK_Canvas.after(10, lambda: run_Agent(param))
    if cmd == "QLearn":
        init_Q_Learn_if_needed()
        if param == -1 or Agent_state == Terminal_state:  # Reset the agent to s0, ready for a new episode.
            Vis.unhighlight(Agent_state)
            Agent_turn(ACTIONS[0], reset=True)
            initialize_episode()
        elif param == -2:
            # Reset all state and Q values to 0.
            init_q_values(Q_from_QL, QL=True)
            initialize_V_from_QL(0)
            #Vis.reshow_all_q_values(Q_from_QL, CLOSED)
            if Vis.DISPLAY_VALS_VAR.get() == 3:
                compute_V_from_QL()
                Vis.display_values(V_from_QL)
                return
            if Vis.DISPLAY_VALS_VAR.get() == 4:
                Vis.show_q_values(Q_from_QL, CLOSED)
            update_policy_displays(which="QL")
            return
        elif param == 0:
            user_drives_agent_via_text_input()


        # elif param == 1:
        #     a = Q_Learn.choose_next_action(Agent_state, LAST_REWARD, TERMINATED)
        #     Agent_turn(a)
        #     increment_transition_count()
        elif param > 0:  # Perform up to n transitions of Q learning.
            for i in range(param):
                a = Q_Learn.choose_next_action(Agent_state, LAST_REWARD,
                                               TERMINATED)
                Agent_turn(a)
                if TERMINATED:
                    # Make one more call to the Q_Learn agent so it can do a q-update based
                    # on the reward in going from a goal state to the Terminal_state.
                    # The returned "action" a should be None, but probably does not matter.
                    a = Q_Learn.choose_next_action(Agent_state, LAST_REWARD,
                                                   TERMINATED)
                    print("Sent final reward for this episode: R=" +
                          str(LAST_REWARD))
                    print("Episode ended after transition " +
                          str(get_transition_count()))
                    increment_episode_count()
                    print(
                        str(get_episode_count()) +
                        " episodes so far in this Q-learning run.")
                    TERMINATED = False  # Make it easier to start the next set of transitions.
                    break
                increment_transition_count()
            update_policy_displays(which="QL")
        elif param == -1000:
            # Do 1000 transitions as quickly as possible, using as many episodes
            # as needed.
            train_quietly(1000)
            update_policy_displays(which="QL")
            return
    if cmd == "Exploration":
        if Vis.EXPL_VAR.get():
            init_q_values(Q_from_QL)
            mode = Vis.DISPLAY_VALS_VAR.get()
            if mode == 4:
                Vis.reshow_all_q_values(Q_from_QL)

            Q_Learn.setup(ALL_STATES,
                          ACTIONS,
                          Q_from_QL,
                          update_q_value,
                          is_valid_goal_state,
                          Terminal_state,
                          use_exp_fn=True)
            update_policy_displays(which="QL")
    if cmd == "alpha":
        if param == 1:
            ALPHA = 0.1
        elif param == 2:
            ALPHA = 0.2
        elif param == 3:
            ALPHA = -1
        update_qlearn_params()
        return
    if cmd == "epsilon":
        if param == 1:
            EPSILON = 0.1
        elif param == 2:
            EPSILON = 0.2
        elif param == 3:
            EPSILON = -1
        update_qlearn_params()
        return
    if cmd == "User_chose":
        init_Q_Learn_if_needed()
        a = param
        Agent_turn(a)
        increment_transition_count()
        Q_Learn.handle_transition(a, Agent_state, LAST_REWARD)
        update_policy_displays(which="QL")
    if cmd == "Get_Q_Values":
        # Needs updating to refer to one of the types of Q values.
        return (ALL_STATES, Q_VALUES)
    if cmd == "compare":
        #Compare_QLearn_to_VI.receive_globals(globals())
        #Q_from_VI = VI.return_Q_values(CLOSED, ACTIONS)
        compute_V_from_QL()
        Compare_QLearn_to_VI.full_compare()
    if cmd == "Run_script":
        script.run(globals())
        update_policy_displays(which="both")
    if cmd == "show_golden_path":
        Vis.show_golden_path()
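VI.one_step_of_VI, called under the "Value_Iteration" command above, lives in a separate module. A minimal sketch of the synchronous Bellman backup such a step presumably performs, assuming T(s, a, s') returns a transition probability and R(s, a, s') a reward (both signatures are assumptions, not confirmed by this listing):

def one_vi_sweep_sketch(states, actions, T, R, gamma, V_old):
    # One synchronous sweep of value iteration:
    #   V(s) <- max_a sum_s' T(s, a, s') * (R(s, a, s') + gamma * V_old(s'))
    # Returns the updated value table and the largest change, which the caller
    # can use as a convergence test (compare max_delta in the code above).
    V_new = {}
    max_delta = 0.0
    for s in states:
        V_new[s] = max(
            sum(T(s, a, sp) * (R(s, a, sp) + gamma * V_old[sp]) for sp in states)
            for a in actions)
        max_delta = max(max_delta, abs(V_new[s] - V_old[s]))
    return (V_new, max_delta)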