def run_Agent(param):
    '''Run the agent for several transitions, depending on the value of param.
    It uses the policy from VI.
    Return True if more turns can still be taken.'''
    global Agent_state, Terminal_state
    for i in range(param):
        if Agent_state == Terminal_state:
            print("Terminal state reached!")
            return False
        a = VI.apply_policy(Agent_state)
        Agent_turn(a)
    return True
def run_QL_agent(param, action=None):
    '''Return True if more turns can still be taken.'''
    global TERMINATED
    #print("In run_QL_agent, action = "+action)
    global Agent_state, Terminal_state
    for i in range(param):
        if Agent_state == Terminal_state:
            print("Terminal state reached!")
            TERMINATED = True
            return False
        if action:
            a = action
        else:
            a = VI.apply_policy(Agent_state)  # Should prob. use a different policy method.
        Agent_turn(a)
        #print("Need to perform a Q update here.")
    return True
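# Hedged sketch only: the "Need to perform a Q update here." note above refers to the
# standard tabular Q-learning backup,
#     Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)).
# Nothing below is part of the original file.  The helper name, the assumption that the
# Q table is a dict keyed by (state, action) pairs, and the explicit alpha/gamma
# arguments are all illustrative; the real update presumably lives in the Q_Learn module.
def sketch_q_update(q_table, s, a, r, s_prime, actions, alpha, gamma):
    '''Hypothetical helper: perform one Q-learning update of q_table in place.'''
    best_next = max(q_table[(s_prime, ap)] for ap in actions)
    q_table[(s, a)] += alpha * (r + gamma * best_next - q_table[(s, a)])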
def MDP_command(cmd, param):
    global GAMMA, ALL_STATES, CLOSED
    global ACTIONS, NOISE, LIVING_REWARD, NGOALS, SILVER_PATH, N_disks
    global V_from_VI, Q_from_VI, V_from_QL, Q_from_QL, POLICY_from_VI, POLICY_from_QL
    global Agent_state, n_iterations, NEED_Q_LEARN_SETUP, LAST_REWARD, TERMINATED, Terminal_state
    global ALPHA, EPSILON, QUIET_MODE
    #print("In MDP_command, cmd = "+cmd+"; param = "+str(param))
    if cmd == "NDISKS":
        N_disks = param
        TowersOfHanoi.N_disks = param
        try:
            Vis.unhighlight(Agent_state)
        except:
            pass
        set_up_state_space()
        return
    if cmd == "noise":
        NOISE = param
    if cmd == "ngoals":
        NGOALS = param
        if NGOALS == 2:
            SILVER_PATH = make_solution_path(path_type="silver")
        else:
            SILVER_PATH = []
    if cmd == "living_reward":
        LIVING_REWARD = param
    if cmd == "set_gamma":
        GAMMA = param
        update_qlearn_params()
        return
    if cmd == "show_values":
        if param == 1:
            Vis.display_values(V_from_VI)
            #for s in V_from_VI.keys(): Vis.reshow_state(s, V_from_VI[s])
        if param == 2:
            Vis.show_q_values(Q_from_VI, CLOSED)
            #Vis.reshow_all_q_values(Q_from_VI, CLOSED)
        if param == 3:
            compute_V_from_QL()
            Vis.display_values(V_from_QL)
            #for s in V_from_QL.keys(): Vis.reshow_state(s, V_from_QL[s])
        if param == 4:
            Vis.show_q_values(Q_from_QL, CLOSED)
            #Vis.reshow_all_q_values(Q_from_QL, CLOSED)
        return
    if cmd == "Value_Iteration":
        if param == 0:  # Reset VI state values to 0.
            n_iterations = 0
            initialize_V_from_VI(0)
            init_q_values(Q_from_VI)
            if Vis.DISPLAY_VALS_VAR.get() == 1:
                Vis.display_values(V_from_VI)
            elif Vis.DISPLAY_VALS_VAR.get() == 2:
                Vis.show_q_values(Q_from_VI, CLOSED)
            Vis.enable_value_iteration(True)
            Vis.enable_vi_action_menu_items(False)
            update_policy_displays(which="VI")
            return
        if param == 1:
            (V_from_VI, max_delta) = VI.one_step_of_VI(ALL_STATES, ACTIONS, T, R, GAMMA, V_from_VI.copy())
            n_iterations += 1
            print("After " + str(n_iterations) + " iterations, max_delta = " + str(max_delta))
            Vis.enable_policy_extraction(True)
            Q_from_VI = VI.return_Q_values(CLOSED, ACTIONS)
            update_policy_displays(which="VI")
        if param > 1:
            for i in range(param):
                (V_from_VI, max_delta) = VI.one_step_of_VI(ALL_STATES, ACTIONS, T, R, GAMMA, V_from_VI.copy())
                n_iterations += 1
                print("After " + str(n_iterations) + " iterations, max_delta = " + str(max_delta))
                if max_delta < 0.00000001:
                    print("VI has converged after iteration " + str(n_iterations) + ".")
                    break
            Vis.enable_policy_extraction(True)
            Q_from_VI = VI.return_Q_values(CLOSED, ACTIONS)
            update_policy_displays(which="VI")
        # Update the display of values or q-values, whichever is enabled currently.
        mode = Vis.DISPLAY_VALS_VAR.get()
        if mode == 1:
            for s in V_from_VI.keys():
                Vis.reshow_state(s, V_from_VI[s])
        if mode == 2:
            Vis.show_q_values(Q_from_VI, CLOSED)
        return
    if cmd == "Show_Policy_from_VI":  # THIS CMD SHOULD ACTUALLY BE UNNECESSARY NOW.
        update_policy_displays(which="VI")
    if cmd == "Show_Policy_from_QL":  # THIS CMD SHOULD ACTUALLY BE UNNECESSARY NOW.
        update_policy_displays(which="QL")
    if cmd == "Agent":
        if param == 0 or Agent_state == Terminal_state:
            Vis.unhighlight(Agent_state)
            Agent_turn(ACTIONS[0], reset=True)
            initialize_episode()
        elif param == 1:
            a = VI.apply_policy(Agent_state)
            Agent_turn(a)
        else:
            Vis.TK_Canvas.after(10, lambda: run_Agent(param))
    if cmd == "QLearn":
        init_Q_Learn_if_needed()
        if param == -1 or Agent_state == Terminal_state:
            # Reset the agent to s0, ready for a new episode.
            Vis.unhighlight(Agent_state)
            Agent_turn(ACTIONS[0], reset=True)
            initialize_episode()
        elif param == -2:  # Reset all state and Q values to 0.
            init_q_values(Q_from_QL, QL=True)
            initialize_V_from_QL(0)
            #Vis.reshow_all_q_values(Q_from_QL, CLOSED)
            if Vis.DISPLAY_VALS_VAR.get() == 3:
                compute_V_from_QL()
                Vis.display_values(V_from_QL)
                return
            if Vis.DISPLAY_VALS_VAR.get() == 4:
                Vis.show_q_values(Q_from_QL, CLOSED)
            update_policy_displays(which="QL")
            return
        elif param == 0:
            user_drives_agent_via_text_input()
        # elif param == 1:
        #     a = Q_Learn.choose_next_action(Agent_state, LAST_REWARD, TERMINATED)
        #     Agent_turn(a)
        #     increment_transition_count()
        elif param > 0:
            # Perform up to n transitions of Q learning.
            for i in range(param):
                a = Q_Learn.choose_next_action(Agent_state, LAST_REWARD, TERMINATED)
                Agent_turn(a)
                if TERMINATED:
                    # Make one more call to the Q_Learn agent so it can do a q-update based
                    # on the reward in going from a goal state to the Terminal_state.
                    # The returned "action" a should be None, but probably does not matter.
                    a = Q_Learn.choose_next_action(Agent_state, LAST_REWARD, TERMINATED)
                    print("Sent final reward for this episode: R=" + str(LAST_REWARD))
                    print("Episode ended after transition " + str(get_transition_count()))
                    increment_episode_count()
                    print(str(get_episode_count()) + " episodes so far in this Q-learning run.")
                    TERMINATED = False  # Make it easier to start the next set of transitions.
                    break
                increment_transition_count()
            update_policy_displays(which="QL")
        elif param == -1000:
            # Do 1000 transitions as quickly as possible, using as many episodes as needed.
            train_quietly(1000)
            update_policy_displays(which="QL")
        return
    if cmd == "Exploration":
        if Vis.EXPL_VAR.get():
            init_q_values(Q_from_QL)
            mode = Vis.DISPLAY_VALS_VAR.get()
            if mode == 4:
                Vis.reshow_all_q_values(Q_from_QL)
            Q_Learn.setup(ALL_STATES, ACTIONS, Q_from_QL, update_q_value,
                          is_valid_goal_state, Terminal_state, use_exp_fn=True)
            update_policy_displays(which="QL")
    if cmd == "alpha":
        if param == 1:
            ALPHA = 0.1
        elif param == 2:
            ALPHA = 0.2
        elif param == 3:
            ALPHA = -1
        update_qlearn_params()
        return
    if cmd == "epsilon":
        if param == 1:
            EPSILON = 0.1
        elif param == 2:
            EPSILON = 0.2
        elif param == 3:
            EPSILON = -1
        update_qlearn_params()
        return
    if cmd == "User_chose":
        init_Q_Learn_if_needed()
        a = param
        Agent_turn(a)
        increment_transition_count()
        Q_Learn.handle_transition(a, Agent_state, LAST_REWARD)
        update_policy_displays(which="QL")
    if cmd == "Get_Q_Values":
        return (ALL_STATES, Q_VALUES)  # Needs updating to refer to one of the types of Q values.
    if cmd == "compare":
        #Compare_QLearn_to_VI.receive_globals(globals())
        #Q_from_VI = VI.return_Q_values(CLOSED, ACTIONS)
        compute_V_from_QL()
        Compare_QLearn_to_VI.full_compare()
    if cmd == "Run_script":
        script.run(globals())
        update_policy_displays(which="both")
    if cmd == "show_golden_path":
        Vis.show_golden_path()
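# Hedged usage sketch (not part of the original file): how a driver or menu layer might
# call MDP_command to run a small Value Iteration experiment.  The command strings and
# the meaning of each param come from the branches above; the function name and the
# specific parameter values are assumptions for illustration only.
def demo_value_iteration_run(n_disks=2, n_sweeps=20, n_agent_turns=10):
    '''Hypothetical driver: set up the puzzle, run VI until convergence or n_sweeps,
    then let the agent follow the extracted policy.'''
    MDP_command("NDISKS", n_disks)            # rebuild the state space for n_disks disks
    MDP_command("set_gamma", 0.9)             # discount factor
    MDP_command("Value_Iteration", 0)         # reset V and Q values to 0
    MDP_command("Value_Iteration", n_sweeps)  # param > 1: iterate, stopping early on convergence
    MDP_command("Agent", 0)                   # reset the agent to the start state
    MDP_command("Agent", n_agent_turns)       # schedule run_Agent to follow the VI policy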