def test_detach_inconsistent_states(abstr_type):
    mdp = GridWorldMDP()
    abstr_mdp = mdp.make_abstr_mdp(abstr_type)
    agent = AbstractionAgent(mdp, s_a=abstr_mdp.state_abstr)
    for i in range(1000000):
        if i % 1000 == 0:
            print('On step', i)
        agent.explore()
    error_states = agent.detach_inconsistent_states(verbose=True)
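# Hedged usage sketch (added, not part of the original tests): run the detach-inconsistent-states
# test once per abstraction type. Abstr_type.Q_STAR / A_STAR / PI_STAR are the types used elsewhere
# in this repo; the helper name below is hypothetical.
def run_detach_test_for_all_types():
    for abstr_type in (Abstr_type.Q_STAR, Abstr_type.A_STAR, Abstr_type.PI_STAR):
        print('Detaching inconsistent states for', abstr_type)
        test_detach_inconsistent_states(abstr_type)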
def test_get_ground_states_from_abstact_state():
    mdp = GridWorldMDP()
    abstr_mdp = mdp.make_abstr_mdp(Abstr_type.PI_STAR)
    agent = AbstractionAgent(mdp, s_a=abstr_mdp.state_abstr)
    for value in agent.s_a.abstr_dict.values():
        print(value, end=' ')
        ground_states = agent.get_ground_states_from_abstract_state(value)
        for state in ground_states:
            print(state, end=' ')
        print()
def test_check_abstract_state_consistency():
    mdp = GridWorldMDP()
    abstr_mdp = mdp.make_abstr_mdp(Abstr_type.A_STAR)
    agent = AbstractionAgent(mdp, s_a=abstr_mdp.state_abstr)
    for i in range(100000):
        if i % 1000 == 0:
            print('On step', i)
        agent.explore()

    # Get all abstract states
    abstr_states = []
    for value in agent.s_a.abstr_dict.values():
        abstr_states.append(value)
    abstr_states = agent.get_abstract_states()
    for abstr_state in abstr_states:
        agent.check_abstract_state_consistency(abstr_state, verbose=True)
def test_rollout_adjustment(key):
    """
    Train the agent on a state abstraction with fatal errors. Then generate a roll-out, detach the first state
    that's part of a cycle, and restart learning.
    """
    # Load a poorly-performing abstraction
    names = ['AbstrType', 'AbstrEps', 'CorrType', 'CorrProp', 'Batch', 'Dict']
    df = pd.read_csv('../abstr_exp/corrupted/corrupted_abstractions.csv', names=names)
    abstr_string = df.loc[(df['AbstrType'] == str(key[0]))
                          & (df['AbstrEps'] == key[1])
                          & (df['CorrType'] == str(key[2]))
                          & (df['CorrProp'] == key[3])
                          & (df['Batch'] == key[4])]['Dict'].values[0]
    abstr_list = ast.literal_eval(abstr_string)
    abstr_dict = {}
    for el in abstr_list:
        is_term = el[0][0] == 11 and el[0][1] == 11
        state = GridWorldState(el[0][0], el[0][1], is_terminal=is_term)
        abstr_dict[state] = el[1]

    # Create an agent with this abstraction, plus a copy for the adjusted run below
    s_a = StateAbstraction(abstr_dict, abstr_type=Abstr_type.PI_STAR)
    mdp = GridWorldMDP()
    agent = AbstractionAgent(mdp, s_a=s_a)
    agent2 = copy.deepcopy(agent)

    # Generate roll-outs from the unadjusted agent after 5,000 and 10,000 steps
    for i in range(5000):
        agent.explore()
    rollout = agent.generate_rollout()
    print('Roll-out for model with no adjustment, 5,000 steps')
    for state in rollout:
        print(state, end=', ')
    for i in range(5000):
        agent.explore()
    rollout = agent.generate_rollout()
    print('Roll-out for model with no adjustment, 10,000 steps')
    for state in rollout:
        print(state, end=', ')
    print('\n')

    # Train the second agent for 5000 steps, detach the first state in the cycle, and train for another 5000 steps.
    # The hope is that this will get further than the unadjusted 10,000-step run
    for i in range(5000):
        agent2.explore()
    rollout = agent2.generate_rollout()
    print('Roll-out for model pre-adjustment, 5,000 steps')
    for state in rollout:
        print(state, end=', ')
    print()
    print('Detaching state', rollout[-1])
    agent2.detach_state(rollout[-1])
    for i in range(5000):
        agent2.explore()
    rollout = agent2.generate_rollout()
    print('Roll-out for model post-adjustment, 10,000 steps')
    for state in rollout:
        print(state, end=', ')
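# Hedged usage sketch (added): the 'key' argument indexes a row of
# ../abstr_exp/corrupted/corrupted_abstractions.csv by (AbstrType, AbstrEps, CorrType, CorrProp, Batch).
# The concrete values below are hypothetical placeholders, not keys known to exist in that file,
# so the call is left commented out.
#example_key = (Abstr_type.PI_STAR, 0.0, 'uniform', 0.1, 0)
#test_rollout_adjustment(example_key)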
def test_gridworld():
    mdp = GridWorldMDP()
    mdp_str = 'rooms'
    #mdp = TaxiMDP()
    #mdp_str = 'taxi'
    eps = 0.0
    abstr_epsilon_list = [(Abstr_type.Q_STAR, eps), (Abstr_type.A_STAR, eps), (Abstr_type.PI_STAR, eps)]
    #abstr_epsilon_list = [(Abstr_type.A_STAR, eps), (Abstr_type.PI_STAR, eps)]
    exp = Experiment(mdp, num_agents=20, abstr_epsilon_list=abstr_epsilon_list)

    # Testing that one agent in an ensemble acting on its MDP won't affect another agent
    '''
    print(exp)
    for agent in exp.agents['ground']:
        print(agent.mdp)
    print()
    for i in range(100):
        exp.agents['ground'][0].explore()
    for agent in exp.agents['ground']:
        print(agent.mdp)
    '''

    # Testing run_trajectory
    '''
    for i in range(20):
        actual, optimal = exp.run_trajectory(exp.agents['ground'][0])
        print(actual, optimal)
    print('\n\n\n')
    for i in range(20):
        actual, optimal = exp.run_trajectory(exp.agents[(Abstr_type.PI_STAR, 0.0)][0])
        print(actual, optimal)
    '''

    # Testing run_ensemble
    #print(exp.run_ensemble(exp.agents[(Abstr_type.Q_STAR, 0.0)]))

    # Testing writing to file
    data, steps = exp.run_all_ensembles(num_episodes=500)

    # Testing plotting results
    exp.visualize_results(data, 'results/exp_graph_' + mdp_str + '_' + str(eps) + '.png')
    exp.visualize_results(steps, 'results/step_counts_' + mdp_str + '_' + str(eps) + '.png')
def test_check_for_optimal_action_and_value(states, num_steps):
    """
    Create a list of actions generated by following the greedy policy, starting at the given state
    """
    mdp = GridWorldMDP()
    abstr_mdp = mdp.make_abstr_mdp(Abstr_type.Q_STAR)
    agent = AbstractionAgent(mdp, s_a=abstr_mdp.state_abstr)
    for i in range(100000):
        if i % 1000 == 0:
            print('On step', i)
        agent.explore()
    # print(agent.get_learned_policy_as_string())
    policy = agent.get_learned_policy()
    #for key, value in agent.get_learned_policy_as_string().items():
    #    print(key, value, agent.get_q_value(key[0], key[1]))
    for s in agent.mdp.get_all_possible_states():
        #for a in agent.mdp.actions:
        print(s, agent.get_best_action_value(s))
    for state in states:
        mdp_state = GridWorldState(state[0], state[1])
        action, value = agent.check_for_optimal_action_value_next_state(mdp_state, verbose=True)
        print()
def test_fourrooms(abstr_type, noise=0.0):
    """
    Test the corruption of an abstraction of the given type in FourRooms
    :param abstr_type: the type of abstraction to be tested
    :param noise: the proportion of states to be scrambled
    :return:
    """
    # Make a grid world MDP and create an abstraction of the given type from it
    mdp = GridWorldMDP()
    vi = ValueIteration(mdp)
    vi.run_value_iteration()
    q_table = vi.get_q_table()
    true_abstr = make_abstr(q_table, abstr_type=abstr_type)

    # Corrupt the true abstraction with the given noise proportion
    corrupt_results = uniform_random(true_abstr, proportion=noise)

    # Print every state whose abstract mapping changed
    true_dict = true_abstr.get_abstr_dict()
    corrupt_dict = corrupt_results.get_abstr_dict()
    for key in true_dict.keys():
        if true_dict[key] != corrupt_dict[key]:
            print(key, true_dict[key], corrupt_dict[key])
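# Hedged usage sketch (added): exercise the corruption check above across the three abstraction
# types at a single noise level. The 0.1 proportion is only an example value; the helper name is
# hypothetical.
def run_fourrooms_corruption_checks(noise=0.1):
    for abstr_type in (Abstr_type.Q_STAR, Abstr_type.A_STAR, Abstr_type.PI_STAR):
        print('Corruption check for', abstr_type, 'with noise', noise)
        test_fourrooms(abstr_type, noise=noise)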
def main():
    # Testing what a Q* abstraction looks like in
    # four rooms

    # Make MDP and train an agent in it
    grid_mdp = GridWorldMDP(height=9, width=9, slip_prob=0.0, gamma=0.99)
    agent = Agent(grid_mdp)

    # Train the agent for 100,000 steps
    trajectory = []
    for i in range(100000):
        if i % 1000 == 0:
            print("epsilon, alpha:", agent._epsilon, agent._alpha)
        current_state, action, next_state, _ = agent.explore()
        trajectory.append(current_state)
    already_printed = []
    for state in trajectory:
        if state not in already_printed:
            already_printed.append(state)

    # Print the action values learned at each state
    for state in already_printed:
        print("values learned at state", state)
        print_action_values(agent.get_action_values(state))
        print()

    # Make an abstraction from the agent's q-table
    state_abstr = make_abstr(agent.get_q_table(), Abstr_type.Q_STAR, epsilon=0.05)
    print(state_abstr)

    # Testing that Pi* abstraction works
    '''
    # Create toy q_table to build abstraction from
    q_table = {(GridWorldState(1,1), Dir.UP): 0.9,
               (GridWorldState(1,1), Dir.DOWN): 0.8,
               (GridWorldState(1,1), Dir.LEFT): 0.7,
               (GridWorldState(1,1), Dir.RIGHT): 0.6,
               # Same optimal action and action value as (1,1)
               (GridWorldState(1,2), Dir.UP): 0.9,
               (GridWorldState(1,2), Dir.DOWN): 0.0,
               (GridWorldState(1,2), Dir.LEFT): 0.2,
               (GridWorldState(1,2), Dir.RIGHT): 0.5,
               # val(UP) = 0.9 but val(DOWN) = 0.91
               (GridWorldState(2,2), Dir.UP): 0.9,
               (GridWorldState(2,2), Dir.DOWN): 0.91,
               (GridWorldState(2,2), Dir.LEFT): 0.8,
               (GridWorldState(2,2), Dir.RIGHT): 0.9,
               # val(UP) = 0.89, max val
               (GridWorldState(2,1), Dir.UP): 0.9,
               (GridWorldState(2,1), Dir.DOWN): 0.9,
               (GridWorldState(2,1), Dir.LEFT): 0.90000000001,
               (GridWorldState(2,1), Dir.RIGHT): 0.7,
               # val(UP) = 0.93, max val
               (GridWorldState(3,1), Dir.UP): 1000,
               (GridWorldState(3,1), Dir.DOWN): 0.89,
               (GridWorldState(3,1), Dir.LEFT): 0.89,
               (GridWorldState(3,1), Dir.RIGHT): 0.89}
    state_abstr = make_abstr(q_table, Abstr_type.PI_STAR)
    print("(1,1), (1,2), and (3,1) should all get mapped together")
    print(state_abstr)
    '''

    # Testing that A* abstraction works
    '''
    # Create toy q_table to build abstraction from
    # Optimal action/val is UP/0.9
    q_table = {(GridWorldState(1,1), Dir.UP): 0.9,
               (GridWorldState(1,1), Dir.DOWN): 0.8,
               (GridWorldState(1,1), Dir.LEFT): 0.7,
               (GridWorldState(1,1), Dir.RIGHT): 0.6,
               # Same optimal action and action value as (1,1)
               (GridWorldState(1,2), Dir.UP): 0.9,
               (GridWorldState(1,2), Dir.DOWN): 0.0,
               (GridWorldState(1,2), Dir.LEFT): 0.2,
               (GridWorldState(1,2), Dir.RIGHT): 0.5,
               # val(UP) = 0.9 but val(DOWN) = 0.91
               (GridWorldState(2,2), Dir.UP): 0.9,
               (GridWorldState(2,2), Dir.DOWN): 0.91,
               (GridWorldState(2,2), Dir.LEFT): 0.8,
               (GridWorldState(2,2), Dir.RIGHT): 0.9,
               # val(UP) = 0.89, max val
               (GridWorldState(2,1), Dir.UP): 0.89,
               (GridWorldState(2,1), Dir.DOWN): 0.88,
               (GridWorldState(2,1), Dir.LEFT): 0.8,
               (GridWorldState(2,1), Dir.RIGHT): 0.7,
               # val(UP) = 0.93, max val
               (GridWorldState(3,1), Dir.UP): 0.93,
               (GridWorldState(3,1), Dir.DOWN): 0.89,
               (GridWorldState(3,1), Dir.LEFT): 0.89,
               (GridWorldState(3,1), Dir.RIGHT): 0.89}
    state_abstr = make_abstr(q_table, Abstr_type.A_STAR)
    print("Epsilon = 0. (1,1) and (1,2) should be mapped together")
    print(state_abstr)

    state_abstr = make_abstr(q_table, Abstr_type.A_STAR, epsilon=0.015)
    print("Epsilon = 0.015. (1,1), (1,2), and (2,1) should all be mapped together")
    print(state_abstr)

    state_abstr = make_abstr(q_table, Abstr_type.A_STAR, epsilon=0.031)
    print("Epsilon = 0.031. (1,1), (1,2), (2,1), (3,1) should all be mapped together")
    print(state_abstr)
    '''

    # Testing that Q* abstraction function works
    '''
    # Create toy q_table to build the abstraction from
    q_table = {(GridWorldState(1,1), Dir.UP): 1.0,
               (GridWorldState(1,1), Dir.DOWN): 2.5,
               (GridWorldState(1,1), Dir.LEFT): 2.3,
               (GridWorldState(1,1), Dir.RIGHT): 5.0,
               (GridWorldState(2,1), Dir.UP): 1.0,
               (GridWorldState(2,1), Dir.DOWN): 2.5,
               (GridWorldState(2,1), Dir.LEFT): 2.3,
               (GridWorldState(2,1), Dir.RIGHT): 5.05,
               (GridWorldState(2,2), Dir.UP): 1.1,
               (GridWorldState(2,2), Dir.DOWN): 2.4,
               (GridWorldState(2,2), Dir.LEFT): 2.3,
               (GridWorldState(2,2), Dir.RIGHT): 4.8,
               (GridWorldState(1,2), Dir.UP): 1.3,
               (GridWorldState(1,2), Dir.DOWN): 2.0,
               (GridWorldState(1,2), Dir.LEFT): 2.0,
               (GridWorldState(1,2), Dir.RIGHT): 4.8
               }
    state_abstr = make_abstr(q_table, Abstr_type.Q_STAR)
    print("Epsilon = 0. No shapes should be mapped together.")
    print(str(state_abstr))

    state_abstr = make_abstr(q_table, Abstr_type.Q_STAR, epsilon=0.3)
    print("Epsilon = 0.3. (1,1), (2,1), (2,2) should all be mapped together")
    print(str(state_abstr))

    state_abstr = make_abstr(q_table, Abstr_type.Q_STAR, epsilon=0.1)
    print("Epsilon = 0.1. (1,1), (2,1) should be mapped together. (2,2) should not.")
    print(str(state_abstr))

    state_abstr = make_abstr(q_table, Abstr_type.Q_STAR, epsilon=0.5)
    print("Epsilon = 0.5. (1,1), (2,1), (1,2), (2,2) should all be mapped together")
    print(str(state_abstr))
    '''

    # Testing Q-learning in abstract Four Rooms
    '''
    # Map all the states in the bottom-right room to the same abstract state
    abstr_dict = {}
    for i in range(6,12):
        for j in range(1,6):
            abstr_dict[GridWorldState(i,j)] = 'oneroom'
    state_abstr = StateAbstraction(abstr_dict)
    abstr_mdp = AbstractGridWorldMDP(height=11,
                                     width=11,
                                     slip_prob=0.0,
                                     gamma=0.95,
                                     build_walls=True,
                                     state_abstr=state_abstr)
    agent = Agent(abstr_mdp)
    trajectory = []
    for i in range(100000):
        #print("At step", i)
        #print("parameters are", agent._alpha, agent.mdp.gamma)
        current_state, action, next_state, _ = agent.explore()
        #print("At", str(current_state), "took action", action, "got to", str(next_state))
        #print("Values learned for", str(current_state), "is")
        #print_action_values(agent.get_action_values(current_state))
        trajectory.append(current_state)
        #print()
    already_printed = []
    for state in trajectory:
        if state not in already_printed:
            print("values learned at state", state)
            print_action_values(agent.get_action_values(state))
            already_printed.append(state)
    agent.reset_to_init()
    for i in range(25):
        current_state, action, next_state = agent.apply_best_action()
        print('At', str(current_state), 'taking action', str(action), 'now at', str(next_state))
    '''

    # Testing Q-learning in toy abstract MDP
    '''
    # Simple abstraction in a grid where all states above the start-to-goal
    # diagonal are grouped together and all states below that diagonal
    # are grouped together
    toy_abstr = StateAbstraction({GridWorldState(2,1): 'up',
                                  GridWorldState(3,1): 'up',
                                  GridWorldState(3,2): 'up',
                                  GridWorldState(4,1): 'up',
                                  GridWorldState(4,2): 'up',
                                  GridWorldState(4,3): 'up',
                                  GridWorldState(5,1): 'up',
                                  GridWorldState(5,2): 'up',
                                  GridWorldState(5,3): 'up',
                                  GridWorldState(5,4): 'up',
                                  GridWorldState(1,2): 'right',
                                  GridWorldState(1,3): 'right',
                                  GridWorldState(1,4): 'right',
                                  GridWorldState(1,5): 'right',
                                  GridWorldState(2,3): 'right',
                                  GridWorldState(2,4): 'right',
                                  GridWorldState(2,5): 'right',
                                  GridWorldState(3,4): 'right',
                                  GridWorldState(3,5): 'right',
                                  GridWorldState(4,5): 'right'})
    #print("states covered by abstraction are", toy_abstr.abstr_dict.keys())
    abstr_mdp = AbstractGridWorldMDP(height=5,
                                     width=5,
                                     slip_prob=0.0,
                                     gamma=0.95,
                                     build_walls=False,
                                     state_abstr=toy_abstr)
    #print(abstr_mdp.state_abstr.get_abstr_from_ground(GridWorldState(1,1)))
    agent = Agent(abstr_mdp)
    trajectory = []
    for i in range(10000):
        #print("At step", i)
        #print("parameters are", agent._alpha, agent.mdp.gamma)
        current_state, action, next_state, _ = agent.explore()
        #print("At", str(current_state), "took action", action, "got to", str(next_state))
        #print("Values learned for", str(current_state), "is")
        #print_action_values(agent.get_action_values(current_state))
        trajectory.append(current_state)
        #print()
    already_printed = []
    for state in trajectory:
        if state not in already_printed:
            print("values learned at state", state)
            print_action_values(agent.get_action_values(state))
            already_printed.append(state)
    '''

    # Testing both epsilon-greedy and application of best learned
    # policy in ground MDP
    '''
    grid_mdp = GridWorldMDP(height=9, width=9, slip_prob=0.0, gamma=0.95, build_walls=True)
    agent = Agent(grid_mdp)
    #agent.set_current_state(GridWorldState(1,1))
    print(grid_mdp.goal_location)

    # Testing if epsilon-greedy policy works properly
    trajectory = []
    for i in range(10000):
        #print("At step", i)
        #print("parameters are", agent._alpha, agent.mdp.gamma)
        current_state, action, next_state, _ = agent.explore()
        #print("At", str(current_state), "took action", action, "got to", str(next_state))
        #print("Values learned for", str(current_state), "is")
        #print_action_values(agent.get_action_values(current_state))
        trajectory.append(current_state)
        #print()
    #print("Went through the following states:")
    #for state in trajectory:
    #    print(str(state))
    already_printed = []
    for state in trajectory:
        if state not in already_printed:
            print("values learned at state", state)
            print_action_values(agent.get_action_values(state))
            already_printed.append(state)
    #print(grid_mdp.walls)
    agent.reset_to_init()
    for i in range(25):
        current_state, action, next_state = agent.apply_best_action()
        print('At', str(current_state), 'taking action', str(action), 'now at', str(next_state))
    '''

    # Testing a few trajectories to make sure the q-table updates
    # properly
    '''
    test_trajectory = [Dir.UP, Dir.RIGHT, Dir.UP, Dir.RIGHT]
    for i in range(5):
        apply_trajectory(agent, test_trajectory)
    agent.set_current_state(GridWorldState(9,9))
    test_trajectory = [Dir.RIGHT, Dir.RIGHT, Dir.UP, Dir.UP]
    apply_trajectory(agent, test_trajectory)
    agent.set_current_state(GridWorldState(9,9))
    test_trajectory = [Dir.UP, Dir.UP, Dir.RIGHT, Dir.RIGHT]
    apply_trajectory(agent, test_trajectory)
    '''

    # Testing motion, reward at goal state, and reset to
    # initial state at terminal state
    '''
    agent = Agent(grid_mdp, go_up_right)
    for i in range(30):
        agent.act()
    print(grid_mdp.walls)
    '''

    # Testing getter for best action/value given state
    '''
    agent = Agent(grid_mdp, go_right, alpha=0.5)
    current_state = agent.get_current_state()
    test_action = Dir.UP
    # Set q_value for init_state, Dir.UP = 1.0
    agent._set_q_value(current_state, test_action, 1.0)
    # Should give Dir.UP, 1.0
    print("should give (Dir.UP, 1.0)", agent.get_best_action_value_pair(current_state))
    # Go right by one
    agent.act()
    print("Currently at", agent.get_current_state())
    # Should give random action with value = 0
    print("Should give (random_action, 0.0)", agent.get_best_action_value_pair(agent.get_current_state()))
    # Update q-values of this state
    agent._set_q_value(agent.get_current_state(), Dir.UP, -1.0)
    agent._set_q_value(agent.get_current_state(), Dir.DOWN, -1.0)
    agent._set_q_value(agent.get_current_state(), Dir.LEFT, -1.0)
    agent._set_q_value(agent.get_current_state(), Dir.RIGHT, 0.1)
    # Should give Dir.RIGHT, 0.1
    print("Should give (Dir.RIGHT, 0.1)", agent.get_best_action_value_pair(agent.get_current_state()))
    print()
    # Checking that all values were updated properly
    for action in agent.mdp.actions:
        print("action:q-value = ", action, ":", agent.get_q_value(agent.get_current_state(), action))
    '''

    # Testing single instance of the act, update flow
    # Start agent at (10,11), go one right, get reward,
    # check that update happened
    '''
from GridWorld.GridWorldMDPClass import GridWorldMDP
from MDP.StateAbstractionClass import StateAbstraction
from MDP.AbstractMDPClass import AbstractMDP
from MDP.ValueIterationClass import ValueIteration
from resources.AbstractionTypes import Abstr_type
from resources.AbstractionCorrupters import make_corruption
from resources.AbstractionMakers import make_abstr
import numpy as np

# Number of states to corrupt
STATE_NUM = 20

# Create abstract MDP
mdp = GridWorldMDP()
vi = ValueIteration(mdp)
vi.run_value_iteration()
q_table = vi.get_q_table()
state_abstr = make_abstr(q_table, Abstr_type.PI_STAR)
abstr_mdp = AbstractMDP(mdp, state_abstr)

# Randomly select our list of states and print them out
states_to_corrupt = np.random.choice(mdp.get_all_possible_states(), size=STATE_NUM, replace=False)
for state in states_to_corrupt:
    print(state)

# Create a corrupt MDP
corr_mdp = make_corruption(abstr_mdp, states_to_corrupt)
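# Hedged follow-up sketch (added): print which ground states were remapped in the corrupted MDP.
# This assumes make_corruption returns an abstract MDP exposing its abstraction as
# corr_mdp.state_abstr with the get_abstr_dict() accessor used elsewhere in this repo; left
# commented out since that interface is an assumption.
#true_dict = state_abstr.get_abstr_dict()
#corrupt_dict = corr_mdp.state_abstr.get_abstr_dict()
#for ground_state in true_dict:
#    if true_dict[ground_state] != corrupt_dict.get(ground_state):
#        print(ground_state, true_dict[ground_state], corrupt_dict.get(ground_state))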
def test_agent_abstraction():
    # Make agent and MDP
    mdp = GridWorldMDP()
    agent = Agent(mdp)

    # Run for EP_COUNT episodes
    while agent._episode_counter < EP_COUNT:
        agent.explore()

    # Make a new abstraction based on the learned q-table
    num_abstr_states, num_reduced_ground_states = agent.make_abstraction(
        Abstr_type.PI_STAR, epsilon=THRESHOLD, ignore_zeroes=IGNORE_ZEROES)
    agent._epsilon = agent._epsilon / PARAM_CUT

    # Count the number of abstract states so we can see how much this is changing
    key_count = 0
    ground_key_count = 0
    for key in agent._q_table.keys():
        key_count += 1
        #if isinstance(key[0], TaxiState):
        if isinstance(key[0], GridWorldState):
            ground_key_count += 1
    #print(key_count - ground_key_count)

    # Print the state abstraction so we can see what's up
    #for key, value in agent.mdp.state_abstr.abstr_dict.items():
    #    print(key, value)
    #for key, value in agent._q_table.items():
    #    print(key[0], key[1], value)

    # Run for an episode to see what the reward is
    curr_state = agent.get_current_state()
    cumu_reward = 0
    discount = 1
    while not curr_state.is_terminal():
        state, action, next_state, reward = agent.explore()
        cumu_reward += reward * discount
        discount *= agent.mdp.gamma
        curr_state = next_state
    #print("Test", cumu_reward, end=' ')

    # Control agent, no abstraction performed
    mdp_control = GridWorldMDP()
    agent_control = Agent(mdp_control)

    # Train the control agent for the same number of episodes
    curr_state = agent_control.mdp.get_current_state()
    while agent_control._episode_counter < EP_COUNT:
        agent_control.explore()

    # Run control agent for an episode to see what the reward is
    curr_state = agent_control.get_current_state()
    control_cumu_reward = 0
    discount = 1
    while not curr_state.is_terminal():
        state, action, next_state, reward = agent_control.explore()
        control_cumu_reward += reward * discount
        discount *= agent_control.mdp.gamma
        curr_state = next_state
    #print("Control", cumu_reward)
    #print("Delta", cumu_reward - control_cumu_reward)

    return cumu_reward, control_cumu_reward, key_count - ground_key_count, num_reduced_ground_states - num_abstr_states
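# Hedged usage sketch (added): average the test-episode returns over several independent runs of
# test_agent_abstraction(). The helper name and num_runs default are hypothetical; EP_COUNT,
# THRESHOLD, IGNORE_ZEROES and PARAM_CUT are assumed to be defined elsewhere in this script, since
# the function above already references them.
def average_agent_abstraction_results(num_runs=5):
    abstr_rewards, control_rewards = [], []
    for _ in range(num_runs):
        cumu_reward, control_cumu_reward, _, _ = test_agent_abstraction()
        abstr_rewards.append(cumu_reward)
        control_rewards.append(control_cumu_reward)
    print('Mean abstraction-agent return:', sum(abstr_rewards) / num_runs)
    print('Mean control-agent return:', sum(control_rewards) / num_runs)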
test_udm(mdp, Abstr_type.Q_STAR, EPISODE_COUNT, error_dict=mild_error_1)
quit()
'''

# A-star with mild error 1
'''
test_udm(mdp, Abstr_type.A_STAR, EPISODE_COUNT, error_dict=mild_error_1)
'''

# Pi-star with mild error 1
'''
test_udm(mdp, Abstr_type.PI_STAR, EPISODE_COUNT, error_dict=mild_error_1)
'''

# Large MDP
mdp = GridWorldMDP(goal_location=[(7, 11)])

# Large bad error
large_bad_error = {GridWorldState(2, 1): GridWorldState(8, 11),
                   GridWorldState(1, 2): GridWorldState(7, 10)}

# Test Q-Star with large MDP, bad error
test_udm(mdp, Abstr_type.Q_STAR, EPISODE_COUNT, error_dict=large_bad_error)
"""

# Run the split test 5 times on a true Q-star abstraction to see how much it splits
for i in range(NUM_TESTS):
    mdp = TwoRoomsMDP(lower_width=3, lower_height=3, upper_width=3,
def iterate_detachment(mdp_key, batch_size=5000):
    """
    Load an incorrect abstraction. Train the model, generate a roll-out, detach the first cycle state. Repeat until
    the roll-out achieves a terminal state. Save the adjusted abstraction and learned policy. Visualize the original
    incorrect abstraction with roll-outs from original agents and the adjusted abstraction with a roll-out from the
    new agent
    :param mdp_key: key for incorrect (poorly performing) abstraction
    :param batch_size: Number of steps to train between state detachments
    """
    # Load a poorly-performing abstraction
    names = ['AbstrType', 'AbstrEps', 'CorrType', 'CorrProp', 'Batch', 'Dict']
    df = pd.read_csv('../abstr_exp/corrupted/corrupted_abstractions.csv', names=names)
    abstr_string = df.loc[(df['AbstrType'] == str(mdp_key[0]))
                          & (df['AbstrEps'] == mdp_key[1])
                          & (df['CorrType'] == str(mdp_key[2]))
                          & (df['CorrProp'] == mdp_key[3])
                          & (df['Batch'] == mdp_key[4])]['Dict'].values[0]
    abstr_list = ast.literal_eval(abstr_string)
    abstr_dict = {}
    for el in abstr_list:
        is_term = el[0][0] == 11 and el[0][1] == 11
        state = GridWorldState(el[0][0], el[0][1], is_terminal=is_term)
        abstr_dict[state] = el[1]

    # Create an agent with this abstraction
    s_a = StateAbstraction(abstr_dict, abstr_type=Abstr_type.PI_STAR)
    mdp = GridWorldMDP()
    agent = AbstractionAgent(mdp, s_a=s_a)

    # Generate a roll-out from untrained model (should be random and short)
    rollout = agent.generate_rollout()
    print('Roll-out from untrained model')
    for state in rollout:
        print(state, end=', ')
    print()

    # Until roll-out leads to terminal state, explore and detach last state of roll-out. Record each of the detached
    # states so they can be visualized later
    detached_states = []
    step_counter = 0
    while not rollout[-1].is_terminal():
        for i in range(batch_size):
            agent.explore()
        step_counter += batch_size
        rollout = agent.generate_rollout()
        print('Roll-out after', step_counter, 'steps')
        for state in rollout:
            print(state, end=', ')
        print()
        print('State Q-value pre-detach:')
        for action in agent.mdp.actions:
            print(rollout[-1], action, agent.get_q_value(rollout[-1], action))
        detach_flag = agent.detach_state(rollout[-1])
        if detach_flag == 0:
            print('Detaching state', rollout[-1])
            detached_states.append(rollout[-1])
        elif detach_flag == 1:
            print(rollout[-1], 'already a singleton state. No change.')
        print('State Q-value post-detach:')
        for action in agent.mdp.actions:
            print(rollout[-1], action, agent.get_q_value(rollout[-1], action))
        print()
    for key, value in agent.get_q_table().items():
        print(key, value)

    # Save resulting adapted state abstraction and learned policy
    s_a_file = open('../abstr_exp/adapted/adapted_abstraction.csv', 'w', newline='')
    s_a_writer = csv.writer(s_a_file)
    print(mdp_key)
    s_a_writer.writerow((mdp_key[0], mdp_key[1], mdp_key[2], mdp_key[3], mdp_key[4],
                         agent.get_abstraction_as_string()))
    s_a_file.close()

    policy_file = open('../abstr_exp/adapted/learned_policy.csv', 'w', newline='')
    policy_writer = csv.writer(policy_file)
    policy_writer.writerow((mdp_key[0], mdp_key[1], mdp_key[2], mdp_key[3], mdp_key[4],
                            agent.get_learned_policy_as_string()))
    policy_file.close()

    # Visualize the adapted state abstraction and learned policy, along with the original for comparison
    viz = GridWorldVisualizer()
    surface = viz.create_corruption_visualization(mdp_key,
                                                  '../abstr_exp/adapted/adapted_abstraction.csv',
                                                  error_file='../abstr_exp/corrupted/error_states.csv')

    # Draw small white circles over the states that were detached
    for state in detached_states:
        print(state, end=', ')
    #for d_state in viz.display_surface(surface)
from Experiment.ExperimentClass import Experiment
from GridWorld.GridWorldMDPClass import GridWorldMDP
from GridWorld.GridWorldStateClass import GridWorldState
from GridWorld.TaxiMDPClass import TaxiMDP
from GridWorld.LargeTaxiMDPClass import LargeTaxiMDP
from GridWorld.TwoRoomsMDP import TwoRoomsMDP
from Agent.AgentClass import Agent
from resources.AbstractionTypes import Abstr_type
from resources.AbstractionCorrupters import *
from util import *
from Visualizer.QValueVisualizer import QValueVisualizer
import scipy.stats

# MDP details
MDP = GridWorldMDP()
MDP = TaxiMDP(same_goal=True)
#MDP = LargeTaxiMDP(same_goal=True, gamma=0.9)
mdp_sum = 'Taxi MDP'
'''
MDP = TwoRoomsMDP(upper_height=3,
                  upper_width=3,
                  lower_height=3,
                  lower_width=3,
                  hallway_states=[3],
                  goal_location=[(1,5)])
MDP = TwoRoomsMDP(lower_width=1,
                  lower_height=1,
                  hallway_states=[1],
                  upper_height=0,
                  upper_width=0,
        if dct[key] != 0.0:
            nonzero_count += 1
    return nonzero_count


def print_q_table(q_table):
    for key in q_table:
        print(key[0], key[1], q_table[key])


if __name__ == '__main__':
    # GridWorld
    # Make ground MDP
    mdp = GridWorldMDP(slip_prob=0.0)

    # Run VI to get q-table
    vi = ValueIteration(mdp)
    vi.run_value_iteration()
    q_table = vi.get_q_table()

    # Make state abstractions
    q_star_abstr = make_abstr(q_table, Abstr_type.Q_STAR)
    a_star_abstr = make_abstr(q_table, Abstr_type.A_STAR)
    pi_star_abstr = make_abstr(q_table, Abstr_type.PI_STAR)

    # Make abstract MDPs - NOTE THIS CLASS HAS BEEN DEPRECATED, DO NOT USE
    q_mdp = AbstractGridWorldMDP(state_abstr=q_star_abstr)
    a_mdp = AbstractGridWorldMDP(state_abstr=a_star_abstr)
    pi_mdp = AbstractGridWorldMDP(state_abstr=pi_star_abstr)

    # This is the type of abstract MDP to use instead of the deprecated class above
    q2_mdp = AbstractMDP(mdp, state_abstr=q_star_abstr)
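    # Hedged follow-up sketch (added, not in the original script): compare how much each abstraction
    # compresses the state space by counting distinct abstract states. get_abstr_dict() is the
    # StateAbstraction accessor used elsewhere in this repo.
    for name, abstr in [('Q*', q_star_abstr), ('A*', a_star_abstr), ('Pi*', pi_star_abstr)]:
        num_ground = len(abstr.get_abstr_dict())
        num_abstr = len(set(abstr.get_abstr_dict().values()))
        print(name, 'maps', num_ground, 'ground states to', num_abstr, 'abstract states')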
    best_action_intersect = list(set(best_actions_1) & set(best_actions_2))
    if len(best_action_intersect) == 0:
        return False
    return True


def print_policy(policy):
    '''
    Print the policy
    '''
    for key in policy.keys():
        print(key, policy[key])


if __name__ == '__main__':
    # Test that the optimal ground policy for FourRooms is representable in the
    # abstraction given by Q*

    # Get optimal ground policy for FourRooms
    four_rooms = GridWorldMDP(slip_prob=0.0, gamma=0.99)
    vi = ValueIteration(four_rooms)
    vi.run_value_iteration()
    optimal_policy = vi.get_optimal_policy()
    #print_policy(optimal_policy)

    # Get Q* abstraction for FourRooms and optimal abstract policy
    abstr = make_abstr(vi.get_q_table(), Abstr_type.A_STAR)
    print(is_optimal_policy_representable(vi, optimal_policy, abstr))
""" Test the error visualizer from QValueVisualizer """ from Visualizer.QValueVisualizer import QValueVisualizer from GridWorld.GridWorldMDPClass import GridWorldMDP if __name__ == '__main__': mdp = GridWorldMDP() v = QValueVisualizer(results_dir='../exp_output/big_test', states_to_track=mdp.get_all_possible_states()) v.visualize_q_value_error('noisy', mdp, episodes=[i for i in range(50, 1000, 50)])
# Create environment
mdp = TwoRoomsMDP(lower_width=3,
                  upper_width=3,
                  lower_height=3,
                  upper_height=3,
                  hallway_states=[3],
                  goal_location=[(1, 5)])
error_dict = {GridWorldState(1, 2): GridWorldState(2, 5),
              GridWorldState(3, 3): GridWorldState(1, 6)}
ABSTR_TYPE = Abstr_type.Q_STAR
ERROR_NUM = 6

mdp = GridWorldMDP()
if ABSTR_TYPE == Abstr_type.Q_STAR:
    abstr_mdp = mdp.make_abstr_mdp(Abstr_type.Q_STAR)
    if ERROR_NUM == 1:
        error_dict = {GridWorldState(6, 3): GridWorldState(10, 9),
                      GridWorldState(9, 10): GridWorldState(9, 3)}
    elif ERROR_NUM == 2:
        error_dict = {GridWorldState(9, 8): GridWorldState(2, 1),
                      GridWorldState(9, 11): GridWorldState(2, 4)}
    # Lower right room all grouped together
    elif ERROR_NUM == 3:
        error_dict = {
# #         print("epsilon, alpha:", agent._epsilon, agent._alpha)
# #     current_state, action, next_state, _ = agent.explore()
# # state_abstr = make_abstr(agent.get_q_table(), Abstr_type.Q_STAR, epsilon=0.05)
# # abstr_grid_mdp = AbstractGridWorldMDP(state_abstr=state_abstr)
# # abs_agent = Agent(abstr_grid_mdp)
# # abs_g_viz = AbstractGridWorldVisualizer(abstr_grid_mdp, abs_agent)
# # #abs_g_viz.displayAbstractMDP()
# # for i in range(100000):
# #     if i % 1000 == 0:
# #         print("epsilon, alpha:", abs_agent._epsilon, abs_agent._alpha)
# #     current_state, action, next_state, _ = abs_agent.explore()
# #
# # abs_g_viz.visualizeLearnedPolicy()

#Q-STAR - USING VI
mdp = GridWorldMDP(slip_prob=0, gamma=0.99)
vi = ValueIteration(mdp)
vi.run_value_iteration()
q_table = vi.get_q_table()
q_star_abstr = make_abstr(q_table, Abstr_type.Q_STAR, epsilon=0.01)
abstr_grid_mdp = AbstractGridWorldMDP(state_abstr=q_star_abstr)
abs_agent = Agent(abstr_grid_mdp)
abs_g_viz = AbstractGridWorldVisualizer(abstr_grid_mdp, abs_agent)
#abs_g_viz.displayAbstractMDP()

for i in range(100000):
    if i % 1000 == 0:
        print("epsilon, alpha:", abs_agent._epsilon, abs_agent._alpha)
    current_state, action, next_state, _ = abs_agent.explore()

abs_g_viz.visualizeLearnedPolicy()
'''