def test_argmax_without_init(self):
    """ Test the argmax(key) function on an uninitialized state. """
    qtable = QTable(self.actions)
    state = collections.OrderedDict()
    state['from'] = 1
    state['to'] = 2
    state['rank'] = [0, 1, 2]
    self.assertTrue(qtable.argmax(state) in self.actions)
def test_max_without_init(self):
    """ Test the max(key) function on an uninitialized state. """
    qtable = QTable(self.actions)
    state = collections.OrderedDict()
    state['from'] = 1
    state['to'] = 2
    state['rank'] = [0, 1, 2]
    self.assertEqual(self.default, qtable.max(state))
def test_argmax_with_init(self):
    """ Test the argmax(key) function after initializing one action value. """
    qtable = QTable(self.actions)
    state = collections.OrderedDict()
    state['from'] = 1
    state['to'] = 2
    state['rank'] = [0, 1, 2]
    qtable[state][0] = 1.0
    self.assertEqual(0, qtable.argmax(state))
def test_argmax_with_parity(self):
    """ Test the argmax(key) function when two actions tie for the maximum. """
    qtable = QTable(self.actions)
    state = collections.OrderedDict()
    state['from'] = 1
    state['to'] = 2
    state['rank'] = [0, 1, 2]
    qtable[state][0] = 1.0
    qtable[state][1] = 1.0
    self.assertTrue(qtable.argmax(state) in [0, 1])
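# The QTable class under test is not shown in this section. The following is
# a minimal sketch consistent with the tests above and below, assuming each
# state lazily maps to an {action: default} dict and that OrderedDict states
# are keyed in an order-sensitive way; it is not the original implementation.
class QTable(object):

    def __init__(self, actions, default=0.0):
        self.actions = actions
        self.default = default
        self._table = {}

    def _state_to_key(self, state):
        # Order-sensitive, hashable key built from an OrderedDict state.
        return str(list(state.items()))

    def __getitem__(self, state):
        key = self._state_to_key(state)
        if key not in self._table:
            # Lazy initialization: every action starts at the default value.
            self._table[key] = {a: self.default for a in self.actions}
        return self._table[key]

    def __setitem__(self, state, values):
        self._table[self._state_to_key(state)] = values

    def __str__(self):
        # Stable string form so a dill round-trip can be compared by str().
        return str(self._table)

    def max(self, state):
        # Largest Q-value stored for this state.
        return max(self[state].values())

    def argmax(self, state):
        # An action attaining the largest Q-value (ties broken arbitrarily).
        values = self[state]
        return max(values, key=values.get)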
def main():
    # Step 1: load the environment
    env = gym.make("FrozenLake-v0")

    # Step 2: create the Q-table
    state_size = env.observation_space.n
    action_size = env.action_space.n
    q = QTable(state_size, action_size)

    # Step 3: create the epsilon decay
    e = Epsilon(initial_epsilon=1.0, max_epsilon=1.0, min_epsilon=0.01,
                decay_rate=0.005)

    # Step 4: train the Q-table
    total_episodes = 100000
    max_steps = 100
    q, rewards = train_qtable(env, q, e, total_episodes, max_steps)
    print("Score over time {:.4f}".format(sum(rewards) / total_episodes))
    q.print()

    # Play
    env.reset()
    rewards = []
    for episode in range(1000):
        state = env.reset()
        step = 0
        total_rewards = 0
        for step in range(100):
            action = q.select_action(env, state)
            new_state, reward, done, info = env.step(action)
            total_rewards += reward
            state = new_state
            if done:
                break
        rewards.append(total_rewards)
        if episode % 100 == 0:
            print("******************************************")
            print("EPISODE {}".format(episode))
            print("Number of steps: {}".format(step))
            env.render()
    print("Score over time {:.4f}".format(sum(rewards) / 1000))
    env.close()
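# train_qtable() is not shown in this section. Below is a minimal sketch of
# what the calls above assume: tabular Q-learning with epsilon-greedy
# exploration, applying Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a')
# - Q(s,a)) each step. The helper names q.best_action, q.update, e.value, and
# e.decay are assumptions, not the original API.
import numpy as np

def train_qtable(env, q, e, total_episodes, max_steps, verbose=False):
    rewards = []
    for episode in range(total_episodes):
        state = env.reset()
        total_reward = 0
        for _ in range(max_steps):
            # Explore with probability epsilon, otherwise exploit the table.
            if np.random.uniform(0, 1) < e.value:
                action = env.action_space.sample()
            else:
                action = q.best_action(state)
            new_state, reward, done, info = env.step(action)
            # Assumed to apply the Bellman update shown above.
            q.update(state, action, reward, new_state)
            total_reward += reward
            state = new_state
            if done:
                break
        e.decay(episode)
        rewards.append(total_reward)
        if verbose and episode % 10000 == 0:
            print("episode {}: epsilon = {:.3f}".format(episode, e.value))
    return q, rewards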
def main():
    # Step 1: create the Taxi-v2 environment
    env = gym.make("Taxi-v2")

    # Step 2: create the QTable
    q = QTable(env.observation_space.n, env.action_space.n,
               learning_rate=0.7, gamma=0.99)

    # Step 3: create the Epsilon decay
    e = Epsilon()

    # Step 4: train the Q-table
    total_episodes = 100000
    max_steps = 100
    q, rewards = train_qtable(env, q, e, total_episodes, max_steps,
                              verbose=True)
    print("Score over time {:.4f}".format(sum(rewards) / total_episodes))
    q.print()
    env.render()

    # Play
    env.reset()
    rewards = []
    for episode in range(1000):
        state = env.reset()
        step = 0
        total_rewards = 0
        for step in range(100):
            action = q.select_action(env, state)
            new_state, reward, done, info = env.step(action)
            total_rewards += reward
            state = new_state
            if done:
                break
        rewards.append(total_rewards)
        if episode % 100 == 0:
            print("******************************************")
            print("EPISODE {}".format(episode))
            print("Number of steps: {}".format(step))
            env.render()
    print("Score over time {:.4f}".format(sum(rewards) / 1000))
    env.close()
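# The Epsilon helper is also not shown. A plausible sketch pairing with the
# train_qtable sketch above, assuming the common exponential-decay schedule
# epsilon = eps_min + (eps_max - eps_min) * exp(-decay_rate * episode):
import math

class Epsilon(object):

    def __init__(self, initial_epsilon=1.0, max_epsilon=1.0,
                 min_epsilon=0.01, decay_rate=0.005):
        self.value = initial_epsilon
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.decay_rate = decay_rate

    def decay(self, episode):
        # Decay toward min_epsilon as training progresses.
        self.value = self.min_epsilon + \
            (self.max_epsilon - self.min_epsilon) * \
            math.exp(-self.decay_rate * episode)
        return self.value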
class Agent:
    def __init__(self):
        self.qtable = QTable()

    def load_definitions(self, *defs):
        pass

    def train(self, env, epsilon=0.1, update_q_table=True):
        # RL training parameters
        alpha = 0.1
        gamma = 0.6
        steps = 0
        while not env.state.done:
            # ------------------------------------
            # Choose to explore or exploit
            # ------------------------------------
            if np.random.uniform(0, 1) < epsilon:
                # Explore the action space
                action = self.qtable.get_random_action(env.state)
            else:
                # Exploit the action space
                action = self.qtable.get_recommended_action(env.state)
            if PRINT_ACTIONS_TAKEN:
                print(action, "\n\n")
            old_state = env.state.get_copy()
            # step() returns an error and undoes the move if unsuccessful
            next_state, reward, done, to_undo = env.step(action)
            # ------------------------------------
            # Update the qtable
            # ------------------------------------
            if update_q_table and not isinstance(action, Undo):
                self.qtable.update(old_state, next_state, action, reward,
                                   alpha, gamma)
            # ------------------------------------
            # If the state has already been visited, undo the move so the
            # proof search moves faster
            # ------------------------------------
            if to_undo:
                env.step(Undo())
            steps += 1
        print("The proof of", env.theorem.name)
        print("...took", steps, "steps.")
        print("Proof generated:", env.state.past_actions)

    def evaluate(self, env):
        # Only exploit, never explore, and leave the Q-table untouched
        self.train(env, epsilon=0, update_q_table=False)

    def apply_antisymmetry(self):
        pass
def test_state_to_key(self):
    """ Test the _state_to_key(key) function. """
    qtable = QTable(self.actions)
    state1 = collections.OrderedDict()
    state1['from'] = 1
    state1['to'] = 2
    state1['rank'] = [0, 1, 2]
    state2 = collections.OrderedDict()
    state2['from'] = 1
    state2['to'] = 2
    state2['rank'] = [0, 2, 1]
    self.assertNotEqual(qtable._state_to_key(state1),
                        qtable._state_to_key(state2))
def state_lookup():
    w = World(5, 5, [(1, 1, 3), (2, 2, 4)], [(3, 3, 3), (4, 4, 4)], -1, 13, 13)
    q = QTable(w)
    a = Agent(0, 0)
    state = get_current_state(w, a)
    assert q[state] == {'north': 0, 'south': 0, 'east': 0, 'west': 0}
def get_max_neighbors_test():
    a = Agent(3, 2)
    w = World(5, 5, [(1, 1, 3), (2, 2, 4)], [(3, 3, 3), (4, 4, 4)], -1, 13, 13)
    q = QTable(w)
    q[get_current_state(w, a)]['south'] = 13
    assert get_max_neighbors(w.get_neighbors(*a.get_position()),
                             get_current_state(w, a), q) == ['south']
def manager(world, agent, algo, learning_rate, discount_rate, policy,
            num_steps, setup=None):
    """
    Run the given algorithm on the given world with the given learning rate,
    discount rate, and policy.

    Parameters:
        world (World): an instance of a World object representing the world
        agent (Agent): an instance of an Agent object representing the agent
        algo (function): a function called with the world, agent, qtable,
            learning rate, discount rate, and policy as parameters; it decides
            where the agent should move and also updates the qtable
        learning_rate (float)
        discount_rate (float)
        policy (string): PRANDOM, PEXPLOIT, or PGREEDY
        num_steps (int): how many steps to run for
        setup (list of tuples): optional; represents policies that should be
            activated after a particular number of steps. If supplied, it must
            be a list of tuples of the form (integer, string), where the
            integer is the number of steps and the string is the policy to
            switch to after that many steps have been run.
    """
    if not setup:
        setup = []
    q = QTable(world._w, world._h)
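# A hypothetical call illustrating the setup parameter described above;
# q_learning_step is an assumed name for the algo function, and the policy
# strings follow the docstring:
manager(world, agent, q_learning_step,
        learning_rate=0.3, discount_rate=0.5, policy='PRANDOM',
        num_steps=8000, setup=[(200, 'PEXPLOIT')])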
def main(flags):
    '''
    Runs an agent in an environment.

    params:
        flags: configuration object with attributes gamma, learning_rate,
            num_episodes, and max_steps (e.g. an argparse.Namespace)
    '''
    env = gym.make('FrozenLake-v0')
    agent = QTable(env, gamma=flags.gamma, alpha=flags.learning_rate)
    trainer = Trainer(env, agent, flags)
    rewards, lengths = trainer.train(flags.num_episodes, flags.max_steps)
    plot_results(rewards, lengths)
def p_random_test():
    agent = Agent(3, 3)
    world = World(5, 5, [(1, 1, 3)], [(1, 2, 3)], -1, 13, 13)
    q = QTable(world)
    assert p_random(agent, world, q) in ["north", "south", "east", "west"]
    world = World(5, 5, [(4, 3, 3)], [(1, 1, 3)], -1, 13, 13)
    assert p_random(agent, world, q) == "east"
    agent.pick_up()
    agent._set_position(1, 2)
    assert p_random(agent, world, q) == "north"
def test_set_without_init(self):
    """ Test qtable[state][action] = var when the state does not exist yet. """
    qtable = QTable(self.actions)
    state = collections.OrderedDict()
    state['from'] = 1
    state['to'] = 2
    state['rank'] = [0, 2, 1]
    qtable[state][0] = 1.0
    changed = {
        0: 1.0,
        1: self.default,
        2: self.default,
    }
    self.assertEqual(changed, qtable[state])
def test_get_with_init(self):
    """ Test reading qtable[state] after assigning a full action dict. """
    qtable = QTable(self.actions)
    state = collections.OrderedDict()
    state['from'] = 1
    state['to'] = 2
    state['rank'] = [0, 1, 2]
    wanted = {
        0: self.default,
        1: self.default,
        2: self.default,
    }
    qtable[state] = wanted
    self.assertEqual(wanted, qtable[state])
def test_dill(self):
    """ Test that the class can be serialized and restored with dill. """
    qtable = QTable(self.actions)
    state1 = collections.OrderedDict()
    state1['from'] = 1
    state1['to'] = 2
    state1['rank'] = [0, 1, 2]
    state2 = collections.OrderedDict()
    state2['from'] = 1
    state2['to'] = 2
    state2['rank'] = [0, 2, 1]
    # Create the states
    _ = qtable[state1]
    _ = qtable[state2]
    wanted = str(qtable)
    test = str(dill.loads(dill.dumps(qtable)))
    self.assertEqual(wanted, test)
def main(filename=None, time_to_run=5, probability_moving=0.8,
         constant_reward=-0.04):
    """
    Main function for the program.

    :param filename: a txt file
    :param time_to_run: number of seconds to learn for
    :param probability_moving: probability of moving in the desired direction
    :param constant_reward: reward for moving (usually negative)
    :return: None
    """
    # Initialize the lookup table
    qtable = QTable()
    board = None
    # Read the board from the file: rows separated by newlines, cells by tabs
    with open(filename, 'r') as f:
        board = f.read().split('\n')
    for index in range(len(board)):
        board[index] = board[index].split("\t")
    board = [list(x) for x in board]
    for row in board:
        for element in range(len(row)):
            row[element] = int(row[element])
    # Initialize the board
    board_object = Board(len(board), len(board[0]), board)
    # Populate the lookup table with the movement reward as initial values
    board_object.populate_qtable(qtable, constant_reward)
    # Initialize the agent
    agent = Agent(qtable, board_object, time_to_run, probability_moving,
                  constant_reward)
    # Run the agent (start learning)
    results = agent.run()
    # Print the results
    print(results)
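# Hypothetical usage, assuming a tab-separated grid of integer cell values in
# board.txt (the exact reward encoding depends on the Board class, which is
# not shown), e.g. a file containing:
#
#     -1  -1  -1  100
#     -1  -1  -1  -100
#     -1  -1  -1  -1
#
main(filename="board.txt", time_to_run=5, probability_moving=0.8,
     constant_reward=-0.04)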
N_STATES = 6
ACTIONS = [0, 1]
EPSILON = 0.9
ALPHA = 0.1
GAMMA = 0.9
MAX_EPISODES = 100
FRESH_TIME = 0.1

if __name__ == "__main__":
    env = gym.make('GoAhead-v0')
    rl1 = QTable(n_states=N_STATES, epsilon=EPSILON, gamma=GAMMA,
                 alpha=ALPHA, actions=ACTIONS)
    rl2 = DQN(n_states=1, n_actions=2, epsilon=EPSILON, gamma=GAMMA)
    for i_episode in range(MAX_EPISODES):
        observation = env.reset()
        for t in range(100):
            env.render(fresh_time=FRESH_TIME)
def run(env, n_episodes, n_steps, initial_value=0, learning_rate=0.8,
        gamma=0.9, epsilon=0.1):
    env.reset()
    number_agents = len(env.agents)
    env.step(dict(zip(range(number_agents), [2] * number_agents)))
    my_env = LocalEnv(env.rail.grid, env.agents)
    initial_state = my_env.initial_state
    qtable = QTable(number_agents, initial_state, initial_value, gamma,
                    learning_rate)
    lap_time = datetime.now()
    steps_per_episode = []
    for episode in range(n_episodes):
        np.random.seed()
        my_env.restart_agents()
        current_state = my_env.get_current_state()
        if episode % 100 == 0:
            print('episode:', episode)
            print('in', datetime.now() - lap_time)
            lap_time = datetime.now()
        count = 0
        for step in range(n_steps):
            if choose_at_random(epsilon):
                current_possible_actions = my_env.get_current_possible_actions()
                rand_index = np.random.choice(len(current_possible_actions))
                action = current_possible_actions[rand_index]
            else:
                action = qtable.get_max_action(current_state)
            if number_agents == 1:
                reward, new_state, handles_done = my_env.step({0: action})
            else:
                reward, new_state, handles_done = my_env.step(
                    dict(zip(range(number_agents), action)))
            count += 1
            if len(handles_done) == number_agents:
                break
            # if number_agents != 1:
            #     action = list(action)
            #     for handle in handles_done:
            #         action[handle] = 4
            #     action = tuple(action)
            # print('')
            # print(current_state)
            # print(action)
            # print(new_state)
            qtable.update_table(current_state, action, new_state, reward)
            current_state = new_state
        steps_per_episode.append(count)
    return steps_per_episode, my_env, qtable
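# choose_at_random() is not defined in this section; presumably it is a
# one-line epsilon test along these lines (a sketch, not the original):
import numpy as np

def choose_at_random(epsilon):
    # True with probability epsilon: take a random (exploratory) action.
    return np.random.uniform(0, 1) < epsilon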
def __init__(self, env):
    self.env = env
    self.qtable = QTable(env.action_space)
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, **kwargs):
        # Sets self.env = env, state = None, next_waypoint = None, and a
        # default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        # Simple route planner to get next_waypoint
        self.planner = RoutePlanner(self.env, self)
        # TODO: Initialize any additional variables here
        self.success = 0
        self.total = 0
        self.counter = 0
        self.epsilon_reset_counter = 0
        self.trial_counter = 0.0
        self.min_epsilon = 0.001
        self.eps_freq = 1.0
        self.filled_cell_count = 0
        self.total_cell_count = 0
        self.updated_func_counter = 0
        global stats_df_counter
        global stats_df
        for key, value in kwargs.iteritems():
            print "%s = %s" % (key, value)
            if key == 'alp':
                self.alpha = value
            elif key == 'gma':
                self.gamma = value
            elif key == 'eps':
                self.epsl = value
                self.epsilon = self.epsl
                print "epsilon: ", self.epsilon
        self.qt = QTable(self.alpha, self.gamma)
        print '-' * 80

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        totalTime = self.env.get_deadline(self)
        self.qt.printVal(totalTime)
        self.trial_counter += 1.0
        if self.epsilon > self.min_epsilon:
            self.epsilon = (5.0 * self.epsl) / self.trial_counter
            self.eps_freq = math.ceil(1.0 / self.epsilon)
        print "self.epsilon:", self.epsilon, ", self.eps_freq: ", self.eps_freq, "\n"

    def update(self, t):
        global stats_df
        global stats_df_counter
        self.counter += 1
        # Gather inputs
        # next_waypoint comes from the route planner and is also displayed by
        # the simulator
        self.next_waypoint = self.planner.next_waypoint()
        current_state = self.env.sense(self)
        self.state = current_state
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # TODO: Select action according to your policy
        # action = random.choice([None, 'forward', 'left', 'right'])
        # if self.total > 0 and self.total % self.epsilon_freq == 0.0:
        #     print "simulated annealing at ", self.total
        #     action = random.choice([None, 'forward', 'left', 'right'])
        # else:
        if (self.epsilon > self.min_epsilon and deadline != 0
                and deadline != self.eps_freq
                and math.floor(deadline % self.eps_freq) == 0.0):
            # self.epsilon_reset_counter += 1
            action = random.choice([None, 'forward', 'left', 'right'])
            print "annealing now.", "self.epsilon:", self.epsilon, ", action: ", action, ", deadline:", deadline
        else:
            # print "self.counter: ", self.counter, ", multiplier:", (self.counter * self.epsilon)
            action = self.qt.get_next_action(self.next_waypoint, deadline,
                                             current_state)

        # Execute action and get reward
        reward = self.env.act(self, action)
        add_total = False
        if deadline == 0:
            add_total = True
        if reward > 10:
            self.success += 1
            add_total = True
        if add_total:
            self.total += 1
            print("success: {} / {}".format(self.success, self.total))
        if self.total == 100:
            for item, frame in self.qt.qtable.iteritems():
                for item2, frame2 in frame.iteritems():
                    for item3, frame3 in frame2.iteritems():
                        for item4, frame4 in frame3.iteritems():
                            self.total_cell_count += 1
                            # print("f4:", frame4)
                            if frame4 != 0.0:
                                self.printNav(item2)
                                self.printTraffic(item3)
                                self.printTrafficLight(item4)
                                self.printAction(item)
                                print "Q-Val: {0:.5f}".format(frame4)
                                self.filled_cell_count += 1
            print '-' * 80
            print "updated cells: ", self.filled_cell_count, ", self.total_cell_count:", self.total_cell_count, ", updated_func_counter:", self.updated_func_counter
            print "self.alpha:", self.alpha, "self.gamma:", self.gamma, ", self.epsilon:", self.epsl, ", success:", self.success, " in steps: ", deadline
            stats_df.loc[stats_df_counter] = [
                self.alpha, self.gamma, self.epsl, self.success, deadline
            ]
            stats_df_counter += 1
            print '_' * 80
            # print '_'*20

        # TODO: Learn policy based on state, action, reward
        next_state_value = self.env.sense(self)
        next_state_deadline = self.env.get_deadline(self)
        next_state_waypoint = self.planner.next_waypoint()
        self.qt.update(self.next_waypoint, deadline, current_state, action,
                       reward, next_state_value, next_state_waypoint, self,
                       self.env)
        self.updated_func_counter += 1

    def printAction(self, code):
        print '|',
        if code == 'AN':
            print "Action: None",
        elif code == 'BF':
            print "Action: Forward",
        elif code == 'CR':
            print "Action: Right",
        elif code == 'DL':
            print "Action: Left",
        print '|',

    def printNav(self, code):
        print '|',
        if code == 0:
            print "Nav: None",
        elif code == 1:
            print "Nav: Forward",
        elif code == 2:
            print "Nav: Right",
        elif code == 3:
            print "Nav: Left",

    def printTraffic(self, code):
        left_mask = 0b000011
        right_mask = 0b001100
        oncoming_mask = 0b110000
        left_filtered = code & left_mask
        right_filtered = code & right_mask
        oncoming_filtered = code & oncoming_mask
        print '| Traffic state: ',
        if left_filtered == 0:
            print "Left: None",
        elif left_filtered == 1:
            print "Left: Forward",
        elif left_filtered == 2:
            print "Left: Right",
        elif left_filtered == 3:
            print "Left: Left",
        print '-+-',
        if right_filtered == 0:
            print "Right: None",
        elif right_filtered == 4:
            print "Right: Forward",
        elif right_filtered == 8:
            print "Right: Right",
        elif right_filtered == 12:
            print "Right: Left",
        print '-+-',
        if oncoming_filtered == 0:
            print "Oncoming: None",
        elif oncoming_filtered == 16:
            print "Oncoming: Forward",
        elif oncoming_filtered == 32:
            print "Oncoming: Right",
        elif oncoming_filtered == 48:
            print "Oncoming: Left",

    def printTrafficLight(self, code):
        print '| ',
        if code == 0:
            print "Light: Red",
        else:
            print "Light: Green",
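# Hypothetical instantiation of LearningAgent showing the keyword arguments
# the constructor parses ('alp' -> alpha, 'gma' -> gamma, 'eps' -> initial
# epsilon); the values below are illustrative only. reset() then anneals
# epsilon each trial as epsilon = 5.0 * eps / trial_counter, floored at
# min_epsilon.
agent = LearningAgent(env, alp=0.5, gma=0.3, eps=1.0)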
def loadQTable(self):
    self.qtable = QTable(self.states, self.actions,
                         getNextState=self._getNextState)
    self.qtable[self.states[0], self.actions[0]].nextState = self.states[0]
    self.qtable[self.states[-1], self.actions[1]].nextState = self.states[-1]
    for key in self.qtable:
def main():
    """Main procedure"""
    # Init data structures
    qtable = QTable([-10, 10],
                    [(-AREA_SIZE, AREA_SIZE, 8),
                     (-1, 1, 10),
                     (-SAFE_ANGLE_RAD, SAFE_ANGLE_RAD, 28),
                     (-1, 1, 28)])
    # Inverted pendulum model
    initial_state = (0.0, 0.0, 0.0, 0.0)
    model = Model(initial_state, AREA_SIZE, SAFE_ANGLE_RAD)
    # Reinforcement learning
    qtable.learn(model, LEARNING_ITERATION,
                 MAX_STATE_TRANSITIONS,
                 SIMULATION_TIME_DELTA,
                 LEARNING_RATE,
                 DISCOUNT_FACTOR)
    # Visualize the QTable
    qtable.draw(os.path.join(_DIR, "./../output/qtable.png"))
    # Clear temporary files
    delete_temp_files()
    # Run the inverted pendulum system simulation
    model.reset()
    for i in range(0, round(SIMULATION_TIME / SIMULATION_TIME_DELTA) + 1):
        print("%.2fsec" % (i * SIMULATION_TIME_DELTA),
              model.get_state(),
              qtable.get_q_vals(model.get_state()))
        print(model.is_system_safe())
        if not model.is_system_safe():
            print("FAIL")
            break
        force = qtable.get_best_action(model.get_state())
        model.simulate(force, SIMULATION_TIME_DELTA)
        model.draw_state(force,
                         os.path.join(_DIR,
                                      "./../output/state_%06dms.png" % (i)),
                         qtable)
    # Generate video
    print("\nGenerating video...", end="")
    sys.stdout.flush()
    if _platform == "linux":  # GNU/Linux
        subprocess.call(os.path.join(_DIR, "./../make_video.sh"), shell=True)
    elif _platform == "darwin":  # OS X
        pass
    elif _platform == "win32" or _platform == "cygwin":  # Windows
        subprocess.call(os.path.join(_DIR, "./../make_video.bat"), shell=True)
        import winsound
        freq = 2500
        dur = 1000
        winsound.Beep(freq, dur)
    print("DONE")
# Num  Observation    Min                   Max
# 0    Cart Position  -4.8                  4.8
# 1    Cart Velocity  -Inf                  Inf
# 2    Pole Angle     -0.418 rad (-24 deg)  0.418 rad (24 deg)
# 3    Pole Velocity  -Inf                  Inf
bounds = list(zip(env.observation_space.low, env.observation_space.high))
# Velocity bounds by default are infinite, so rebind them.
bounds[1] = [-1, 1]
bounds[3] = [-math.radians(50), math.radians(50)]
observation_space = discretize_observation_space(
    bounds, [15, 5, math.radians(1), math.radians(2)])
actions = [i for i in range(env.action_space.n)]
table = QTable(observation_space, actions)
prev_score = 0
for episode in range(n_episodes):
    observation = env.reset()
    state_action_pairs = []
    t_steps_taken = 0
    while True:
        env.render()
        state = discretize_state(observation, observation_space)
        action = table.decide_action(state)
        state_action_pairs.append((state, action))
        observation, reward, done, info = env.step(action)
        t_steps_taken += 1
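# discretize_observation_space() and discretize_state() are not shown. A
# minimal sketch consistent with the calls above, assuming the second
# argument gives per-dimension bucket widths (a width larger than the range,
# like 15 for cart position, collapses that dimension to a single bucket)
# and that a state is the tuple of bucket indices:
import numpy as np

def discretize_observation_space(bounds, widths):
    # One array of bucket edges per observation dimension.
    return [np.arange(low, high, width)
            for (low, high), width in zip(bounds, widths)]

def discretize_state(observation, observation_space):
    # Map each continuous value to the index of its bucket.
    return tuple(int(np.digitize(value, edges))
                 for value, edges in zip(observation, observation_space))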
def state_space_present_test():
    w = World(5, 5, [(1, 1, 3), (2, 2, 4)], [(3, 3, 3), (4, 4, 4)], -1, 13, 13)
    q = QTable(w)
    assert len(q._table) == 5 * 5 * 2 * 2 * 2 * 2 * 2
scheduler = 5
with open('waiting_time_{0}.csv'.format(scheduler), mode='w') as waiting_time_file, \
        open('action_selection.csv', mode='w') as action_selection:
    waiting_time_file = csv.writer(waiting_time_file, delimiter=',',
                                   quotechar='"', quoting=csv.QUOTE_MINIMAL)
    action_selection = csv.writer(action_selection, delimiter=',',
                                  quotechar='"', quoting=csv.QUOTE_MINIMAL)
    simulator = LogicSimulator(waiting_time_file=waiting_time_file,
                               action_file=action_selection)
    simulator.schedulers = [
        FifoScheduler(simulator),
        LQFScheduler(simulator),
        FixedTimeScheduler(simulator, 300),
        FixedTimeScheduler(simulator, 200),
        FixedTimeScheduler(simulator, 400),
        # PrioWEScheduler(env)
    ]
    agent = QTable(256, len(simulator.schedulers), simulator.schedulers)
    agent.load_table()
    done = False
    hour = 1
    state = simulator.get_state()
    while not done:
        state, _, done = simulator.step(agent.act(state, greedy=False))
        # _, _, done = simulator.step(scheduler)
        if simulator.time % simulator.time_steps_per_hour == 0:
            print('Simulating hour: {0}'.format(hour))
            simulator.save_stats()
            hour += 1
"""
plt.subplot(4, 1, 1)
plt.plot(simulator.x, simulator.ny, 'b', label='NORTH')
plt.legend(loc='upper left')
def main():
    # print(colorsys.rgb_to_hsv(86, 201, 123))
    # exit()
    f = open("results.txt", "a")
    red = (217, 41, 56)
    purple = (148, 105, 191)
    blue = (36, 132, 191)
    green = (50, 166, 46)
    orange = (242, 98, 15)
    expColors = [blue, orange, green, red, purple]
    render = False
    seedC = 42
    pygame.init()
    clock = pygame.time.Clock()
    for run in range(2):
        plot1Surface = pygame.Surface((580, 440))
        plot1Surface.fill((199, 189, 189))
        plot2Surface = pygame.Surface((480, 440))
        plot2Surface.fill((199, 189, 189))
        # plt.show()
        np.random.seed(seedC)
        frameRate = 10
        cellSize = 40
        agentSize = 4
        mainSurfaceSize = (1380, 820)
        flags = DOUBLEBUF
        mainSurface = pygame.display.set_mode(mainSurfaceSize, flags)
        mainSurface.set_alpha(None)
        pygame.display.set_caption("QLearning and SARSA")
        mainSurface.fill((199, 189, 189))
        pickupPoints = [(0, 0), (2, 2), (4, 4)]
        dropoffPoints = [(1, 4), (4, 0), (4, 2)]
        NUM_STATES = 50
        NUM_ACTIONS = 6
        qtableLocation = (1100, 0)
        numGrid = (5, 5)
        pickupItemCount1 = [5, 5, 5]
        dropoffItemCount1 = [0, 0, 0]
        startingState = State(0, 4, 0)
        startLocation1 = (0, 0)
        world1 = PDWorld(startLocation1, cellSize, mainSurfaceSize, numGrid,
                         startingState, agentSize, pickupPoints, dropoffPoints,
                         pickupItemCount1, dropoffItemCount1)
        pickupItemCount2 = [5, 5, 5]
        dropoffItemCount2 = [0, 0, 0]
        startingState2 = State(0, 4, 0)
        startLocation2 = (220, 0)
        world2 = PDWorld(startLocation2, cellSize, mainSurfaceSize, numGrid,
                         startingState, agentSize, pickupPoints, dropoffPoints,
                         pickupItemCount2, dropoffItemCount2)
        pickupItemCount3 = [5, 5, 5]
        dropoffItemCount3 = [0, 0, 0]
        startingState3 = State(0, 4, 0)
        startLocation3 = (440, 0)
        world3 = PDWorld(startLocation3, cellSize, mainSurfaceSize, numGrid,
                         startingState, agentSize, pickupPoints, dropoffPoints,
                         pickupItemCount3, dropoffItemCount3)
        pickupItemCount4 = [5, 5, 5]
        dropoffItemCount4 = [0, 0, 0]
        startingState4 = State(0, 4, 0)
        startLocation4 = (660, 0)
        world4 = PDWorld(startLocation4, cellSize, mainSurfaceSize, numGrid,
                         startingState, agentSize, pickupPoints, dropoffPoints,
                         pickupItemCount4, dropoffItemCount4)
        pickupItemCount5 = [5, 5, 5]
        dropoffItemCount5 = [0, 0, 0]
        startingState5 = State(0, 4, 0)
        startLocation5 = (880, 0)
        world5 = PDWorld(startLocation5, cellSize, mainSurfaceSize, numGrid,
                         startingState, agentSize, pickupPoints, dropoffPoints,
                         pickupItemCount5, dropoffItemCount5)
        policy1 = Policy(PolicyType.RANDOM)
        policy2 = Policy(PolicyType.RANDOM)
        policy3 = Policy(PolicyType.RANDOM)
        policy4 = Policy(PolicyType.RANDOM)
        policy5 = Policy(PolicyType.RANDOM)
        qtable1 = QTable(NUM_STATES, NUM_ACTIONS, mainSurfaceSize,
                         qtableLocation, Populate.ZEROS, world1)
        qtable2 = QTable(NUM_STATES, NUM_ACTIONS, mainSurfaceSize,
                         qtableLocation, Populate.ZEROS, world2)
        qtable3 = QTable(NUM_STATES, NUM_ACTIONS, mainSurfaceSize,
                         qtableLocation, Populate.ZEROS, world3)
        qtable4 = QTable(NUM_STATES, NUM_ACTIONS, mainSurfaceSize,
                         qtableLocation, Populate.ZEROS, world4)
        qtable5 = QTable(NUM_STATES, NUM_ACTIONS, mainSurfaceSize,
                         qtableLocation, Populate.ZEROS, world5)
        # r1 is referenced in the rl list below, so its definition is live
        r1 = RLearning(1, world1, qtable1, policy1, RL.Q_LEARNING,
                       0.3, 0.5, 0.2, 0, 8000, 0, f)
        r2 = RLearning(2, world2, qtable2, policy2, RL.Q_LEARNING,
                       0.3, 0.5, 0.2, 0, 8000, 0, f)
        r3 = RLearning(3, world3, qtable3, policy3, RL.SARSA,
                       0.3, 0.5, 0.2, 0, 8000, 0, f)
        r4 = RLearning(4, world4, qtable4, policy4, RL.SARSA,
                       0.3, 1, 0.2, 0, 8000, 0, f)
        r5 = RLearning(5, world5, qtable5, policy5, RL.Q_LEARNING,
                       0.3, 0.5, 0.2, 0, 8000, 0, f)
        qtables = [qtable1, qtable2, qtable3, qtable4, qtable5]
        rl = [r1, r2, r3, r4, r5]
        currentStates = []
        nextStates = []
        for i in range(len(rl)):
            currentStates.append(startingState)
            nextStates.append(startingState)
        i = 0
        for r in rl:
            r.world.qtable = qtables[i]
            i += 1
            r.nextEpisode()
        # pygame.time.wait(1500)
        selected = 0
        for step in range(8000):
            mainSurface.fill((199, 189, 189))
            # if step == 400:
            #     frameRate = 0.5
            clickBoxes = []
            for r in range(len(rl)):
                minX = rl[r].world.startLocation[0]
                minY = rl[r].world.startLocation[1]
                maxX = rl[r].world.startLocation[0] + cellSize * numGrid[0] + 10 + 6
                maxY = rl[r].world.startLocation[1] + cellSize * numGrid[1] + 10 + 6
                clickBoxes.append([minX, maxX, minY, maxY])
            event = pygame.event.poll()
            if event.type == pygame.QUIT:
                exit()
            if event.type == pygame.MOUSEBUTTONDOWN:
                mousex, mousey = pygame.mouse.get_pos()
                for i in range(len(rl)):
                    if mousex > clickBoxes[i][0] and mousex < clickBoxes[i][1] \
                            and mousey > clickBoxes[i][2] and mousey < clickBoxes[i][3]:
                        selected = i
                        # if r.expNum - 1 == selected:
                        #     r.world.selected = True
                        # print(clickBoxes[selected], mousex, mousey)
                        # else:
                        #     r.world.selected = False
            for r in rl:
                r.color = expColors[r.expNum - 1]
                # print(r.r)
                currentStates[r.expNum - 1] = r.s
                event = pygame.event.poll()
                if event.type == pygame.QUIT:
                    exit()
                if event.type == MOUSEBUTTONDOWN:
                    # pygame.display.update()
                    # mainSurface.fill((199, 189, 189))
                    if event.button == 3:
                        for ri in range(len(rl)):
                            if selected == ri:
                                if rl[ri].qtable.selected == 1:
                                    rl[ri].qtable.selected = 0
                                else:
                                    rl[ri].qtable.selected = 1
                expN = r.expNum
                if expN - 1 == selected:
                    r.world.selected = True
                    r.world.colorMode = True
                else:
                    r.world.selected = False
                    r.world.colorMode = False
                # if step == 3999 or step == 199 or step == r.steps:
                #     original = []
                #     for i in range(5):
                #         original.append(rl[i].world.state.b)
                #         for j in range(2):
                #             txt = ["_without_package.png", "_with_package.png"]
                #             rl[i].world.state.b = j
                #             # rl[i].update()
                #             # mainSurface.fill(Color.VL_GREY)
                #             # pygame.display.update()
                #             rl[i].draw(mainSurface)
                #             pygame.display.update()
                #             filename = 'Run_' + str(run + 1) + '_Experiment_' + str(i + 1) + '_' + str(step) + txt[j]
                #             # 422 + 274
                #             # qtables[i].update()
                #             qtables[i].draw(mainSurface)
                #             qtableSurface = pygame.Surface((422, 816))
                #             qtableSurface.blit(qtables[i].surface, (0, 0))
                #             surface = pygame.Surface((274, 410))
                #             surface.fill((199, 189, 189))
                #             surface.blit(rl[i].world.surface, (0, 0))
                #             surface.blit(rl[i].surface, (12, 274))
                #             pygame.image.save(qtableSurface, 'Run_' + str(run + 1) + '_Experiment_' + str(i + 1) + '_qtable_' + txt[j] + '.png')
                #             pygame.image.save(surface, filename)
                #         rl[i].world.state.b = original[i]
                if r.expNum == 1 and step == 4000:
                    r.policy.switchPolicy(PolicyType.GREEDY)
                if r.expNum == 2 and step == 200:
                    r.policy.switchPolicy(PolicyType.EXPLOIT)
                if r.expNum == 3 and step == 200:
                    r.policy.switchPolicy(PolicyType.EXPLOIT)
                if r.expNum == 4 and step == 200:
                    r.policy.switchPolicy(PolicyType.EXPLOIT)
                if r.expNum == 5 and step == 200:
                    r.policy.switchPolicy(PolicyType.EXPLOIT)
                if r.expNum == 5 and r.isTerminalState():
                    r.world.dropoffPoints = pickupPoints
                    r.world.pickupPoints = dropoffPoints
                    # exit()
                # actns = r.world.getApplicableActions(r.world.state)
                # if not r.isTerminalState():
                #     newstate = r.applyaction(r.world.state, np.random.choice(actns))
                #     r.world.state = newstate
                if r.isTerminalState():
                    r.nextEpisode()
                    r.minStep.append(r.currentStep)
                    r.currentStep = 0
                    r.world.reset()
                r.update()
                # r.world.draw(mainSurface)
            for r in rl:
                r.draw(mainSurface)
                if step < r.steps:
                    r.nextStep()
            qtables[selected].update()
            qtables[selected].draw(mainSurface)
            for r in range(len(rl)):
                if r == selected:
                    color = (0, 0, 0)
                    offsetx = 0
                    offsety = 29
                    startL = (rl[selected].world.startLocation[0] + cellSize * numGrid[0] + offsetx,
                              rl[selected].world.startLocation[1] + cellSize * numGrid[1] + offsety)
                    startL1 = (rl[selected].world.startLocation[0] + cellSize * numGrid[0] + offsetx + 15,
                               rl[selected].world.startLocation[1] + cellSize * numGrid[1] + offsety)
                    startL2 = (rl[selected].world.startLocation[0] + cellSize * numGrid[0] + offsetx + 15,
                               rl[selected].world.startLocation[1] + cellSize * numGrid[1] + offsety + 135)
                    startL3 = (1095, 364)
                    startLL = (1095, 415)
                    # pygame.draw.circle(mainSurface, color, startL1, 2)
                    # pygame.draw.circle(mainSurface, color, startL2, 2)
                    # pygame.draw.circle(mainSurface, color, startLL, 2)
                    pygame.draw.line(mainSurface, expColors[selected], startL, startL1, 2)
                    pygame.draw.line(mainSurface, expColors[selected], startL1, startL2, 2)
                    pygame.draw.line(mainSurface, expColors[selected], startL2, startL3, 2)
                    pygame.draw.line(mainSurface, expColors[selected], startL3, startLL, 2)
                    pygame.draw.circle(mainSurface, (255, 255, 255), startL, 7)
                    pygame.draw.circle(mainSurface, expColors[selected], startL, 5)
                    pygame.draw.circle(mainSurface, (255, 255, 255), startL, 3)
                    pygame.draw.circle(mainSurface, color, startLL, 4)
                    # lw = 38
                    # startColorCo = (rl[r].world.startLocation[0] + 10,
                    #                 rl[r].world.startLocation[1] + cellSize * numGrid[1] + lw)
                    # pygame.draw.circle(mainSurface, purple, startColorCo, 2)
                    # startColorCoE = (rl[r].world.startLocation[0] + cellSize * numGrid[0] + 8,
                    #                  rl[r].world.startLocation[1] + cellSize * numGrid[1] + lw)
                    # pygame.draw.line(mainSurface, expColors[r], startColorCo, startColorCoE, 6)
            for r in rl:
                nextStates[r.expNum - 1] = r.s
            if step % 1 == 0:
                # plt.figure(figsize=(5, 5))
                plot1Surface.fill((199, 189, 189))
                fig, ax = plt.subplots(figsize=(6.2, 4.4), facecolor='#C7BDBD')
                canvas = agg.FigureCanvasAgg(fig)
                t = range(step + 1)
                ax.set(xlabel='step', ylabel='reward', title='Step vs Reward')
                ax.set_facecolor('#C7BDBD')
                mpl.rcParams['legend.facecolor'] = '#C7BDBD'
                mpl.rcParams["legend.fancybox"] = False
                for r in rl:
                    s = r.rewardPerTimeStep
                    lt = 1
                    ls = ':'
                    if r.expNum - 1 == selected:
                        lt = 2.0
                        ls = '-'
                    ax.plot(t, s, label='Exp ' + str(r.expNum), linewidth=lt,
                            linestyle=ls)
                ax.grid()
                ax.legend()
                canvas.draw()
                renderer = canvas.get_renderer()
                raw_data = renderer.tostring_rgb()
                size = canvas.get_width_height()
                image = pygame.image.fromstring(raw_data, size, "RGB")
                # fig.savefig("stepVreward.png", transparent=True)
                # image = pygame.image.load('stepVreward.png')
                # image = pygame.transform.scale(image, (400, 400))
                plt.close('all')
                rect = image.get_rect()
                plot1Surface.blit(image, rect)
                # plot2Surface.fill((199, 160, 189))
            for r in rl:
                if r.isTerminalState():
                    fig1, ax1 = plt.subplots(figsize=(4.8, 4.4))
                    ax1.set(xlabel='s/e', ylabel='steps',
                            title='steps per terminal episode')
                    plot2Surface.fill((199, 189, 189))
                    for rk in rl:
                        t = range(rk.episodes - 1)
                        s = rk.minStep
                        ax1.plot(t, s, marker='o')
                    fig1.savefig('sevs.png', transparent=True)
                    image1 = pygame.image.load('sevs.png')
                    rect1 = image1.get_rect()
                    plot2Surface.blit(image1, rect1)
            mainSurface.blit(plot1Surface, (10, 370))
            mainSurface.blit(plot2Surface, (610, 370))
            plt.close('all')
            pygame.display.update()
            clock.tick(frameRate)
        seedC += 1
        for r in rl:
            l = str(run + 1) + ' ' + str(r.expNum) + ' '
            f.write(l)
            r.saveRunStatistics()
    f.close()
def manager(world, agent, learning_function, learning_rate, discount_rate,
            policy, num_steps, setup=None, swap_after_iter=None,
            filename="give_me_a_name.txt", state_space='big'):
    """
    Run the given learning_function on the given world with the given
    learning rate, discount rate, and policy.

    Parameters:
        world (World): an instance of a World object representing the world
        agent (Agent): an instance of an Agent object representing the agent
        learning_function (function): a function called with the world, agent,
            qtable, learning rate, discount rate, and policy as parameters; it
            decides where the agent should move and also updates the qtable
        learning_rate (float)
        discount_rate (float)
        policy (function): the function for the given policy
        num_steps (int): how many steps to run for
        setup (list of tuples): optional; represents policies that should be
            activated after a particular number of steps. If supplied, it must
            be a list of tuples of the form (integer, function), where the
            integer is the number of steps and the function is the policy to
            switch to after that many steps have been run.
    """
    if not setup:
        setup = []
    q = QTable(world, state_space=state_space)
    current_step = 0
    # These start as None; SARSA needs both the current and the next action.
    action = None
    next_action = None
    iteration = 1
    movements = []
    steps_this_iter = 0
    steps_per_iter = []
    heatmap = [[0 for _ in range(world._w)] for __ in range(world._h)]
    swapped = False
    while current_step < num_steps:
        movements.append((agent.get_position(), agent.is_holding_block()))
        heatmap[agent.get_position()[1]][agent.get_position()[0]] += 1
        if is_world_solved(world, agent):
            world.reset_world()
            agent.reset_to_start()
            iteration += 1
            steps_per_iter.append(steps_this_iter)
            steps_this_iter = 0
        steps_this_iter += 1
        if swap_after_iter and (swap_after_iter + 1) == iteration and not swapped:
            world.swap_pickup_dropoff()
            swapped = True
        # The policy tells us what our next action will be.
        #
        # We also have to compute the action after that, because SARSA needs
        # it. We hand the policy a new agent object that has pretended to
        # take the first action.
        #
        # There is probably a better way of doing this.
        if not action:
            action = policy(agent, world, q, state_space=state_space)
            next_action = policy(agent.pretend_move(action), world, q,
                                 state_space=state_space)
        else:
            action = next_action
            next_action = policy(agent.pretend_move(action), world, q,
                                 state_space=state_space)
        # Update the q table based on the state we are in and the action we
        # have chosen
        learning_function(world, agent, q, action, next_action, learning_rate,
                          discount_rate, state_space=state_space)
        # If we are on a pickup and don't have a block, pick it up. If we are
        # on a dropoff and have a block, drop it off.
        pickup_dropoff(world, agent)
        # Move to our new location based on our action
        agent.move(action)
        current_step += 1
        policy = get_new_policy(setup, current_step, policy)
    write_experiment_output(OUT_DIR, filename, world, agent, q, policy,
                            iteration, movements, heatmap, state_space,
                            steps_per_iter)
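# A hypothetical invocation of the manager above; p_random appears in the
# tests in this section, while sarsa_update and p_exploit are assumed names
# for the learning function and the exploit policy:
manager(world, agent, sarsa_update,
        learning_rate=0.3, discount_rate=0.5,
        policy=p_random, num_steps=8000,
        setup=[(200, p_exploit)],
        filename="sarsa_run.txt", state_space='big')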
def __init__(self):
    self.qtable = QTable()