def pathplanning(self):
    global root
    global view
    RL = QLearningTable(actions=list(range(self.n_actions)),
                        learning_rate=self._learningrate,
                        reward_decay=self._discountfactor,
                        e_greedy=self._egreedy)
    # update qtable
    self.currentqtable = str(RL.q_table)
    for episode in range(self._maxepisode):
        # update episode
        self.currentepisode = episode + 1
        # reset
        self._robot = self._start.copy()
        # initialize observation
        observation = str(self._robot)
        time.sleep(1)
        while True:
            # record the final path
            if episode == self._maxepisode - 1:
                self.finalpath.append(
                    "(" + str(int(self._robot[0])) + "," + str(int(self._robot[1])) + ")")
            # choose action
            action = RL.choose_action(observation)
            # get new observation
            next_observation, reward, done = self.step(action)
            # learn from this observation
            RL.learn(observation, action, reward, next_observation)
            # update observation
            observation = next_observation
            # update qtable
            self.currentqtable = str(RL.q_table)
            # sleep for qml's update
            time.sleep(0.2)
            if done:
                break
    self.isfinalpath = True
def main():
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    for episode in range(100):
        # periodic checkpoint (note: with only 100 episodes this fires at episode 0 only)
        if episode % 200 == 0:
            RL.save_q_table()
        # initial observation
        observation = env.reset()
        counter = 0
        while True:
            # fresh env
            env.render()
            print("Round: " + str(counter))
            # RL choose action based on observation
            action = RL.choose_action(observation)
            # RL take action and get next observation & reward
            observation_, reward, done = env.step(action)
            # RL learn from this transition
            RL.learn(observation, action, reward, observation_, done)
            # swap observation
            observation = observation_
            # break while loop when end of this episode
            if done:
                break
            else:
                time.sleep(1)
            counter += 1
    # end game
    print("end game")
    # save q_table
    RL.save_q_table()
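# Every snippet in this collection assumes a tabular `QLearningTable` agent.
# The class itself is not shown, so the following is a minimal sketch of the
# common pandas-backed variant implied by the constructor arguments used here
# (learning_rate, reward_decay, e_greedy) and by the four/five-argument
# `learn()` calls; each project's actual class may differ in detail.
import numpy as np
import pandas as pd

class QLearningTable:
    """Sketch of a tabular Q-learning agent backed by a pandas DataFrame."""

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions            # list of action indices
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def check_state_exist(self, state):
        # lazily add unseen states as zero-initialized rows
        if state not in self.q_table.index:
            self.q_table.loc[state] = [0.0] * len(self.actions)

    def choose_action(self, observation):
        self.check_state_exist(observation)
        if np.random.uniform() < self.epsilon:
            # exploit: break ties among maximal actions at random
            state_action = self.q_table.loc[observation]
            action = np.random.choice(
                state_action[state_action == np.max(state_action)].index)
        else:
            # explore
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_, done=False):
        # `done` mirrors the five-argument learn() calls seen above; it is
        # optional in this sketch
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if done or s_ == 'terminal':
            q_target = r                  # episode ended: no bootstrap
        else:
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        # one-step temporal-difference update
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)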
def update():
    start = time.time()
    RL = QLearningTable(n_states=nodes_num,
                        each_services_nums=each_services_nums,
                        max_services_num=max_services_num,
                        nodeSet_file=nodeSet_file,
                        conf_file=conf_file,
                        learning_rate=ALPHA,
                        reward_decay=GAMMA,
                        e_greedy=EPSILON)
    max_reward = 0
    for episode in range(MAX_EPISODES):
        # initial observation
        state = 0
        while True:
            # RL choose action based on observation
            action = RL.choose_action(state)
            # RL take action and get next observation and reward
            state_, reward, done = RL.step(state, action)
            # RL learn from this transition
            RL.learn(state, action, reward, state_)
            # swap observation
            state = state_
            # break while loop when end of this episode
            if done:
                if episode == 0:
                    max_reward = reward
                elif reward > max_reward:
                    max_reward = reward
                    print("services = {0}, reward = {1}, runtime = {2}, episode = {3} "
                          .format(RL.choose_services, reward, time.time() - start, episode))
                    line = list(RL.choose_services)
                    line.append(reward)
                    line.append(time.time() - start)
                    line.append(episode)
                    with open(outfile, 'a+') as fp:
                        fp.write(str(line) + '\n')
                else:
                    if episode % 100 == 0:
                        print("episode = {}".format(episode))
                break
        # convergence check: track a sliding window of recent rewards
        if episode >= ERROR_COUNT:
            del judge_list[0]
            judge_list.append(reward)
        if episode >= 1000 and episode % ERROR_COUNT == 0:
            if max(judge_list) - min(judge_list) <= ERROR_RANGE:
                output = "\n Convergence criterion met; stopping the run early!\n"
                line = list(RL.choose_services)
                line.append(reward)
                line.append(time.time() - start)
                line.append(episode)
                # print the convergence result
                print(output)
                print(line)
                # record the convergence result
                with open(outfile, 'a+') as fp:
                    fp.write(output)
                    fp.write(str(line) + '\n')
                break
    print('game over')
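# The convergence check in update() relies on module-level names that this
# excerpt never defines. A minimal set of assumed definitions follows; the
# values are illustrative placeholders, not the project's actual configuration.
import time

ALPHA = 0.01           # learning rate
GAMMA = 0.9            # reward decay
EPSILON = 0.9          # e-greedy threshold
MAX_EPISODES = 100000
ERROR_COUNT = 100      # width of the sliding reward window
ERROR_RANGE = 1e-3     # max spread of the window that counts as converged
outfile = 'result.txt'
# sliding window of the most recent ERROR_COUNT episode rewards;
# update() pops from the front and appends at the back
judge_list = [0] * ERROR_COUNT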
from environment import Maze
from RL_brain import QLearningTable
import numpy as np

env = Maze()
RL = QLearningTable(actions=list(range(env.n_actions)))

N = 20
dt = 2 * np.pi / N
ep_max = 500
fidelity = np.zeros(ep_max)
fid_10 = 0

for episode in range(ep_max):
    observation = env.reset()
    while True:
        action = RL.choose_action(str(observation))
        observation_, reward, done, fid = env.step(action)
        RL.learn(str(observation), action, reward, str(observation_))
        observation = observation_
        if done:
            # track the best fidelity over the final episodes
            if episode >= ep_max - 11:
                fid_10 = max(fid_10, fid)
            break

print('Final_fidelity=', fid_10)
class App:
    def __init__(self, master):
        self.master = master
        # grid map setting
        self.grid_origx = 500
        self.grid_origy = 20
        self.grid_columnNum = 8
        self.grid_rowNum = 8
        self.grid_UNIT = 90
        self.maze_size = self.grid_columnNum * self.grid_rowNum
        # define total training episodes
        self.episode = 1000
        # define number of tests to run
        self.tests = 100
        # set a small amount of delay (seconds) to make sure tkinter works properly;
        # for a slower visualization during testing, increase this delay
        self.timeDelay = 0.005
        # other initialization
        self.n_actions = 4
        self.outline = 'black'
        self.fill = None
        self.item_type = 0
        self.learning = False
        self.itemsNum = 0
        self.epsilon = 0.9
        self.Qtable_origx = self.grid_origx + 20 + (self.grid_columnNum + 1) * self.grid_UNIT
        self.Qtable_origy = self.grid_origy
        self.grid_origx_center = self.grid_origx + self.grid_UNIT / 2
        self.grid_origy_center = self.grid_origy + self.grid_UNIT / 2
        self.Qtable_gridIndex_dict = {}
        self.show_q_table = pd.DataFrame(columns=list(range(self.n_actions)),
                                         dtype=np.float64)
        self.origDist = 10
        self.agentCentre = np.array([[190, 180], [290, 180], [390, 180]])
        self.warehouseCentre = self.agentCentre + np.array(
            [[0, self.grid_UNIT + self.origDist]] * 3)
        self.ObstacleCentre1 = np.array([[725, 515], [725, 335], [635, 695]])
        self.ObstacleCentre2 = np.array([[905, 245], [545, 245], [995, 605]])
        self.itemOrigPosition = []
        self.agentPosition_list = []
        self.warehousePostition_list = []
        self.ObstaclePosition_list = []
        self.WarehouseItemIndex = []
        self.agentItemIndex = []
        self.ObstacleItemIndex = []
        self.AllItemsOrigPosition_list = []
        self.createMark = None
        self.points = []
        self.cars_list = []
        self.selected_agent = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        self.selected_Obstacles = []
        self.selected_targets = []
        self.agent = 1
        self.target = 4
        self.hell1 = 7
        self.hell2 = 8
        self.init_widgets()
        self.temp_item = None
        self.temp_items = []
        self.choose_item = None
        self.created_line = []
        self.lines = []

    def resize(self, w, h, w_box, h_box, pil_image):
        """Resize a pil_image to the given box."""
        return pil_image.resize((w_box, h_box), Image.ANTIALIAS)

    def init_widgets(self):
        self.cv = Canvas(root, background='white')
        self.cv.pack(fill=BOTH, expand=True)
        # bind events of dragging with mouse
        self.cv.bind('<B1-Motion>', self.move)
        self.cv.bind('<ButtonRelease-1>', self.move_end)
        self.cv.bind("<Button-1>", self.leftClick_handler)
        # bind right-click
        self.cv.bind("<Button-3>", self.rightClick_handler)
        f = ttk.Frame(self.master)
        f.pack(fill=X)
        self.bns = []
        # initialize buttons
        for i, lb in enumerate(
                ('Reset', 'Start training', 'Close', 'Save', 'Start Running')):
            bn = Button(f, text=lb, command=lambda i=i: self.choose_type(i))
            bn.pack(side=LEFT, ipadx=8, ipady=5, padx=5)
            self.bns.append(bn)
        self.bns[self.item_type]['relief'] = SUNKEN
        # initialize agent, warehouse and obstacle positions
        self.agentPosition_list = self.setItemsPositionList(self.agentCentre)
        self.warehousePostition_list = self.setItemsPositionList(self.warehouseCentre)
        self.ObstaclePosition_list1 = self.setItemsPositionList(self.ObstacleCentre1)
        # was ObstacleCentre1 in the original: likely a copy-paste bug
        self.ObstaclePosition_list2 = self.setItemsPositionList(self.ObstacleCentre2)
        self.ObstaclePosition_list = (self.ObstaclePosition_list1 +
                                      self.ObstaclePosition_list2)
        self.create_items()
        self.itemsNum = (self.warehouseCentre.shape[0] + self.ObstacleCentre1.shape[0] +
                         self.ObstacleCentre2.shape[0] + self.agentCentre.shape[0])
        R = self.grid_UNIT
        self.cv.create_text(self.agentCentre[0][0] - R - 20, self.agentCentre[0][1],
                            text="Agent:", font=('Courier', 18))
        self.cv.create_text(self.warehouseCentre[0][0] - R - 20, self.warehouseCentre[0][1],
                            text="Warehouse:", font=('Courier', 18))
        self.cv.create_text(self.grid_origx + 250, self.grid_origy - 50,
                            text="Single agent Q-Learning Simulation",
                            font=('Times', 38), fill='red')
        self.cv.create_text(self.grid_origx + 252, self.grid_origy - 52,
                            text="Single agent Q-Learning Simulation",
                            font=('Times', 38), fill='green')
        # draw grids
        self.create_grids(self.grid_origx, self.grid_origy, self.grid_columnNum,
                          self.grid_rowNum, self.grid_UNIT)
        for i in range(0, self.grid_rowNum):
            for j in range(0, self.grid_columnNum):
                x = i * self.grid_UNIT + self.grid_origx_center
                y = j * self.grid_UNIT + self.grid_origy_center
                rowIndex = (y - self.grid_origy_center) / self.grid_UNIT
                columnIndex = (x - self.grid_origx_center) / self.grid_UNIT
                self.Qtable_gridIndex_dict[(x, y)] = (
                    rowIndex * self.grid_columnNum + columnIndex)
        print(self.Qtable_gridIndex_dict)

    def create_ObsItems(self):
        self.cv.arriveObsImage = []
        self.cv.bms_obs = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT
        pil_image = Image.open('obs5.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image1 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image1)
        pil_image = Image.open('obs7.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image2 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image2)
        pil_image = Image.open('obs8.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image3 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image3)
        # reuse the three obstacle images for the second obstacle group
        self.cv.bms_obs.append(tk_image1)
        self.cv.bms_obs.append(tk_image2)
        self.cv.bms_obs.append(tk_image3)
        self.cv.Obstacle = []
        index = 0
        for q in self.ObstacleCentre1:
            bm = self.cv.bms_obs[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.Obstacle.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1
        for q in self.ObstacleCentre2:
            bm = self.cv.bms_obs[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.Obstacle.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1
        # arriving picture
        pil_image = Image.open('obs5_car.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.arriveObsImage.append(tk_image)

    def create_targetItems(self):
        self.cv.arriveImage = []
        self.cv.bms_wh = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT
        pil_image = Image.open('warehouse4_1.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)
        pil_image = Image.open('warehouse3.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)
        pil_image = Image.open('warehouse4_2.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)
        self.cv.warehouse = []
        index = 0
        for q in self.warehouseCentre:
            bm = self.cv.bms_wh[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.warehouse.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1
        # arriving picture
        pil_image = Image.open('warehouse3_car.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.arriveImage.append(tk_image)

    def create_agentItems(self):
        self.cv.bms = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT
        pil_image = Image.open('car9.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)
        pil_image = Image.open('car2.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)
        pil_image = Image.open('car8.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)
        self.cv.car = []
        index = 0
        for q in self.agentCentre:
            bm = self.cv.bms[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.car.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1

    def setItemsPositionList(self, itemCentre):
        # build [x0, y0, x1, y1] bounding boxes, one per item centre
        npTemp = np.hstack((itemCentre, itemCentre))
        h_u = self.grid_UNIT / 2
        npHalfUnit = np.array([-h_u, -h_u, h_u, h_u])
        hs = npHalfUnit
        for diam in range(1, itemCentre.shape[0]):
            hs = np.vstack((npHalfUnit, hs))
        return (npTemp - hs).tolist()

    def button_reset(self):
        time.sleep(self.timeDelay)
        if self.createMark is not None:
            self.cv.delete(self.createMark)
        for line in self.created_line:
            self.cv.delete(line)
        self.cv.coords(self.agent, self.selected_agent_position)
        coords = self.cv.coords(self.agent)
        return coords

    def reset(self):
        """Reset the agent to a random valid location."""
        if self.lines != []:
            for line in self.lines:
                self.cv.delete(line)
        Obs_list = self.ObstaclePosition_list
        while True:
            new_loc = [
                random.randrange(
                    self.grid_origx_center,
                    self.grid_rowNum * self.grid_UNIT + self.grid_origx_center,
                    self.grid_UNIT),
                random.randrange(
                    self.grid_origy_center,
                    self.grid_columnNum * self.grid_UNIT + self.grid_origy_center,
                    self.grid_UNIT)
            ]
            if new_loc not in Obs_list:
                break
        self.cv.coords(self.selected_agent[0], new_loc)
        coords = self.cv.coords(self.selected_agent[0])
        return coords

    def choose_best_action(self, state, terminal):
        """Choose the best action from the Q-table."""
        if terminal == self.cv.coords(self.target):
            q_table = self.q_table
        state_action = q_table.loc[state]
        action = np.random.choice(
            state_action[state_action == np.max(state_action)].index)
        return int(action)

    def run(self):
        """Main function for running tests."""
        test = 0
        rewards = []
        action = -1
        observation = self.cv.coords(self.agent)
        done = 0
        total_reward = 0
        terminal = self.cv.coords(self.target)
        visited = [observation]
        # enhance_list = []
        win_count = 0
        while True:
            self.labelHello = Label(self.cv, text="Test:%s" % str(test),
                                    font=("Helvetica", 10), width=10,
                                    fg="blue", bg="white")
            self.labelHello.place(x=self.agentCentre[0][0] - 150,
                                  y=self.agentCentre[0][1] + 500, anchor=NW)
            time.sleep(self.timeDelay)
            action = self.choose_best_action(str(observation), terminal)
            observation_ = self.calcu_next_state(observation, action)
            reward = self.new_reward(observation_, observation)
            # penalize revisits during testing
            if observation_ in visited:
                reward -= 0.5
            else:
                visited.append(observation_)
            if done:
                observation_ = self.cv.coords(self.target)
            self.cv.coords(self.selected_agent[0], observation_)
            total_reward += reward
            if total_reward < -1:
                done = 1
            if done != 1:
                line = self.cv.create_line(
                    observation[0], observation[1],
                    observation_[0], observation_[1],
                    fill='red',                          # red
                    arrow=LAST, arrowshape=(10, 20, 8),
                    dash=(4, 4))                         # dashed line
                self.lines.append(line)
            observation = observation_
            if self.cv.coords(self.agent) == self.cv.coords(self.target):
                done = 1
            if done:
                action = -1
                visited = []
                total_reward += 1
                if total_reward == 1:
                    win_count += 1
                rewards.append(total_reward)
                total_reward = 0
                self.reset()
                done = 0
                observation = self.cv.coords(self.agent)
                test += 1
                if test > self.tests:
                    self.labelHello = Label(self.cv, text="running end!!",
                                            font=("Helvetica", 10), width=10,
                                            fg="red", bg="white")
                    self.labelHello.place(x=250, y=750, anchor=NW)
                    break
        print("win_count", win_count)
        plt.figure()
        plt.title('Score per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Score')
        plt.plot(rewards)
        plt.show()

    def render(self):
        time.sleep(self.timeDelay)

    def format_time(self, seconds):
        # small utility for printing readable time strings
        if seconds < 400:
            s = float(seconds)
            return "%.1f seconds" % (s,)
        elif seconds < 4000:
            m = seconds / 60.0
            return "%.2f minutes" % (m,)
        else:
            h = seconds / 3600.0
            return "%.2f hours" % (h,)

    def reward(self, s_, s):
        """Rewarding scheme for training."""
        self.target = self.selected_targets[0]
        if s_ == self.cv.coords(self.selected_targets[0]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMarkA = t
            reward = 1
            done = True
        elif s_ in self.selected_Obstacles_position:
            reward = -0.75
            done = False
        else:
            reward = -0.04
            done = False
        return reward, done

    def calcu_next_state(self, loc, action):
        """Calculate the next state from the current location and action."""
        UNIT = self.grid_UNIT
        ss = loc
        np_s = np.array(ss)
        dissS = np.array([self.grid_origx, self.grid_origy])
        s = (np_s - dissS).tolist()
        base_action = np.array([0, 0])
        if action == 0:      # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:    # down
            if s[1] < (self.grid_rowNum - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:    # right
            if s[0] < (self.grid_columnNum - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:    # left
            if s[0] > UNIT:
                base_action[0] -= UNIT
        s_ = [ss[0] + base_action[0], ss[1] + base_action[1]]
        return s_

    def new_reward(self, s_, s):
        """Rewarding scheme for testing."""
        if s_ == self.cv.coords(self.selected_targets[0]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMark = t
            reward = 0
        elif s_ in self.selected_Obstacles_position:
            reward = -2
        else:
            reward = 0
        return reward

    def update(self):
        """Main function for training."""
        self.RL = QLearningTable(actions=list(range(self.n_actions)),
                                 e_greedy=self.epsilon)
        episode = 0
        action = -1
        stepCount = 0
        total_reward_list = []
        avg_reward_list = []
        win_history = []
        observation = self.cv.coords(self.agent)
        visited = set()
        total_reward = 0
        start_time = datetime.datetime.now()
        self.labelHello = Label(self.cv, text="start training!",
                                font=("Helvetica", 10), width=10,
                                fg="red", bg="white")
        self.labelHello.place(x=200, y=750, anchor=NW)
        while True:
            self.labelHello = Label(self.cv, text="episode: %s" % str(episode),
                                    font=("Helvetica", 10), width=10,
                                    fg="blue", bg="white")
            self.labelHello.place(x=200, y=550, anchor=NW)
            self.render()
            visited.add(tuple(observation))
            stepCount += 1
            action = self.RL.choose_action(str(observation))
            observation_ = self.calcu_next_state(observation, action)
            reward, done = self.reward(observation_, observation)
            self.cv.coords(self.selected_agent[0], observation_)
            # penalize revisits and bumping into walls
            if tuple(observation_) in visited:
                reward -= 0.25
            if observation == observation_:
                reward = reward - 0.8
            if done == True:
                win_history.append(1)
            total_reward += reward
            if total_reward < -0.5 * 64:
                done = True
                win_history.append(0)
            self.RL.learn(str(observation), action, reward, str(observation_))
            # swap observation
            observation = observation_
            # break while loop when end of this episode
            if done:
                if episode > self.episode:
                    break
                else:
                    observation = self.reset()
                dt = datetime.datetime.now() - start_time
                t = self.format_time(dt.total_seconds())
                total_reward_list.append(total_reward)
                if len(total_reward_list) > 100:
                    avg_reward = sum(total_reward_list[-100:]) / 100
                    avg_reward_list.append(avg_reward)
                    template = ("Episode: {:03d}/{:d} | StepCount: {:d} | "
                                "Win rate: {:.3f} | Total rewards: {:.3f} | "
                                "Average rewards: {:.3f} | time: {}")
                    print(template.format(episode, self.episode, stepCount,
                                          sum(win_history) / len(win_history),
                                          total_reward, avg_reward, t))
                else:
                    template = ("Episode: {:03d}/{:d} | StepCount: {:d} | "
                                "Win rate: {:.3f} | Total rewards: {:.3f} | time: {}")
                    print(template.format(episode, self.episode, stepCount,
                                          sum(win_history) / len(win_history),
                                          total_reward, t))
                episode += 1
                stepCount = 0
                total_reward = 0
                visited = set()
                done = 0
        # end of training
        print('training over!')
        self.labelHello = Label(self.cv, text="training end!",
                                font=("Helvetica", 10), width=10,
                                fg="red", bg="white")
        self.labelHello.place(x=200, y=750, anchor=NW)
        print("total_win_rate", sum(win_history) / len(win_history))
        print("total_time", t)
        print("average rewards per episode",
              sum(total_reward_list) / len(total_reward_list))
        self.learning = False
        self.reset()
        plt.figure()
        plt.title('Rewards per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(total_reward_list)
        plt.show()
        plt.figure()
        plt.title('Average Rewards over 100 Episodes')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(avg_reward_list)
        plt.show()

    def create_items(self):
        self.AllItemsOrigPosition_list.append([0, 0, 0, 0])
        self.create_agentItems()
        self.agentItemIndex = [1, len(self.agentPosition_list)]
        self.create_targetItems()
        self.WarehouseItemIndex = [
            self.agentItemIndex[1] + 1,
            self.agentItemIndex[1] + len(self.warehousePostition_list)
        ]
        self.create_ObsItems()
        self.ObstacleItemIndex = [
            self.WarehouseItemIndex[1] + 1,
            self.WarehouseItemIndex[1] + len(self.ObstaclePosition_list)
        ]

    def create_grids(self, origx, origy, column, row, UNIT):
        # create grids
        for c in range(origx, origx + (column + 1) * UNIT, UNIT):
            x0, y0, x1, y1 = c, origy, c, origy + row * UNIT
            self.cv.create_line(x0, y0, x1, y1, width=2)
        for r in range(origy, origy + (row + 1) * UNIT, UNIT):
            x0, y0, x1, y1 = origx, r, origx + row * UNIT, r
            self.cv.create_line(x0, y0, x1, y1, width=2)

    def choose_type(self, i):
        """Handle clicks on the different buttons."""
        for b in self.bns:
            b['relief'] = RAISED
        self.bns[i]['relief'] = SUNKEN
        self.item_type = i
        if self.item_type == 1:
            # start training
            self.start_learning()
            self.bns[i]['relief'] = RAISED
        elif self.item_type == 2:
            # close simulation tool
            os._exit(0)
        elif self.item_type == 3:
            # save q_table
            temp_s = str(self.cv.coords(self.target)) + str(
                self.selected_Obstacles_position)
            self.RL.q_table.to_csv("single_qtable_%s.csv" % temp_s,
                                   index_label="index_label")
            print("SAVED!!!")
            self.labelHello = Label(self.cv, text="table saved!!",
                                    font=("Helvetica", 10), width=10,
                                    fg="red", bg="white")
            self.labelHello.place(x=350, y=750, anchor=NW)
        elif self.item_type == 0:
            self.button_reset()
        elif self.item_type == 4:
            # start running tests
            self.start_running()
        elif self.item_type == 5:
            self.restart()

    def start_learning(self):
        """Initialization for the training process."""
        self.selected_agent = []
        self.selected_targets = []
        self.selected_Obstacles = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        for item in range(1, self.itemsNum + 1):
            p = self.cv.coords(item)
            if p[0] >= self.grid_origx and p[1] >= self.grid_origy:
                if item in range(self.agentItemIndex[0], self.agentItemIndex[1] + 1):
                    self.selected_agent.append(item)
                    self.selected_agent_position = p
                elif item in range(self.WarehouseItemIndex[0],
                                   self.WarehouseItemIndex[1] + 1):
                    self.selected_targets.append(item)
                elif item in range(self.ObstacleItemIndex[0],
                                   self.ObstacleItemIndex[1] + 1):
                    self.selected_Obstacles.append(item)
                    self.selected_Obstacles_position.append(p)
        if len(self.selected_agent) == 0 or len(self.selected_agent) > 1:
            tkinter.messagebox.showinfo(
                "INFO", "Please choose ONE agent for training!")
        elif len(self.selected_targets) == 0 or len(self.selected_targets) > 1:
            tkinter.messagebox.showinfo(
                "INFO", "Please choose ONE target for training!")
        else:
            self.agent = self.selected_agent[0]
            self.target = self.selected_targets[0]
            self.t = threading.Timer(self.timeDelay, self.update)
            self.t.start()
            self.learning = True

    def start_running(self):
        """Initialization for testing."""
        self.selected_agent = []
        self.selected_targets = []
        self.selected_Obstacles = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        self.selected_targets_position = []
        for item in range(1, self.itemsNum + 1):
            p = self.cv.coords(item)
            if p[0] >= self.grid_origx and p[1] >= self.grid_origy:
                if item in range(self.agentItemIndex[0], self.agentItemIndex[1] + 1):
                    self.selected_agent.append(item)
                    self.selected_agent_position = p
                elif item in range(self.WarehouseItemIndex[0],
                                   self.WarehouseItemIndex[1] + 1):
                    self.selected_targets.append(item)
                    self.selected_targets_position = p
                elif item in range(self.ObstacleItemIndex[0],
                                   self.ObstacleItemIndex[1] + 1):
                    self.selected_Obstacles.append(item)
                    self.selected_Obstacles_position.append(p)
        if len(self.selected_agent) <= 0 or len(self.selected_agent) > 1:
            tkinter.messagebox.showinfo("INFO", "Please place ONE agent on map!")
        elif len(self.selected_targets) == 0 or len(self.selected_targets) > 1:
            tkinter.messagebox.showinfo("INFO", "Please choose ONE terminal!")
        else:
            self.agent = self.selected_agent[0]
            self.target = self.selected_targets[0]
            # load Q table
            terminal_str = str(self.selected_targets_position) + str(
                self.selected_Obstacles_position) + 'episode3000'
            self.q_table = pd.read_csv("table terminal%s.csv" % terminal_str,
                                       index_col=0)
            self.t = threading.Timer(self.timeDelay, self.run)
            self.t.start()
            self.learning = True

    def rightClick_handler(self, event):
        self.start_learning()

    def leftClick_handler(self, event):
        """Bind events for choosing items on the canvas."""
        if self.learning:
            print("Learning in progress!")
        else:
            for item in range(1, self.itemsNum + 1):
                position = self.cv.coords(item)
                R = self.grid_UNIT / 2
                p = [position[0] - R, position[1] - R,
                     position[0] + R, position[1] + R]
                if (event.x >= p[0] and event.x <= p[2] and
                        event.y >= p[1] and event.y <= p[3]):
                    t = item
                    self.choose_item_handler(event, t)

    def choose_item_handler(self, event, t):
        self.choose_item = t
        self.itemOrigPosition = self.cv.coords(t)

    def move(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            self.cv.coords(t, event.x, event.y)

    def adjust_items_into_grids(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            position = self.cv.coords(t)
            centerX = position[0]
            centerY = position[1]
            Grids_X0 = self.grid_origx
            Grids_X1 = self.grid_origx + (self.grid_columnNum + 1) * self.grid_UNIT
            Grids_Y0 = self.grid_origy
            Grids_Y1 = self.grid_origy + (self.grid_rowNum + 1) * self.grid_UNIT
            if (centerX in range(Grids_X0, Grids_X1)) and (
                    centerY in range(Grids_Y0, Grids_Y1)):
                columnIndex = math.floor((centerX - Grids_X0) / self.grid_UNIT)
                rowIndex = math.floor((centerY - Grids_Y0) / self.grid_UNIT)
                adjustedX0 = (Grids_X0 + columnIndex * self.grid_UNIT +
                              self.grid_UNIT / 2)
                adjustedY0 = (Grids_Y0 + rowIndex * self.grid_UNIT +
                              self.grid_UNIT / 2)
                self.cv.coords(t, adjustedX0, adjustedY0)
            else:
                # return to original position if not dropped near the grids
                self.cv.coords(t, self.AllItemsOrigPosition_list[t])
                self.itemOrigPosition = []

    def move_end(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            self.adjust_items_into_grids(event)
            self.choose_item = None

    def delete_item(self, event):
        if self.choose_item is not None:
            self.cv.delete(self.choose_item)
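# The App class above draws onto a module-level `root` window (see
# `Canvas(root, ...)` in init_widgets) and uses several libraries it never
# imports in this excerpt. Below is a hypothetical launch harness under those
# assumptions; the project's real entry point is not shown, so the window
# title and geometry are illustrative.
import os
import math
import time
import random
import datetime
import threading
import tkinter
import tkinter.messagebox
from tkinter import (Canvas, Button, Label, ttk, BOTH, X, LEFT, NW,
                     SUNKEN, RAISED, LAST)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image, ImageTk

# from RL_brain import QLearningTable  # tabular agent assumed by App.update()

if __name__ == '__main__':
    root = tkinter.Tk()      # App.init_widgets() draws onto this global window
    root.title('Q-Learning grid simulation')
    root.geometry('1400x800')
    app = App(root)
    root.mainloop()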
r = '50000'
save_list = [100, 50000]  # ,10000,50000,100000,200000,300000,400000,500000,600000,700000,800000,900000,1000000
train = True
env = envR(show=False)
RL = QLearningTable(env.action_space, learning_rate=0.1)
# step = 0
# succ = 0
# start = time.time()
for episode in range(int(r)):
    pre_maps = env.reset()
    for i in range(100):
        action = RL.choose_action(str(pre_maps), train)
        reward, done, action_ = env.step(action)
        RL.learn(str(pre_maps), action, reward, str(env.get_maps()), done)
        pre_maps = env.get_maps()
        if done:
            break
        # step += 1
    print(episode + 1)
    if (episode + 1) in save_list:
        print("This is", episode + 1)
        test(RL)
print('Training Over!')
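# The loop above calls a test(RL) helper that is not shown in this excerpt.
# A plausible sketch follows, assuming it simply rolls out the learned policy
# with exploration disabled (the `train` flag passed to choose_action above
# suggests such a switch); the project's real test() may differ.
def test(RL, episodes=100, max_steps=100):
    """Hypothetical evaluation harness: roll out the learned policy greedily."""
    env = envR(show=False)
    wins = 0
    for _ in range(episodes):
        pre_maps = env.reset()
        for _ in range(max_steps):
            # train=False is assumed to disable epsilon-greedy exploration
            action = RL.choose_action(str(pre_maps), False)
            reward, done, action_ = env.step(action)
            pre_maps = env.get_maps()
            if done:
                wins += 1
                break
    print('greedy success rate: {:.2%}'.format(wins / episodes))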
def update(self):
    # TODO: Start_Point & End_Point to be supplied externally
    for i in range(166, 288):
        np.random.seed(i)
        start_point = np.random.randint(0, 800)
        end_point = np.random.randint(801, 1725)
        RL = QLearningTable(self.actions)
        env = Cross(self.next_state_list, self.action_list, self.distance_list,
                    start_point, end_point, self.cross_info)
        # update block
        time_start = time.time()
        for episode in range(100):
            # simulated-annealing schedule for the exploration rate
            T = 1000
            epsilon, T = tools.SA(T, episode, 100, 0.95)
            RL.epsilon = epsilon
            if epsilon > 1:
                print("yes")
                print(epsilon)
            episode_start_time = time.time()
            plt.ion()
            observation = env.start_point
            prior_state = observation
            while True:
                index = RL.choose_action(observation, env, 1)
                observation_, reward, done = env.step(observation, index,
                                                      prior_state)
                # visualization (disabled):
                # plt.clf()
                # plt.scatter(self.x[start_point], self.y[start_point], marker='o',
                #             s=100, label='start_point', c='yellow')
                # plt.scatter(self.x[end_point], self.y[end_point], marker='^',
                #             s=100, label='end_point', c='yellow')
                # plt.scatter(self.x, self.y, s=15, alpha=0.3, c='green')
                # if observation_ == 'end_point':
                #     plt.scatter(self.x[end_point], self.y[end_point], s=15, c='red')
                # elif observation_ == 'terminal':
                #     plt.scatter(self.x[observation], self.y[observation], s=15, c='yellow')
                # else:
                #     plt.scatter(self.x[observation_], self.y[observation_], s=15, c='red')
                # plt.pause(0.01)
                # plt.ioff()
                q_table = RL.learn(observation, index, reward, observation_, 1)
                prior_state = observation
                observation = observation_
                # bail out if an episode runs longer than 60 seconds
                current_time = time.time()
                if current_time - episode_start_time > 60:
                    break
                if done:
                    break
            episode_end_time = time.time()
            print('==========================================')
            print(episode + 1, "th episode is completed, time cost:",
                  episode_end_time - episode_start_time)
            print('==========================================')
            print(q_table)
        time_end = time.time()
        print('totally completed, time cost:', time_end - time_start)
        table_dir = os.getcwd() + '/table_' + str(configuration.Omega)
        if not os.path.exists(table_dir):
            os.makedirs(table_dir)
        q_table.to_csv(table_dir + '/' + configuration.CITY + '_' +
                       str(start_point) + '_' + str(end_point) + '_' +
                       'q_table.csv', encoding="utf-8")
    # (inside the NSGA-II offspring loop: the RL agent picks a crossover operator)
    action = RL.choose_action(observation)
    # e.g. child = crossover_1(a1, b1): the action name selects the operator
    child = globals()[action](a1, b1)
    solution3[counter] = child
    observation_ = str([int(i) for i in solution3])
    # reward is the summed (negated) objective values of the offspring population
    fitness1 = 0 - sum(function1(i) for i in solution3)
    fitness2 = 0 - sum(function2(i) for i in solution3)
    reward = fitness1 + fitness2
    RL.learn(observation, action, reward, observation_)
    counter += 1
solution2 = solution + solution3
# generate new parents
function1_values2 = [function1(solution2[i])
                     for i in range(0, 2 * POPULATION_SIZE)]
function2_values2 = [function2(solution2[i])
                     for i in range(0, 2 * POPULATION_SIZE)]
non_dominated_sorted_solution2 = fast_non_dominated_sort(
    function1_values2[:], function2_values2[:])
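# Looking up the crossover operator via globals()[action] works only when the
# Q-table's actions are exactly the operators' function names. A safer pattern,
# sketched here with hypothetical operator names (not the project's actual
# operators), is an explicit dispatch table:
def crossover_one_point(a, b):
    # one-point crossover: first half of a, second half of b
    cut = len(a) // 2
    return a[:cut] + b[cut:]

def crossover_uniform(a, b):
    # uniform crossover: alternate genes from the two parents
    return [x if i % 2 == 0 else y for i, (x, y) in enumerate(zip(a, b))]

CROSSOVER_OPS = {
    'crossover_one_point': crossover_one_point,
    'crossover_uniform': crossover_uniform,
}

# inside the loop, replacing globals()[action](a1, b1):
# child = CROSSOVER_OPS[action](a1, b1)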
def update_realtime(self):
    # earlier error_point lists from previous runs, kept for reference:
    # error_point = [256, 512, 768, 3, 5, 778, 138, 779, 655, 786, 789, 793, 155, 34, 675,
    #                420, 293, 424, 169, 428, 301, 173, 431, 49, 306, 182, 439, 701, 189,
    #                65, 322, 199, 456, 457, 461, 725, 599, 345, 732, 734, 351, 98, 485,
    #                742, 104, 490, 620, 750, 240, 753, 626, 116, 380]
    # error_point = [750, 240, 189, 155, 199, 485, 306, 457, 380, 626, 116, 461]
    error_point = [
        512, 5, 138, 779, 280, 155, 34, 675, 420, 424, 301, 430, 306, 439,
        701, 189, 317, 63, 322, 199, 457, 461, 589, 725, 215, 599, 345, 732,
        351, 609, 485, 620, 240, 626, 380
    ]
    error_list = []
    # TODO: Start_Point & End_Point to be supplied externally
    delay_df = pd.DataFrame(columns=('s_e', 'start_point', 'end_point',
                                     'transfer', 'queue', 'process'))
    # delay_df = delay_df.append({'s_e': 'TASK_SIZE:' + str(configuration.TASK_SIZE),
    #                             'start_point': 'CPU_CLOCK' + str(configuration.CPU_CLOCK),
    #                             'end_point': 'VEHICLE_POWER' + str(configuration.VEHICLE_POWER),
    #                             'transfer': 0, 'queue': 0, 'process': 0},
    #                            ignore_index=True)
    # x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    cost_list = []
    # for z in range(10):
    time_start = time.time()
    count = 0
    e_count = 0
    for i in range(166, 288):
        flag = False
        # fix the random seed so the points match the first training run
        np.random.seed(i)
        start_point = np.random.randint(0, 800)
        if start_point in error_point:
            continue
        count += 1
        end_point = np.random.randint(801, 1725)
        print(start_point, '-->', end_point)
        # load the Q-table previously saved to disk
        df_q_table = pd.read_csv(
            os.getcwd() + '/table_' + str(self.omega) + '/' +
            configuration.CITY + '_' + str(start_point) + '_' +
            str(end_point) + '_' + 'q_table.csv',
            encoding="utf-8")
        df_q_table = df_q_table.set_index(['Unnamed: 0'])
        df_q_table = df_q_table[['1', '2', '3', '4']].astype(np.float64)
        RL = QLearningTable(self.actions)
        RL.gamma = configuration.VEHICLE_POWER
        # RL.epsilon = 0.95  # set the greedy policy to a fixed value
        # swap in the loaded Q-table
        RL.q_table = df_q_table
        env = Cross_2th(self.next_state_list, self.action_list,
                        self.distance_list, start_point, end_point,
                        self.cross_info, self.tel_list, self.df_tel, self.omega)
        # update block
        index_for = 0
        # running sums for averaging the delays over the episode loop
        delay_for_sum = 0
        transfer_for_sum = 0
        queue_for_sum = 0
        process_for_sum = 0
        for episode in range(10):
            # simulated-annealing schedule for the exploration rate
            T = 1000
            epsilon, T = tools.SA(T, episode, 10, 0.95)
            RL.epsilon = epsilon
            if epsilon > 1:
                print("yes")
            one_episode_start_time = time.time()
            observation = env.start_point
            prior_state = observation
            index_while = 0
            # running sums for averaging the delays within one episode
            delay_while_sum = 0
            transfer_while_sum = 0
            queue_while_sum = 0
            process_while_sum = 0
            while True:
                index = RL.choose_action(observation, env, 2)
                observation_, reward, done, tel_delay, transfer_time, \
                    queue_time, process_time = env.step_2th(observation, index,
                                                            prior_state)
                index_while += 1
                delay_while_sum += tel_delay
                transfer_while_sum += transfer_time
                queue_while_sum += queue_time
                process_while_sum += process_time
                # bail out if stuck in a local optimum
                current_time = time.time()
                if current_time - one_episode_start_time > 10:
                    flag = True
                    e_count += 1
                    print('error:', start_point, 'x--x', end_point)
                    # if observation not in error_list:
                    #     error_list.append(start_point)
                    break
                # plotting block (disabled):
                # plt.clf()
                # plt.scatter(self.x[start_point], self.y[start_point], marker='o',
                #             s=100, label='start_point', c='yellow')
                # plt.scatter(self.x[end_point], self.y[end_point], marker='^',
                #             s=100, label='end_point', c='yellow')
                # plt.scatter(self.x, self.y, s=15, alpha=0.3, c='green')
                # if observation_ == 'end_point':
                #     plt.scatter(self.x[end_point], self.y[end_point], s=15, c='red')
                # elif observation_ == 'terminal':
                #     plt.scatter(self.x[observation], self.y[observation], s=15, c='yellow')
                # else:
                #     plt.scatter(self.x[observation_], self.y[observation_], s=15, c='red')
                # plt.pause(0.1)
                # plt.ioff()
                # df_q_table = RL.learn(observation, index, reward, observation_, 2)
                prior_state = observation
                observation = observation_
                current_time = time.time()
                if done:
                    break
            delay_while_avg = delay_while_sum / index_while
            transfer_while_avg = transfer_while_sum / index_while
            queue_while_avg = queue_while_sum / index_while
            process_while_avg = process_while_sum / index_while
            index_for += 1
            delay_for_sum += delay_while_avg
            transfer_for_sum += transfer_while_avg
            queue_for_sum += queue_while_avg
            process_for_sum += process_while_avg
            one_episode_end_time = time.time()
            if flag:
                break
        delay_avg = delay_for_sum / index_for
        transfer_avg = transfer_for_sum / index_for
        queue_avg = queue_for_sum / index_for
        process_avg = process_for_sum / index_for
        delay_df = delay_df.append(
            {
                's_e': str(start_point) + '_' + str(end_point),
                'start_point': start_point,
                'end_point': end_point,
                'transfer': transfer_avg,
                'queue': queue_avg,
                'process': process_avg
            },
            ignore_index=True)
        dir_path = (os.getcwd() + '/table_realtime_Ω_' + str(self.omega) +
                    '_ts_' + str(configuration.TASK_SIZE) +
                    '_cc_' + str(configuration.CPU_CLOCK) +
                    '_vp_' + str(configuration.VEHICLE_POWER))
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
            os.makedirs(dir_path + '/time_cost/')
        df_q_table.to_csv(dir_path + '/' + configuration.CITY + '_' +
                          str(start_point) + '_' + str(end_point) +
                          '_realtime_q_table.csv', encoding="utf-8")
        delay_df.to_csv(dir_path + '/time_cost/' + 'TASK_SIZE_' +
                        str(configuration.TASK_SIZE) + '_CPU_CLOCK_' +
                        str(configuration.CPU_CLOCK) + '_VEHICLE_POWER_' +
                        str(configuration.VEHICLE_POWER) + '_time_cost.csv',
                        encoding="utf-8")
        # break out of the z loop (disabled):
        # if count - e_count == 5 * (z + 1):
        #     break
    time_end = time.time()
    # subtract the 10-second timeout spent on each errored pair
    time_cost = time_end - time_start - e_count * 10
    c_minus = count - e_count
    # cost_pre = time_cost * (round(10 / (count - e_count), 3))
    print('totally completed, time cost:', time_cost)
    print('==========================================')
    cost_list.append(time_cost)
    print(cost_list)
class App:
    def __init__(self, master):
        self.master = master
        # grid map setting
        self.grid_origx = 500
        self.grid_origy = 20
        self.grid_columnNum = 8
        self.grid_rowNum = 8
        self.grid_UNIT = 90
        self.maze_size = self.grid_columnNum * self.grid_rowNum
        # define total training episodes
        self.episode = 5000
        # define number of tests to run
        self.tests = 100
        # set a small amount of delay (seconds) to make sure tkinter works properly;
        # for a slower visualization during testing, increase this delay
        self.timeDelay = 0.0005
        # other initialization
        self.n_actions = 4
        self.outline = 'black'
        self.fill = None
        self.item_type = 0
        self.learning = False
        self.itemsNum = 0
        self.epsilon = 0.9
        self.Qtable_origx = self.grid_origx + 20 + (self.grid_columnNum + 1) * self.grid_UNIT
        self.Qtable_origy = self.grid_origy
        self.grid_origx_center = self.grid_origx + self.grid_UNIT / 2
        self.grid_origy_center = self.grid_origy + self.grid_UNIT / 2
        self.grid_endx = self.grid_origx + self.grid_columnNum * self.grid_UNIT
        self.grid_endy = self.grid_origy + self.grid_rowNum * self.grid_UNIT
        self.Qtable_gridIndex_dict = {}
        self.show_q_table = pd.DataFrame(columns=list(range(self.n_actions)),
                                         dtype=np.float64)
        self.origDist = 10
        self.agentCentre = np.array([[190, 180], [290, 180], [390, 180]])
        self.warehouseCentre = self.agentCentre + np.array(
            [[0, self.grid_UNIT + self.origDist]] * 3)
        self.ObstacleCentre1 = np.array([[725, 515], [725, 335], [635, 695]])
        self.ObstacleCentre2 = np.array([[905, 245], [545, 245], [995, 605]])
        self.itemOrigPosition = []
        self.agentPosition_list = []
        self.warehousePostition_list = []
        self.ObstaclePosition_list = []
        self.WarehouseItemIndex = []
        self.agentItemIndex = []
        self.ObstacleItemIndex = []
        self.AllItemsOrigPosition_list = []
        self.createMark = None
        self.points = []
        self.cars_list = []
        self.selected_agent = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        self.selected_Obstacles = []
        self.selected_targets = []
        self.agent = 1
        self.target = 4
        self.hell1 = 7
        self.hell2 = 8
        self.init_widgets()
        self.temp_item = None
        self.temp_items = []
        self.choose_item = None
        self.createMarkA = None
        self.createMarkB = None
        self.linesA = []
        self.linesB = []
        self.created_line = []  # referenced by button_reset(); missing in the original

    def resize(self, w, h, w_box, h_box, pil_image):
        """Resize a pil_image to the given box."""
        return pil_image.resize((w_box, h_box), Image.ANTIALIAS)

    def init_widgets(self):
        self.cv = Canvas(root, background='white')
        self.cv.pack(fill=BOTH, expand=True)
        # bind events of dragging with mouse
        self.cv.bind('<B1-Motion>', self.move)
        self.cv.bind('<ButtonRelease-1>', self.move_end)
        self.cv.bind("<Button-1>", self.leftClick_handler)
        # bind right-click
        self.cv.bind("<Button-3>", self.rightClick_handler)
        f = ttk.Frame(self.master)
        f.pack(fill=X)
        self.bns = []
        # initialize buttons
        for i, lb in enumerate(
                ('Reset', 'Start training', 'Close', 'Save', 'Start Running')):
            bn = Button(f, text=lb, command=lambda i=i: self.choose_type(i))
            bn.pack(side=LEFT, ipadx=8, ipady=5, padx=5)
            self.bns.append(bn)
        self.bns[self.item_type]['relief'] = SUNKEN
        # initialize agent, warehouse and obstacle positions
        self.agentPosition_list = self.setItemsPositionList(self.agentCentre)
        self.warehousePostition_list = self.setItemsPositionList(self.warehouseCentre)
        self.ObstaclePosition_list1 = self.setItemsPositionList(self.ObstacleCentre1)
        # was ObstacleCentre1 in the original: likely a copy-paste bug
        self.ObstaclePosition_list2 = self.setItemsPositionList(self.ObstacleCentre2)
        self.ObstaclePosition_list = (self.ObstaclePosition_list1 +
                                      self.ObstaclePosition_list2)
        self.create_items()
        self.itemsNum = (self.warehouseCentre.shape[0] + self.ObstacleCentre1.shape[0] +
                         self.ObstacleCentre2.shape[0] + self.agentCentre.shape[0])
        R = self.grid_UNIT
        self.cv.create_text(self.agentCentre[0][0] - R - 20, self.agentCentre[0][1],
                            text="Agent:", font=('Courier', 18))
        self.cv.create_text(self.warehouseCentre[0][0] - R - 20, self.warehouseCentre[0][1],
                            text="Warehouse:", font=('Courier', 18))
        self.cv.create_text(self.grid_origx + 250, self.grid_origy - 50,
                            text="Single agent Q-Learning Simulation",
                            font=('Times', 38), fill='red')
        self.cv.create_text(self.grid_origx + 252, self.grid_origy - 52,
                            text="Single agent Q-Learning Simulation",
                            font=('Times', 38), fill='green')
        # draw grids
        self.create_grids(self.grid_origx, self.grid_origy, self.grid_columnNum,
                          self.grid_rowNum, self.grid_UNIT)
        for i in range(0, self.grid_rowNum):
            for j in range(0, self.grid_columnNum):
                x = i * self.grid_UNIT + self.grid_origx_center
                y = j * self.grid_UNIT + self.grid_origy_center
                rowIndex = (y - self.grid_origy_center) / self.grid_UNIT
                columnIndex = (x - self.grid_origx_center) / self.grid_UNIT
                self.Qtable_gridIndex_dict[(x, y)] = (
                    rowIndex * self.grid_columnNum + columnIndex)
        print(self.Qtable_gridIndex_dict)

    def create_ObsItems(self):
        self.cv.arriveObsImage = []
        self.cv.bms_obs = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT
        pil_image = Image.open('obs5.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image1 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image1)
        pil_image = Image.open('obs7.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image2 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image2)
        pil_image = Image.open('obs8.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image3 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image3)
        # reuse the three obstacle images for the second obstacle group
        self.cv.bms_obs.append(tk_image1)
        self.cv.bms_obs.append(tk_image2)
        self.cv.bms_obs.append(tk_image3)
        self.cv.Obstacle = []
        index = 0
        for q in self.ObstacleCentre1:
            bm = self.cv.bms_obs[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.Obstacle.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1
        for q in self.ObstacleCentre2:
            bm = self.cv.bms_obs[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.Obstacle.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1
        # arriving picture
        pil_image = Image.open('obs5_car.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.arriveObsImage.append(tk_image)

    def create_targetItems(self):
        self.cv.arriveImage = []
        self.cv.bms_wh = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT
        pil_image = Image.open('warehouse4_1.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)
        pil_image = Image.open('warehouse3.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)
        pil_image = Image.open('warehouse4_2.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)
        self.cv.warehouse = []
        index = 0
        for q in self.warehouseCentre:
            bm = self.cv.bms_wh[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.warehouse.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1
        # arriving picture
        pil_image = Image.open('warehouse3_car.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.arriveImage.append(tk_image)

    def create_agentItems(self):
        self.cv.bms = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT
        pil_image = Image.open('car9.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)
        pil_image = Image.open('car2.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)
        pil_image = Image.open('car8.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)
        self.cv.car = []
        index = 0
        for q in self.agentCentre:
            bm = self.cv.bms[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.car.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1

    def setItemsPositionList(self, itemCentre):
        # build [x0, y0, x1, y1] bounding boxes, one per item centre
        npTemp = np.hstack((itemCentre, itemCentre))
        h_u = self.grid_UNIT / 2
        npHalfUnit = np.array([-h_u, -h_u, h_u, h_u])
        hs = npHalfUnit
        for diam in range(1, itemCentre.shape[0]):
            hs = np.vstack((npHalfUnit, hs))
        return (npTemp - hs).tolist()

    def button_reset(self):
        time.sleep(self.timeDelay)
        for line in self.created_line:
            self.cv.delete(line)
        self.cv.coords(self.agentA, self.selected_agent_position[0])
        self.cv.coords(self.agentB, self.selected_agent_position[1])

    def reset(self, agentIndex):
        """Reset the given agent to a random valid location."""
        if agentIndex == 0:
            if self.linesA != []:
                for line in self.linesA:
                    self.cv.delete(line)
            if self.createMarkA is not None:
                self.cv.delete(self.createMarkA)
        if agentIndex == 1:
            if self.linesB != []:
                for line in self.linesB:
                    self.cv.delete(line)
            if self.createMarkB is not None:
                self.cv.delete(self.createMarkB)
        if agentIndex != 0 and agentIndex != 1:
            raise Exception("agentIndex Error in reset()!")
        Obs_list = [[725.0, 515.0], [725.0, 335.0], [635.0, 695.0],
                    [905.0, 245.0], [545.0, 245.0], [995.0, 605.0]]
        while True:
            new_loc = [
                random.randrange(
                    self.grid_origx_center,
                    self.grid_rowNum * self.grid_UNIT + self.grid_origx_center,
                    self.grid_UNIT),
                random.randrange(
                    self.grid_origy_center,
                    self.grid_columnNum * self.grid_UNIT + self.grid_origy_center,
                    self.grid_UNIT)
            ]
            if new_loc not in Obs_list:
                break
        self.cv.coords(self.selected_agent[agentIndex], new_loc)
        coords = self.cv.coords(self.selected_agent[agentIndex])
        return coords

    def reward_a(self, s_, B_s_, s, s_B):
        """Rewarding scheme for agentA."""
        self.targetA = self.selected_targets[0]
        if s_ == self.cv.coords(self.selected_targets[0]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMarkA = t
            reward = 1
            done = True
        elif s_ in self.selected_Obstacles_position:
            reward = -0.75
            done = False
        elif s_ == B_s_:
            # collision with the other agent's next cell
            reward = -0.75
            done = False
        elif s_ == s_B and B_s_ == s:
            # the two agents swap cells
            reward = -0.75
            done = False
        else:
            reward = -0.04
            done = False
        return reward, done

    def reward_b(self, s_, A_s_, s, s_A):
        """Rewarding scheme for agentB."""
        self.targetB = self.selected_targets[1]
        if s_ == self.cv.coords(self.selected_targets[1]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMarkB = t
            reward = 1
            done = True
        elif s_ in self.selected_Obstacles_position:
            reward = -0.75
            done = False
        elif s_ == A_s_:
            # collision with the other agent's next cell
            reward = -0.75
            done = False
        elif s_ == s_A and A_s_ == s:
            # the two agents swap cells
            reward = -0.75
            done = False
        else:
            reward = -0.04
            done = False
        return reward, done

    def real_step(self, A_s_, B_s_):
        self.cv.coords(self.selected_agent[0], A_s_)  # move agent A
        self.cv.coords(self.selected_agent[1], B_s_)  # move agent B
        return

    def format_time(self, seconds):
        # small utility for printing readable time strings
        if seconds < 400:
            s = float(seconds)
            return "%.1f seconds" % (s,)
        elif seconds < 4000:
            m = seconds / 60.0
            return "%.2f minutes" % (m,)
        else:
            h = seconds / 3600.0
            return "%.2f hours" % (h,)

    def update(self):
        """Main function for training."""
        self.RL_A = QLearningTable(actions=list(range(self.n_actions)),
                                   e_greedy=self.epsilon)
        self.RL_B = QLearningTable(actions=list(range(self.n_actions)),
                                   e_greedy=self.epsilon)
        episodeA = 0
        episodeB = 0
        action_B = -1
        action_A = -1
        UNIT = self.grid_UNIT
        stepCountA = 0
        stepCountB = 0
        total_reward_listA = []
        total_reward_listB = []
        avg_reward_listA = []
        avg_reward_listB = []
        win_historyA = []  # history of win/lose games
        win_historyB = []  # history of win/lose games
        # initial observation
        observation_A = self.cv.coords(self.agentA)
        observation_B = self.cv.coords(self.agentB)
        visitedA = set()
        visitedB = set()
        total_rewardA = 0
        total_rewardB = 0
        start_time = datetime.datetime.now()
        while True:
            self.labelHello = Label(self.cv, text="episodeA: %s" % str(episodeA),
                                    font=("Helvetica", 10), width=10,
                                    fg="red", bg="white")
            self.labelHello.place(x=200, y=550, anchor=NW)
            self.labelHello = Label(self.cv, text="episodeB: %s" % str(episodeB),
                                    font=("Helvetica", 10), width=10,
                                    fg="blue", bg="white")
            self.labelHello.place(x=200, y=580, anchor=NW)
            # fresh env
            self.render()
            visitedA.add(tuple(observation_A))
            visitedB.add(tuple(observation_B))
            stepCountA += 1
            stepCountB += 1
            # when the agents are within one step of each other, extend the
            # observation with the other agent's action and position
            distance = ((observation_A[0] - observation_B[0])**2 +
                        (observation_A[1] - observation_B[1])**2)
            if distance == UNIT**2 or distance == 2 * (UNIT**2):
                observation_A_new = [action_B, observation_B, observation_A]
                observation_B_new = [action_A, observation_A, observation_B]
            else:
                observation_A_new = observation_A
                observation_B_new = observation_B
            action_A = self.RL_A.choose_action(str(observation_A_new))
            action_B = self.RL_B.choose_action(str(observation_B_new))
            A_observation_ = self.calcu_next_state(observation_A, action_A)
            B_observation_ = self.calcu_next_state(observation_B, action_B)
            reward_A, done_A = self.reward_a(A_observation_, B_observation_,
                                             observation_A, observation_B)
            reward_B, done_B = self.reward_b(B_observation_, A_observation_,
                                             observation_B, observation_A)
            self.real_step(A_observation_, B_observation_)
            # penalize revisits and bumping into walls
            if tuple(A_observation_) in visitedA:
                reward_A -= 0.25
            if tuple(B_observation_) in visitedB:
                reward_B -= 0.25
            if observation_A == A_observation_:
                reward_A = reward_A - 0.8
            if observation_B == B_observation_:
                reward_B = reward_B - 0.8
            if done_A == True:
                win_historyA.append(1)
            if done_B == True:
                win_historyB.append(1)
            total_rewardA += reward_A
            if total_rewardA < -0.5 * self.maze_size:
                done_A = True
                win_historyA.append(0)
            total_rewardB += reward_B
            if total_rewardB < -0.5 * self.maze_size:
                done_B = True
                win_historyB.append(0)
            distance = ((A_observation_[0] - B_observation_[0])**2 +
                        (A_observation_[1] - B_observation_[1])**2)
            if distance == UNIT**2 or distance == 2 * (UNIT**2):
                observation_A_new_ = [action_B, observation_B, A_observation_]
                observation_B_new_ = [action_A, observation_A, B_observation_]
            else:
                observation_A_new_ = A_observation_
                observation_B_new_ = B_observation_
            self.RL_A.learn(str(observation_A_new), action_A, reward_A,
                            str(observation_A_new_))
            self.RL_B.learn(str(observation_B_new), action_B, reward_B,
                            str(observation_B_new_))
            observation_A = A_observation_
            observation_B = B_observation_
            # break while loop when end of this episode
            if done_A:
                if episodeA > self.episode and episodeB > self.episode:
                    break
                else:
                    observation_A = self.reset(0)
                dt = datetime.datetime.now() - start_time
                t = self.format_time(dt.total_seconds())
                total_reward_listA.append(total_rewardA)
                if len(total_reward_listA) > 100:
                    avg_rewardA = sum(total_reward_listA[-100:]) / 100
                    avg_reward_listA.append(avg_rewardA)
                    template = ("Episode(A): {:03d}/{:d} | StepCount: {:d} | "
                                "Win rate: {:.3f} | Total rewards: {:.3f} | "
                                "Average rewards: {:.3f} | time: {}")
                    print(template.format(episodeA, self.episode, stepCountA,
                                          sum(win_historyA) / len(win_historyA),
                                          total_rewardA, avg_rewardA, t))
                else:
                    template = ("Episode(A): {:03d}/{:d} | StepCount: {:d} | "
                                "Win rate: {:.3f} | Total rewards: {:.3f} | time: {}")
                    print(template.format(episodeA, self.episode, stepCountA,
                                          sum(win_historyA) / len(win_historyA),
                                          total_rewardA, t))
                episodeA += 1
                stepCountA = 0
                total_rewardA = 0
                visitedA = set()
                done_A = 0
            if done_B:
                if episodeA > self.episode and episodeB > self.episode:
                    break
                else:
                    observation_B = self.reset(1)
                dt = datetime.datetime.now() - start_time
                t = self.format_time(dt.total_seconds())
                total_reward_listB.append(total_rewardB)
                if len(total_reward_listB) > 100:
                    avg_rewardB = sum(total_reward_listB[-100:]) / 100
                    avg_reward_listB.append(avg_rewardB)
                    template = ("Episode(B): {:03d}/{:d} | StepCount: {:d} | "
                                "Win rate: {:.3f} | Total rewards: {:.3f} | "
                                "Average rewards: {:.3f} | time: {}")
                    print(template.format(episodeB, self.episode, stepCountB,
                                          sum(win_historyB) / len(win_historyB),
                                          total_rewardB, avg_rewardB, t))
                else:
                    template = ("Episode(B): {:03d}/{:d} | StepCount: {:d} | "
                                "Win rate: {:.3f} | Total rewards: {:.3f} | time: {}")
                    print(template.format(episodeB, self.episode, stepCountB,
                                          sum(win_historyB) / len(win_historyB),
                                          total_rewardB, t))
                episodeB += 1
                stepCountB = 0
                total_rewardB = 0
                visitedB = set()
                done_B = 0
        # end of game
        print('game over')
        self.learning = False
        self.reset(0)
        self.reset(1)
        print("total_time", t)
        print("total_win_rate_A", sum(win_historyA) / len(win_historyA))
        print("average rewards per episode_A",
              sum(total_reward_listA) / len(total_reward_listA))
        print("total_win_rate_B", sum(win_historyB) / len(win_historyB))
        print("average rewards per episode_B",
              sum(total_reward_listB) / len(total_reward_listB))
        plt.figure()
        plt.title('Rewards per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(total_reward_listA, label='agentA')
        plt.plot(total_reward_listB, label='agentB')
        plt.legend(loc='upper right')
        plt.show()
        plt.figure()
        plt.title('Average Rewards over 100 Episodes')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(avg_reward_listA, label='agentA')
        plt.plot(avg_reward_listB, label='agentB')
        plt.legend(loc='upper right')
        plt.show()

    def choose_best_action(self, state, terminal):
        if terminal == self.cv.coords(self.targetA):
            q_table = self.q_tableA
        if terminal == self.cv.coords(self.targetB):
            q_table = self.q_tableB
        state_action = q_table.loc[state]
        action = np.random.choice(
            state_action[state_action == np.max(state_action)].index)
        return int(action)

    def new_reward_a(self, s_, B_s_, s, s_B):
        """Testing reward function for agentA."""
        self.targetA = self.selected_targets[0]
        if s_ == self.cv.coords(self.selected_targets[0]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMarkA = t
            reward = 0
        elif s_ in self.selected_Obstacles_position:
            reward = -2
        elif s_ == B_s_:
            reward = -2
        elif s_ == s_B and B_s_ == s:
            reward = -2
        else:
            reward = 0
        return reward

    def new_reward_b(self, s_, A_s_, s, s_A):
        """Testing reward function for agentB."""
        self.targetB = self.selected_targets[1]
        if s_ == self.cv.coords(self.selected_targets[1]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMarkB = t
            reward = 0
        elif s_ in self.selected_Obstacles_position:
            reward = -2
        elif s_ == A_s_:
            reward = -2
        elif s_ == s_A and A_s_ == s:
            reward = -2
        else:
            reward = 0
        return reward

    def run(self):
        """Main function for running tests."""
        test = 0
        rewardsA = []
        rewardsB = []
        action_B = -1
        action_A = -1
        UNIT = self.grid_UNIT
        observation_A = self.cv.coords(self.agentA)
        observation_B = self.cv.coords(self.agentB)
        doneA = 0
        doneB = 0
        total_rewardA = 0
        total_rewardB = 0
        win_countA = 0
        win_countB = 0
        terminal_A = self.cv.coords(self.targetA)
        terminal_B = self.cv.coords(self.targetB)
        visitedA = [observation_A]
        visitedB = [observation_B]
        enhance_list = []
        win_listA = []
        win_listB = []
        while True:
            if self.cv.coords(self.agentA) == self.cv.coords(self.targetA):
                doneA = 1
            if self.cv.coords(self.agentB) == self.cv.coords(self.targetB):
                doneB = 1
            self.labelHello = Label(self.cv, text="Test:%s" % str(test),
                                    font=("Helvetica", 10), width=10,
                                    fg="blue", bg="white")
            self.labelHello.place(x=self.agentCentre[0][0] - 150,
                                  y=self.agentCentre[0][1] + 500, anchor=NW)
            time.sleep(self.timeDelay)
            distance = ((observation_A[0] - observation_B[0])**2 +
                        (observation_A[1] - observation_B[1])**2)
            if distance == UNIT**2 or distance == 2 * (UNIT**2):
                if action_A == -1:
                    observation_B_new = observation_B
                else:
                    observation_B_new = [action_A, observation_A, observation_B]
                if action_B == -1:
                    observation_A_new = observation_A
                else:
                    observation_A_new = [action_B, observation_B, observation_A]
            else:
                observation_B_new = observation_B
                observation_A_new = observation_A
            if doneA != 1:
                try:
                    action_A = self.choose_best_action(str(observation_A_new),
                                                       terminal_A)
                    A_observation_ = self.calcu_next_state(observation_A, action_A)
                    if A_observation_ == self.cv.coords(self.targetA):
                        win_listA.append(1)
                except KeyError:
                    # state unseen during training: reset this agent
                    doneA = 1
                    self.reset(0)
            if doneB != 1:
                try:
                    action_B = self.choose_best_action(str(observation_B_new),
                                                       terminal_B)
                    B_observation_ = self.calcu_next_state(observation_B, action_B)
                    if B_observation_ == self.cv.coords(self.targetB):
                        win_listB.append(1)
                except KeyError:
                    doneB = 1
                    self.reset(1)
            reward_A = self.new_reward_a(A_observation_, B_observation_,
                                         observation_A, observation_B)
            reward_B = self.new_reward_b(B_observation_, A_observation_,
                                         observation_B, observation_A)
            if B_observation_ in visitedB:
                reward_B -= 0.5
            else:
                visitedB.append(B_observation_)
            if A_observation_ in visitedA:
                reward_A -= 0.5
            else:
                visitedA.append(A_observation_)
            if doneA:
                A_observation_ = self.cv.coords(self.targetA)
            if doneB:
                B_observation_ = self.cv.coords(self.targetB)
            self.real_step(A_observation_, B_observation_)
    def run(self):
        """Main function for running tests."""
        test = 0
        rewardsA = []
        rewardsB = []
        action_B = -1
        action_A = -1
        UNIT = self.grid_UNIT
        observation_A = self.cv.coords(self.agentA)
        observation_B = self.cv.coords(self.agentB)
        doneA = 0
        doneB = 0
        total_rewardA = 0
        total_rewardB = 0
        win_countA = 0
        win_countB = 0
        terminal_A = self.cv.coords(self.targetA)
        terminal_B = self.cv.coords(self.targetB)
        visitedA = [observation_A]
        visitedB = [observation_B]
        enhance_list = []
        win_listA = []
        win_listB = []
        while True:
            if self.cv.coords(self.agentA) == self.cv.coords(self.targetA):
                doneA = 1
            if self.cv.coords(self.agentB) == self.cv.coords(self.targetB):
                doneB = 1
            self.labelHello = Label(self.cv,
                                    text="Test:%s" % str(test),
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="blue",
                                    bg="white")
            self.labelHello.place(x=self.agentCentre[0][0] - 150,
                                  y=self.agentCentre[0][1] + 500,
                                  anchor=NW)
            time.sleep(self.timeDelay)
            # squared Euclidean distance between the two agents
            distance = (observation_A[0] - observation_B[0])**2 + (
                observation_A[1] - observation_B[1])**2
            if distance == UNIT**2 or distance == 2 * (UNIT**2):
                # agents are adjacent: augment each observation with the
                # other agent's last action and position
                if action_A == -1:
                    observation_B_new = observation_B
                else:
                    observation_B_new = [action_A, observation_A, observation_B]
                if action_B == -1:
                    observation_A_new = observation_A
                else:
                    observation_A_new = [action_B, observation_B, observation_A]
            else:
                observation_B_new = observation_B
                observation_A_new = observation_A
            if doneA != 1:
                try:
                    action_A = self.choose_best_action(str(observation_A_new),
                                                       terminal_A)
                    A_observation_ = self.calcu_next_state(observation_A,
                                                           action_A)
                    if A_observation_ == self.cv.coords(self.targetA):
                        win_listA.append(1)
                except KeyError:
                    # state missing from the Q-table: abort this agent's run
                    doneA = 1
                    self.reset(0)
            if doneB != 1:
                try:
                    action_B = self.choose_best_action(str(observation_B_new),
                                                       terminal_B)
                    B_observation_ = self.calcu_next_state(observation_B,
                                                           action_B)
                    if B_observation_ == self.cv.coords(self.targetB):
                        win_listB.append(1)
                except KeyError:
                    doneB = 1
                    self.reset(1)
            reward_A = self.new_reward_a(A_observation_, B_observation_,
                                         observation_A, observation_B)
            reward_B = self.new_reward_b(B_observation_, A_observation_,
                                         observation_B, observation_A)
            # small penalty for revisiting a cell
            if B_observation_ in visitedB:
                reward_B -= 0.5
            else:
                visitedB.append(B_observation_)
            if A_observation_ in visitedA:
                reward_A -= 0.5
            else:
                visitedA.append(A_observation_)
            if doneA:
                A_observation_ = self.cv.coords(self.targetA)
            if doneB:
                B_observation_ = self.cv.coords(self.targetB)
            self.real_step(A_observation_, B_observation_)
            total_rewardA += reward_A
            total_rewardB += reward_B
            # give up an episode once its score drops too low
            if total_rewardA < -1:
                doneA = 1
                enhance_list.append(visitedA[0])
                win_countA += 1
            if total_rewardB < -1:
                doneB = 1
                enhance_list.append(visitedB[0])
                win_countB += 1
            if doneA != 1:
                lineA = self.cv.create_line(observation_A[0], observation_A[1],
                                            A_observation_[0], A_observation_[1],
                                            fill='red',
                                            arrow=LAST,
                                            arrowshape=(10, 20, 8),
                                            dash=(4, 4))  # red dashed arrow
                self.linesA.append(lineA)
            if doneB != 1:
                lineB = self.cv.create_line(observation_B[0], observation_B[1],
                                            B_observation_[0], B_observation_[1],
                                            fill='blue',
                                            arrow=LAST,
                                            arrowshape=(10, 20, 8),
                                            dash=(4, 4))  # blue dashed arrow
                self.linesB.append(lineB)
            observation_A = A_observation_
            observation_B = B_observation_
            if doneA:
                action_A = -1
                visitedA = []
            if doneB:
                action_B = -1
                visitedB = []
            if doneA and doneB:
                total_rewardA += 1
                total_rewardB += 1
                rewardsA.append(total_rewardA)
                rewardsB.append(total_rewardB)
                total_rewardA = 0
                total_rewardB = 0
                self.reset(0)
                self.reset(1)
                doneA = 0
                doneB = 0
                observation_A = self.cv.coords(self.agentA)
                observation_B = self.cv.coords(self.agentB)
                test += 1
                if test > self.tests:
                    break
        plt.figure()
        plt.title('Score per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Score')
        plt.plot(rewardsA, label='agentA')
        plt.plot(rewardsB, label='agentB')
        plt.legend(loc='upper right')
        # print(rewardsA)
        # print(rewardsB)
        plt.show()
        print("win_countA:", sum(win_listA))
        print("win_countB:", sum(win_listB))

    def start_learning(self):
        self.selected_agent = []
        self.selected_targets = []
        self.selected_Obstacles = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        for item in range(1, self.itemsNum + 1):
            p = self.cv.coords(item)
            if p[0] >= self.grid_origx and p[1] >= self.grid_origy \
                    and p[0] <= self.grid_endx and p[1] <= self.grid_endy:
                if item in range(self.agentItemIndex[0],
                                 self.agentItemIndex[1] + 1):
                    self.selected_agent.append(item)
                    self.selected_agent_position.append(p)
                elif item in range(self.WarehouseItemIndex[0],
                                   self.WarehouseItemIndex[1] + 1):
                    self.selected_targets.append(item)
                elif item in range(self.ObstacleItemIndex[0],
                                   self.ObstacleItemIndex[1] + 1):
                    self.selected_Obstacles.append(item)
                    self.selected_Obstacles_position.append(p)
        # exactly two agents and two targets are required
        if len(self.selected_agent) != 2:
            tkinter.messagebox.showinfo(
                "INFO", "Please choose TWO agents for training!")
        elif len(self.selected_targets) != 2:
            tkinter.messagebox.showinfo(
                "INFO", "Please choose TWO targets for training!")
        else:
            # assign agents/targets only after the selection is validated,
            # otherwise an incomplete selection raises an IndexError
            self.agentA = self.selected_agent[0]
            self.agentB = self.selected_agent[1]
            self.targetA = self.selected_targets[0]
            self.targetB = self.selected_targets[1]
            self.t = threading.Timer(self.timeDelay, self.update)
            self.t.start()
            self.learning = True

    def restart(self):
        self.cv.coords(self.agentA, self.agentCentre[0])
        self.cv.coords(self.agentB, self.agentCentre[1])

    def calcu_next_state(self, loc, action):
        """Calculate the next state from a canvas location and an action."""
        UNIT = self.grid_UNIT
        ss = loc
        np_s = np.array(ss)
        dissS = np.array([self.grid_origx, self.grid_origy])
        s = (np_s - dissS).tolist()  # position relative to the grid origin
        base_action = np.array([0, 0])
        if action == 0:  # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:  # down
            if s[1] < (self.grid_rowNum - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:  # right
            if s[0] < (self.grid_columnNum - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:  # left
            if s[0] > UNIT:
                base_action[0] -= UNIT
        s_ = [ss[0] + base_action[0], ss[1] + base_action[1]]
        return s_
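    # --- illustrative sketch, not part of the original code --------------
    # Worked example for calcu_next_state, assuming grid_origx = grid_origy
    # = 0 and grid_UNIT = 40 (these values are assumptions, not taken from
    # this file).  An agent centred at [60.0, 60.0] (row 1, column 1) takes
    # action 0 (up):
    #
    #   s  = [60.0, 60.0] - [0, 0] = [60.0, 60.0]   # offset from origin
    #   s[1] > UNIT  (60 > 40)  ->  base_action = [0, -40]
    #   s_ = [60.0, 20.0]                           # row 0, column 1
    #
    # From the top row (s[1] == 20.0) the same action is a no-op, because
    # 20 > 40 is false, so the agent can never leave the grid.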
    def render(self):
        time.sleep(self.timeDelay)

    def create_items(self):
        self.AllItemsOrigPosition_list.append([0, 0, 0, 0])
        self.create_agentItems()
        self.agentItemIndex = [1, len(self.agentPosition_list)]
        self.create_targetItems()
        self.WarehouseItemIndex = [
            self.agentItemIndex[1] + 1,
            self.agentItemIndex[1] + len(self.warehousePostition_list)
        ]
        self.create_ObsItems()
        self.ObstacleItemIndex = [
            self.WarehouseItemIndex[1] + 1,
            self.WarehouseItemIndex[1] + len(self.ObstaclePosition_list)
        ]

    def create_grids(self, origx, origy, column, row, UNIT):
        # vertical grid lines
        for c in range(origx, origx + (column + 1) * UNIT, UNIT):
            x0, y0, x1, y1 = c, origy, c, origy + row * UNIT
            self.cv.create_line(x0, y0, x1, y1, width=2)
        # horizontal grid lines (must span `column` cells, not `row`)
        for r in range(origy, origy + (row + 1) * UNIT, UNIT):
            x0, y0, x1, y1 = origx, r, origx + column * UNIT, r
            self.cv.create_line(x0, y0, x1, y1, width=2)

    def choose_type(self, i):
        """Handle clicks on the toolbar buttons."""
        for b in self.bns:
            b['relief'] = RAISED
        self.bns[i]['relief'] = SUNKEN
        self.item_type = i
        if self.item_type == 1:
            # start training
            self.start_learning()
            self.bns[i]['relief'] = RAISED
        elif self.item_type == 2:
            # close simulation tool
            os._exit(0)
        elif self.item_type == 3:
            # save q_table
            temp_s = str(self.cv.coords(self.target)) + str(
                self.selected_Obstacles_position)
            self.RL.q_table.to_csv("single_qtable_%s.csv" % temp_s,
                                   index_label="index_label")
            print("SAVED!!!")
            self.labelHello = Label(self.cv,
                                    text="table saved!!",
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="red",
                                    bg="white")
            self.labelHello.place(x=350, y=750, anchor=NW)
        elif self.item_type == 0:
            self.button_reset()
        elif self.item_type == 4:
            # start running tests
            self.start_running()
        elif self.item_type == 5:
            self.restart()

    def start_running(self):
        self.selected_agent = []
        self.selected_targets = []
        self.selected_Obstacles = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        self.task_list = []
        self.task_num_list = []
        for item in range(1, self.itemsNum + 1):
            p = self.cv.coords(item)
            if p[0] >= self.grid_origx and p[1] >= self.grid_origy:
                if item in range(self.agentItemIndex[0],
                                 self.agentItemIndex[1] + 1):
                    self.selected_agent.append(item)
                    self.selected_agent_position.append(p)
                elif item in range(self.WarehouseItemIndex[0],
                                   self.WarehouseItemIndex[1] + 1):
                    self.selected_targets.append(item)
                elif item in range(self.ObstacleItemIndex[0],
                                   self.ObstacleItemIndex[1] + 1):
                    self.selected_Obstacles.append(item)
                    self.selected_Obstacles_position.append(p)
        if len(self.selected_agent) != 2:
            tkinter.messagebox.showinfo(
                "INFO", "Please place TWO agents on the map!")
        elif len(self.selected_targets) == 0 or len(self.selected_targets) > 2:
            tkinter.messagebox.showinfo("INFO", "Please choose TWO terminals!")
        else:
            self.agentA = self.selected_agent[0]
            self.agentB = self.selected_agent[1]
            self.targetA = self.selected_targets[0]
            self.targetB = self.selected_targets[1]
            terminalA = self.cv.coords(self.targetA)
            terminalB = self.cv.coords(self.targetB)
            terminal_strA = str(terminalA) + str(self.episode)
            terminal_strB = str(terminalB) + str(self.episode)
            self.task_list = []
            # load the Q-tables saved after training for each terminal
            self.q_tableA = pd.read_csv("table terminal%s.csv" % terminal_strA,
                                        index_col=0)
            self.q_tableB = pd.read_csv("table terminal%s.csv" % terminal_strB,
                                        index_col=0)
            self.labelHello = Label(self.cv,
                                    text="start running!!",
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="red",
                                    bg="white")
            self.labelHello.place(x=250, y=750, anchor=NW)
            self.t = threading.Timer(self.timeDelay, self.run)
            self.t.start()
            self.learning = True

    def rightClick_handler(self, event):
        self.start_learning()
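    # --- illustrative sketch, not part of the original code --------------
    # start_running expects Q-tables saved as CSV with the state string as
    # the index and one column per action.  A minimal pandas round trip
    # under that assumption (the file name and values are hypothetical):
    #
    #   import pandas as pd
    #
    #   q = pd.DataFrame([[0.1, 0.0, -0.2, 0.0]],
    #                    index=["[60.0, 60.0]"],        # state string
    #                    columns=["0", "1", "2", "3"])  # actions
    #   q.to_csv("table terminal_example.csv")
    #   q2 = pd.read_csv("table terminal_example.csv", index_col=0)
    #   q2.loc["[60.0, 60.0]"].idxmax()   # -> "0", the greedy action
    #
    # choose_best_action does the same lookup but breaks ties at random
    # instead of using idxmax.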
    def leftClick_handler(self, event):
        if self.learning:
            print("Learning in progress!")
        else:
            # find the item whose bounding box contains the click
            for item in range(1, self.itemsNum + 1):
                position = self.cv.coords(item)
                R = self.grid_UNIT / 2
                p = [
                    position[0] - R, position[1] - R,
                    position[0] + R, position[1] + R
                ]
                if event.x >= p[0] and event.x <= p[2] and \
                        event.y >= p[1] and event.y <= p[3]:
                    t = item
                    self.choose_item_handler(event, t)

    def choose_item_handler(self, event, t):
        self.choose_item = t
        self.itemOrigPosition = self.cv.coords(t)

    def move(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            self.cv.coords(t, event.x, event.y)

    def adjust_items_into_grids(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            position = self.cv.coords(t)
            centerX = position[0]
            centerY = position[1]
            Grids_X0 = self.grid_origx
            Grids_X1 = self.grid_origx + (self.grid_columnNum +
                                          1) * self.grid_UNIT
            Grids_Y0 = self.grid_origy
            Grids_Y1 = self.grid_origy + (self.grid_rowNum +
                                          1) * self.grid_UNIT
            if (centerX in range(Grids_X0, Grids_X1)) and (
                    centerY in range(Grids_Y0, Grids_Y1)):
                # snap the item to the centre of the cell it was dropped in
                columnIndex = math.floor((centerX - Grids_X0) / self.grid_UNIT)
                rowIndex = math.floor((centerY - Grids_Y0) / self.grid_UNIT)
                adjustedX0 = (Grids_X0 + columnIndex * self.grid_UNIT +
                              self.grid_UNIT / 2)
                adjustedY0 = (Grids_Y0 + rowIndex * self.grid_UNIT +
                              self.grid_UNIT / 2)
                self.cv.coords(t, adjustedX0, adjustedY0)
            else:
                # dropped outside the grid: send the item back to its origin
                self.cv.coords(t, self.AllItemsOrigPosition_list[t])
                self.itemOrigPosition = []

    def move_end(self, event):
        if self.choose_item is not None:
            self.adjust_items_into_grids(event)
            self.choose_item = None

    def delete_item(self, event):
        # if an item is currently selected, delete it from the canvas
        if self.choose_item is not None:
            self.cv.delete(self.choose_item)
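    # --- illustrative sketch, not part of the original code --------------
    # The handlers above imply Tkinter canvas bindings along these lines.
    # The actual bind() calls live elsewhere in the program, so the event
    # sequences chosen here are assumptions:
    #
    #   self.cv.bind("<Button-1>", self.leftClick_handler)    # select item
    #   self.cv.bind("<B1-Motion>", self.move)                # drag item
    #   self.cv.bind("<ButtonRelease-1>", self.move_end)      # snap to grid
    #   self.cv.bind("<Button-3>", self.rightClick_handler)   # start learning
    #   self.cv.bind("<Double-Button-1>", self.delete_item)   # delete item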