def calTime(filename):
    """
    Use the Q-table to compute the time needed to finish all tasks.
    :param filename: file holding the trained Q-table
    :return: mean completion time over the evaluation episodes
    """
    task = createTask()
    env = Maze(task)
    RL = QLearningTable(actions=list(range(env.n_actions)), filename=filename)
    Time1 = []
    # Time2 = []
    for i in range(10000):
        observation = env.reset()
        while True:
            action = RL.choose_action_real(str(observation))
            observation_, reward, done = env.step(action)
            # print(observation, action, reward)
            observation = observation_
            if done:
                time1 = findmax(task)
                # time2 = calOmegaT(task, np.array([255])[0])
                Time1.append(time1)
                # Time2.append(time2)
                break
    # print(np.mean(Time1))
    # print(np.mean(Time2))
    return np.mean(Time1)
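# The evaluation above calls RL.choose_action_real, i.e. greedy action
# selection without exploration. A minimal sketch of what such a selector
# could look like, assuming (as the other snippets here suggest) that the
# Q-table is a pandas DataFrame indexed by state strings with one column per
# action; greedy_action is a hypothetical standalone helper, not part of the
# original code:
import numpy as np
import pandas as pd

def greedy_action(q_table: pd.DataFrame, observation: str) -> int:
    """Return the action with the highest Q-value, breaking ties at random."""
    state_action = q_table.loc[observation, :]
    best = state_action[state_action == np.max(state_action)].index
    return int(np.random.choice(best))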
def main():
    trained_number = getLastExperiment('p5i3g0')
    RL = QLearningTable(list(range(len(green_states))))
    trained_path = '{}/results/{}/'.format(WORKSPACE, trained_number)
    qtable_path = trained_path + 'qtable.csv'
    RL.feedQTable(qtable_path)
    RL.epsilon = 1  # pure exploitation while testing
    fixed, rl, actuated = testAgent('fixed', RL), testAgent('rl', RL), testAgent('actuated', RL)
    plotTestResult(rl, fixed, actuated, trained_path)
def main():
    # -------------- preparation --------------
    rst_path, sim_path = generatePath(current_time)  # create a new folder for the experiment
    RL = QLearningTable(list(range(len(green_states))))  # initialize the Q-learning framework
    feed_path = '{}/results/{}/qtable.csv'.format(WORKSPACE, 'p5i3g0')
    RL.feedQTable(feed_path)  # helpful when inheriting from a previously trained agent

    # -------------- training --------------
    trainAgent(RL, rst_path, sim_path)

    # -------------- testing --------------
    RL.epsilon = 1  # epsilon-greedy no longer selects random actions
    fixed, rl, actuated = testAgent('fixed', RL), testAgent('rl', RL), testAgent('actuated', RL)
    plotTestResult(rl, fixed, actuated, sim_path)
    flow_scenarios = ['-50%', '-25%', '0%', '+25%', '+50%']
    pushAgent(flow_scenarios, sim_path, RL)  # explore the limits of the trained agent

    # -------------- results --------------
    RL.saveQTable('{}/qtable.csv'.format(sim_path))
    RL.plotCumulativeReward(sim_path)  # plot the cumulative reward
    RL_params = {
        'lr': RL.alpha,
        'gamma': RL.gamma,
        'e_max': RL.e_greedy_max,
        'e_inc': RL.e_greedy_increment
    }
    writeLog(RL_params, rst_path, sim_path, clean=True)  # record basic information about the experiment

    # -------------- end --------------
    print('\nALL DONE, check {}'.format(str(current_time)))
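# feedQTable and saveQTable are not shown in this section. Under the
# assumption that the agent's q_table is a pandas DataFrame (consistent with
# the to_csv/read_csv round-trips elsewhere in these snippets), they could be
# as simple as the following hypothetical sketch:
import pandas as pd

def saveQTable(rl, path):
    """Persist the Q-table so a later experiment can inherit it."""
    rl.q_table.to_csv(path)

def feedQTable(rl, path):
    """Load a previously saved Q-table to warm-start training."""
    rl.q_table = pd.read_csv(path, index_col=0)
    rl.q_table.columns = rl.q_table.columns.astype(int)  # restore integer action labels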
def stacking_assign_q_learning(shorter_init, longer_init):
    env = Stacking(shorter_init, longer_init)
    RL = QLearningTable(actions=list(range(6)), e_greedy=1)  # e_greedy=1: always exploit the seeded table
    # seed the agent with the pre-trained table matching the two leading bases
    seed_tables = {
        ('A', 'U'): q_table_A_U,
        ('C', 'G'): q_table_C_G,
        ('G', 'C'): q_table_G_C,
        ('G', 'U'): q_table_G_U,
        ('U', 'A'): q_table_U_A,
        ('U', 'G'): q_table_U_G,
    }
    key = (shorter_init[0], longer_init[0])
    if key in seed_tables:
        # DataFrame.append is deprecated; pd.concat is the supported equivalent
        RL.q_table = pd.concat([RL.q_table, seed_tables[key]])
    observation = env.shorter + "_" + env.longer
    while True:
        action = RL.choose_action(observation)
        shorter_, longer_, reward, done = env.step(action)
        observation_ = shorter_ + "_" + longer_
        # RL.learn(str(observation), action, reward, str(observation_))
        observation = observation_
        if done:
            break
    shorter_final, longer_final = observation.split('_')
    return shorter_final, longer_final
def ubp_4_assign_q_learning(shorter_init):
    env = ubp_4(shorter_init)
    RL = QLearningTable(actions=list(range(4)), e_greedy=1)  # pure exploitation of the pre-trained table
    # DataFrame.append is deprecated; pd.concat is the supported equivalent
    RL.q_table = pd.concat([RL.q_table, q_table_ubp_4])
    observation = env.shorter
    while True:
        action = RL.choose_action(observation)
        shorter_, reward, done = env.step(action)
        observation_ = shorter_
        # RL.learn(str(observation), action, reward, str(observation_))
        observation = observation_
        if done:
            break
    return observation
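# Both helpers above run with e_greedy=1 and never call RL.learn, so they do
# pure inference on a pre-trained table. A hypothetical usage example under
# those assumptions (the input sequences are made up):
# shorter, longer = stacking_assign_q_learning('AC', 'UGG')
# folded = ubp_4_assign_q_learning('ACGU')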
def rl(self):
    RL = QLearningTable(actions=list(range(3)),
                        learning_rate=self.learning_rate,
                        reward_decay=self.reward_decay,
                        e_greedy=self.e_greedy)
    RL = self.train(self.D[:self.N], self.P[:self.N], RL)
    level = np.array([10, 10000, 10000])  # scaling used to coarsen observations into table states
    n_interval = int(self.T / self.I)
    cost_rl = np.zeros(n_interval)
    for n in range(1, n_interval + 1):
        lo = self.N + self.V + (n - 1) * self.I
        hi = self.N + self.V + n * self.I
        a_real = self.D[lo:hi]
        r_real = self.R[lo:hi]
        p_real = self.P[lo:hi]
        d_real = (a_real - r_real).astype(int).reshape(-1)
        p_real = p_real.reshape(-1)
        s = 0
        step = 0
        pbar = p_real[0]
        observation = np.array([p_real[0], d_real[0], s])
        while True:
            temp_ob = (observation.copy() / level).astype(int)
            action = RL.choose_action(str(temp_ob))
            observation_, reward, done, pbar, stepcost, sl, cd, dd, gd = self.stepto(
                action, observation, step, p_real, pbar, d_real)
            cost_rl[n - 1] += stepcost
            if observation_ == 'terminal':
                break
            observation = observation_
            step += 1
            if step >= self.I:
                break
    # turn per-interval costs into a cumulative cost curve
    cost_rl = np.cumsum(cost_rl)
    return cost_rl
def pathplanning(self):
    global root
    global view
    RL = QLearningTable(actions=list(range(self.n_actions)),
                        learning_rate=self._learningrate,
                        reward_decay=self._discountfactor,
                        e_greedy=self._egreedy)
    # update qtable
    self.currentqtable = str(RL.q_table)
    for episode in range(self._maxepisode):
        # update episode
        self.currentepisode = episode + 1
        # reset
        self._robot = self._start.copy()
        # initialize observation
        observation = str(self._robot)
        time.sleep(1)
        while True:
            # record the final path
            if episode == self._maxepisode - 1:
                self.finalpath.append(
                    "({},{})".format(int(self._robot[0]), int(self._robot[1])))
            # choose action
            action = RL.choose_action(observation)
            # get new observation
            next_observation, reward, done = self.step(action)
            # learn from this observation
            RL.learn(observation, action, reward, next_observation)
            # update observation
            observation = next_observation
            # update qtable
            self.currentqtable = str(RL.q_table)
            # sleep for qml's update
            time.sleep(0.2)
            if done:
                break
    # print(self.finalpath)
    self.isfinalpath = True
def main():
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    for episode in range(100):
        if episode % 200 == 0:  # with only 100 episodes this fires at episode 0 alone
            RL.save_q_table()
        # initial observation
        observation = env.reset()
        counter = 0
        while True:
            # fresh env
            env.render()
            print("Round: " + str(counter))
            # RL choose action based on observation
            action = RL.choose_action(observation)
            # RL take action and get next observation & reward
            observation_, reward, done = env.step(action)
            # RL learn from this transition
            RL.learn(observation, action, reward, observation_, done)
            # swap observation
            observation = observation_
            # break while loop when end of this episode
            if done:
                # RL.save_q_table()
                break
            else:
                time.sleep(1)
                counter += 1
    # end game
    print("end game")
    # save q_table
    RL.save_q_table()
        y1.append(item[1])
        y2.append(item[2])
    plt.subplots()
    plt.title(key + " " + str(episode))
    plt.plot(x, y1, label="max")
    plt.plot(x, y2, label="opt")
    plt.legend()
    plt.savefig(dir + "/" + str(episode) + "/" + key + " " + str(episode) + ".png")
    # plt.show()
    plt.close()


task_num = [3]
for taskNum in task_num:
    parameter["taskNum"] = taskNum
    from task import *
    task = createTask()
    # Q-learning
    env = Maze(task)
    RL = QLearningTable(
        actions=list(range(env.n_actions)),
        filename="/home/zongwangz/PycharmProjects/q_learning/Figure1/Q_learning Table100_3")
    update(env, RL, 1100000)
    RL.q_table.to_csv("Q_learning Table" + str(taskNum) + "_right")
    # end of game
    # print(RL.q_table)
    new_table = pd.DataFrame(dtype=np.float64)
    for i in RL.q_table.index.tolist():  # ._stat_axis is private; .index is the public equivalent
        temp_list = i[1:-1].split(',')
        if len(temp_list) >= 4:
            # map the canvas rectangle back to grid coordinates
            ycor = ((float(temp_list[1]) + float(temp_list[3])) / 2 - 20) / 40 + 1
            xcor = ((float(temp_list[0]) + float(temp_list[2])) / 2 - 20) / 40 + 1
            row = pd.Series(RL.q_table.loc[i, :],
                            index=RL.q_table.columns,
                            name='({},{})'.format(int(xcor), int(ycor)))
            # DataFrame.append is deprecated; pd.concat is the supported equivalent
            new_table = pd.concat([new_table, row.to_frame().T])
    print(new_table)
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable()
    env.after(1, update)  # schedule update() on the tkinter event loop
    env.mainloop()
            if done:
                break
    print(state_max, state_min)
    return state_max, state_min


def discretize_state(state):
    discrete_num = 10
    state_dim = env.observation_space.shape[0]
    dis_state = np.ones(state_dim)
    for i in range(state_dim):
        state_range = env.observation_space.high[i] - env.observation_space.low[i]
        if state_range > 1000000:  # effectively unbounded: fall back to the observed range
            # clamp outliers so they land in the edge bins (the original assigned
            # an out-of-range index and then unconditionally overwrote it)
            clipped = min(max(state[i], state_min[i]), state_max[i])
            dis_state[i] = int((clipped - state_min[i]) / ((state_max[i] - state_min[i]) / discrete_num))
        else:
            dis_state[i] = int((state[i] - env.observation_space.low[i]) / (state_range / discrete_num))
    return dis_state


if __name__ == "__main__":
    env = gym.make('CartPole-v0')
    RL = QLearningTable(actions=list(range(env.action_space.n)))
    state_max, state_min = get_range()
    update()
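# The clamped binning above can be written more compactly with numpy. A
# sketch under the same assumptions (per-dimension lows/highs and a fixed
# number of bins); discretize is a hypothetical helper, not part of the
# original script:
import numpy as np

def discretize(state, lows, highs, bins=10):
    """Map a continuous state to integer bin indices, clamping outliers."""
    lows, highs = np.asarray(lows, dtype=float), np.asarray(highs, dtype=float)
    clipped = np.clip(state, lows, highs)
    widths = (highs - lows) / bins
    idx = ((clipped - lows) / widths).astype(int)
    return np.minimum(idx, bins - 1)  # a state exactly at `highs` would otherwise land in bin `bins`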
def update():
    start = time.time()
    RL = QLearningTable(n_states=nodes_num,
                        each_services_nums=each_services_nums,
                        max_services_num=max_services_num,
                        nodeSet_file=nodeSet_file,
                        conf_file=conf_file,
                        learning_rate=ALPHA,
                        reward_decay=GAMMA,
                        e_greedy=EPSILON)
    max_reward = 0
    for episode in range(MAX_EPISODES):
        # initial observation
        state = 0
        while True:
            # RL choose action based on observation
            action = RL.choose_action(state)
            # RL take action and get next observation and reward
            state_, reward, done = RL.step(state, action)
            # RL learn from this transition
            RL.learn(state, action, reward, state_)
            # swap observation
            state = state_
            # break while loop when end of this episode
            if done:
                if episode == 0:
                    max_reward = reward
                elif reward > max_reward:
                    max_reward = reward
                    print("services = {0}, reward = {1}, runtime = {2}, episode = {3} "
                          .format(RL.choose_services, reward, time.time() - start, episode))
                    line = list(RL.choose_services)
                    line.extend([reward, time.time() - start, episode])
                    with open(outfile, 'a+') as fp:
                        fp.write(str(line) + '\n')
                elif episode % 100 == 0:
                    print("episode = {}".format(episode))
                break
        # termination condition: stop early once rewards stop changing
        if episode >= ERROR_COUNT:
            del judge_list[0]
            judge_list.append(reward)
        if episode >= 1000 and episode % ERROR_COUNT == 0:
            if max(judge_list) - min(judge_list) <= ERROR_RANGE:
                output = "\n Convergence reached -- terminating the experiment early!\n"
                line = list(RL.choose_services)
                line.extend([reward, time.time() - start, episode])
                # print the convergence result
                print(output)
                print(line)
                # record the convergence result
                with open(outfile, 'a+') as fp:
                    fp.write(output)
                    fp.write(str(line) + '\n')
                break
    print('game over')
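# The early-stopping rule above keeps a sliding window of recent rewards and
# stops once their spread falls below ERROR_RANGE. The same idea as a
# standalone, reusable sketch (names are hypothetical):
from collections import deque

def make_convergence_check(window=100, tolerance=1e-3):
    """Return a callable that reports True once rewards have stabilized."""
    recent = deque(maxlen=window)  # old rewards are evicted automatically
    def converged(reward):
        recent.append(reward)
        return len(recent) == window and max(recent) - min(recent) <= tolerance
    return converged

# usage sketch:
# check = make_convergence_check()
# if check(reward):
#     break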
        RL.learn(str(s), action, reward, str(s_), Text, tot_action, len(granul))
        # time.sleep(3)
        # print(s, action, reward, s_)
        s = s_
        # break while loop when end of this episode
        if done:
            break
    # print("Episode Over")

    # end of game
    '''
    print('Game Over, Best Reward Ever:', "%.2f%%" % Perc_Best, Text)
    End_Time = datetime.datetime.now().strftime('%m-%d %H:%M')
    print("Started Time: " + Start_Real_Time + ", End Time: " + End_Time)
    rp.plot_legend_text(Time_Best, Light_Best, Light_Feed_Best, Action_Best, r_Best,
                        perf_Best, SC_Best, SC_Best_norm_hist, SC_Feed_Best, Occup_Best,
                        Text, best_reward, tot_episodes)
    rp.plot_reward_text(Tot_Episodes, Tot_Reward, Text, best_reward, tot_episodes)
    '''


if __name__ == "__main__":
    # env = Maze()
    # RL = QLearningTable(actions=list(range(env.n_actions)))
    RL = QLearningTable(actions=list(range(tot_action)))
    update()
    # env.after(100, update)
    # env.mainloop()
                # RL take action and get next observation and reward
                observation_, reward, done = env.step(observation, eval(action))
                # RL learn from this transition
                RL.learn(str(observation), str(action), reward, str(observation_))
                # swap observation
                observation = observation_
                # break while loop when end of this episode
                if done:
                    isRunning = False
                    print(episode, len(RL.q_table.index))
                    break
        except KeyboardInterrupt:
            # RL.q_table.to_pickle("./Data/dataframe.pk1")
            sys.exit()
    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=env.action_space)
    env.after(1, update)
    env.mainloop()
import gym

from RL_brain import QLearningTable

env = gym.make('MountainCar-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = QLearningTable(actions=list(range(3)))

total_steps = 0

for i_episode in range(10):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(str(observation))
        observation_, reward, done, info = env.step(action)
        position, velocity = observation_
        # the higher the better
        reward = abs(position - (-0.5))  # r in [0, 1]
        RL.learn(str(observation), action, reward, str(observation_))
        ep_r += reward
        if done:
            get = '| Get' if observation_[0] >= env.unwrapped.goal_position else '| ----'
            print('Epi: ', i_episode, get,
                  '| Ep_r: ', round(ep_r, 4),
                  '| Epsilon: ', round(RL.epsilon, 2))
        cost.append(cost_)
        density.append(density_)
        if find_target_node_:
            num_find_target += 1
            opt_cost.append(opt_cost_)
    return cost, density, num_find_target, opt_cost


if __name__ == "__main__":
    # r = input('times: ')
    r = '50000'
    save_list = [100, 50000]  # ,10000,50000,100000,200000,300000,400000,500000,600000,700000,800000,900000,1000000
    train = True
    env = envR(show=False)
    RL = QLearningTable(env.action_space, learning_rate=0.1)
    # step = 0
    # succ = 0
    # start = time.time()
    for episode in range(int(r)):
        pre_maps = env.reset()
        for i in range(100):
            action = RL.choose_action(str(pre_maps), train)
            reward, done, action_ = env.step(action)
            RL.learn(str(pre_maps), action, reward, str(env.get_maps()), done)
            pre_maps = env.get_maps()
        # fresh env
        env.render()
        # RL choose action based on observation
        action = RL.choose_action(str(observation))
        # RL take action and get next observation and reward
        observation_, reward, done = env.step(action)
        # RL learn from this transition
        RL.learn(str(observation), action, reward, str(observation_))
        # swap observation
        observation = observation_
        # break while loop when end of this episode
        if done:
            RL.save_q_table()  # persist the Q-table at the end of each episode
            break
    # end of game
    print('game over')
    print(RL.q_table)
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)), read_save=False)
    env.after(100, update)
    env.mainloop()
##    en.rfcomm5()
##    rfcomm_0.start()
##    rfcomm_1.start()
##    rfcomm_2.start()
##    rfcomm_3.start()
##    rfcomm_4.start()
##    rfcomm_5.start()
##    rfcomm_0.join()
##    rfcomm_1.join()
##    rfcomm_2.join()
##    rfcomm_3.join()
##    rfcomm_4.join()
##    rfcomm_5.join()
##    print('time :', time.time() - s_t)

env = Maze()
# one Q-learning table per agent
RL = QLearningTable(actions=list(range(env.n_actions)))
RL1 = QLearningTable(actions=list(range(env.n_actions)))
RL2 = QLearningTable(actions=list(range(env.n_actions)))
RL3 = QLearningTable(actions=list(range(env.n_actions)))
RL4 = QLearningTable(actions=list(range(env.n_actions)))
RL5 = QLearningTable(actions=list(range(env.n_actions)))

# Thread_1 = threading.Thread(target=updeee)
# env.mainloop()
# env.after(100, update)
# env.after(100, update1)
# Thread_0.start()
# Thread_1.start()
# env.mainloop()
# Thread_0.join()
# Thread_1.join()
env.after(100, update)
        # fresh env
        env.render()
        # RL choose action based on observation
        action = RL.choose_action(str(observation))
        # RL take action and get next observation and reward
        observation_, reward, done = env.step(action)
        # RL learn from this transition
        RL.learn(str(observation), action, reward, str(observation_))
        # swap observation
        observation = observation_
        # break while loop when end of this episode
        if done:
            break
    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    env.after(100, update)
    env.mainloop()
    RL.print_q_table()  # runs once the tkinter window has been closed
class App:
    def __init__(self, master):
        self.master = master
        # grid map setting
        self.grid_origx = 500
        self.grid_origy = 20
        self.grid_columnNum = 8
        self.grid_rowNum = 8
        self.grid_UNIT = 90
        self.maze_size = self.grid_columnNum * self.grid_rowNum
        # total training episodes
        self.episode = 1000
        # number of tests to run
        self.tests = 100
        # small delay (seconds) so tkinter keeps up; increase it for a
        # slower, easier-to-follow visualization during testing
        self.timeDelay = 0.005
        # other initialization
        self.n_actions = 4
        self.outline = 'black'
        self.fill = None
        self.item_type = 0
        self.learning = False
        self.itemsNum = 0
        self.epsilon = 0.9
        self.Qtable_origx = self.grid_origx + 20 + (self.grid_columnNum + 1) * self.grid_UNIT
        self.Qtable_origy = self.grid_origy
        self.grid_origx_center = self.grid_origx + self.grid_UNIT / 2
        self.grid_origy_center = self.grid_origy + self.grid_UNIT / 2
        self.Qtable_gridIndex_dict = {}
        self.show_q_table = pd.DataFrame(columns=list(range(self.n_actions)), dtype=np.float64)
        self.origDist = 10
        self.agentCentre = np.array([[190, 180], [290, 180], [390, 180]])
        self.warehouseCentre = self.agentCentre + np.array(
            [[0, self.grid_UNIT + self.origDist]] * 3)
        self.ObstacleCentre1 = np.array([[725, 515], [725, 335], [635, 695]])
        self.ObstacleCentre2 = np.array([[905, 245], [545, 245], [995, 605]])
        self.itemOrigPosition = []
        self.agentPosition_list = []
        self.warehousePostition_list = []
        self.ObstaclePosition_list = []
        self.WarehouseItemIndex = []
        self.agentItemIndex = []
        self.ObstacleItemIndex = []
        self.AllItemsOrigPosition_list = []
        self.createMark = None
        self.points = []
        self.cars_list = []
        self.selected_agent = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        self.selected_Obstacles = []
        self.selected_targets = []
        self.agent = 1
        self.target = 4
        self.hell1 = 7
        self.hell2 = 8
        self.init_widgets()
        self.temp_item = None
        self.temp_items = []
        self.choose_item = None
        self.created_line = []
        self.lines = []

    def resize(self, w, h, w_box, h_box, pil_image):
        """Resize a PIL image to the given box."""
        return pil_image.resize((w_box, h_box), Image.ANTIALIAS)

    def init_widgets(self):
        self.cv = Canvas(root, background='white')
        self.cv.pack(fill=BOTH, expand=True)
        # bind events of dragging with mouse
        self.cv.bind('<B1-Motion>', self.move)
        self.cv.bind('<ButtonRelease-1>', self.move_end)
        self.cv.bind("<Button-1>", self.leftClick_handler)
        # bind right-click
        self.cv.bind("<Button-3>", self.rightClick_handler)
        f = ttk.Frame(self.master)
        f.pack(fill=X)
        self.bns = []
        # initialize buttons
        for i, lb in enumerate(('Reset', 'Start training', 'Close', 'Save', 'Start Running')):
            bn = Button(f, text=lb, command=lambda i=i: self.choose_type(i))
            bn.pack(side=LEFT, ipadx=8, ipady=5, padx=5)
            self.bns.append(bn)
        self.bns[self.item_type]['relief'] = SUNKEN
        # initialize agent, warehouse and obstacle positions
        self.agentPosition_list = self.setItemsPositionList(self.agentCentre)
        self.warehousePostition_list = self.setItemsPositionList(self.warehouseCentre)
        self.ObstaclePosition_list1 = self.setItemsPositionList(self.ObstacleCentre1)
        # the original passed ObstacleCentre1 here too, which looks like a
        # copy-paste slip: the second group of obstacles is drawn at ObstacleCentre2
        self.ObstaclePosition_list2 = self.setItemsPositionList(self.ObstacleCentre2)
        self.ObstaclePosition_list = self.ObstaclePosition_list1 + self.ObstaclePosition_list2
        self.create_items()
        self.itemsNum = (self.warehouseCentre.shape[0] + self.ObstacleCentre1.shape[0]
                         + self.ObstacleCentre2.shape[0] + self.agentCentre.shape[0])
        R = self.grid_UNIT
        self.cv.create_text(self.agentCentre[0][0] - R - 20, self.agentCentre[0][1],
                            text="Agent:", font=('Courier', 18))
        self.cv.create_text(self.warehouseCentre[0][0] - R - 20, self.warehouseCentre[0][1],
                            text="Warehouse:", font=('Courier', 18))
        self.cv.create_text(self.grid_origx + 250, self.grid_origy - 50,
                            text="Single agent Q-Learning Simulation",
                            font=('Times', 38), fill='red')
        self.cv.create_text(self.grid_origx + 252, self.grid_origy - 52,
                            text="Single agent Q-Learning Simulation",
                            font=('Times', 38), fill='green')
        # draw grids
        self.create_grids(self.grid_origx, self.grid_origy,
                          self.grid_columnNum, self.grid_rowNum, self.grid_UNIT)
        for i in range(0, self.grid_rowNum):
            for j in range(0, self.grid_columnNum):
                x = i * self.grid_UNIT + self.grid_origx_center
                y = j * self.grid_UNIT + self.grid_origy_center
                rowIndex = (y - self.grid_origy_center) / self.grid_UNIT
                columnIndex = (x - self.grid_origx_center) / self.grid_UNIT
                self.Qtable_gridIndex_dict[(x, y)] = rowIndex * self.grid_columnNum + columnIndex
        print(self.Qtable_gridIndex_dict)

    def create_ObsItems(self):
        self.cv.arriveObsImage = []
        self.cv.bms_obs = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT
        # load the three obstacle images, each resized to one grid cell
        for name in ('obs5.jpg', 'obs7.jpg', 'obs8.jpg'):
            pil_image = Image.open(name)
            w, h = pil_image.size
            tk_image = ImageTk.PhotoImage(self.resize(w, h, w_box, h_box, pil_image))
            self.cv.bms_obs.append(tk_image)
        # reuse the same three images for the second group of obstacles
        self.cv.bms_obs.extend(self.cv.bms_obs[:3])
        self.cv.Obstacle = []
        index = 0
        for q in self.ObstacleCentre1:
            t = self.cv.create_image(q[0], q[1], image=self.cv.bms_obs[index])
            self.cv.Obstacle.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1
        for q in self.ObstacleCentre2:
            t = self.cv.create_image(q[0], q[1], image=self.cv.bms_obs[index])
            self.cv.Obstacle.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1
        # arrival picture
        pil_image = Image.open('obs5_car.jpg')
        w, h = pil_image.size
        tk_image = ImageTk.PhotoImage(self.resize(w, h, w_box, h_box, pil_image))
        self.cv.arriveObsImage.append(tk_image)

    def create_targetItems(self):
        self.cv.arriveImage = []
        self.cv.bms_wh = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT
        # load the warehouse images, each resized to one grid cell
        for name in ('warehouse4_1.jpg', 'warehouse3.jpg', 'warehouse4_2.jpg'):
            pil_image = Image.open(name)
            w, h = pil_image.size
            tk_image = ImageTk.PhotoImage(self.resize(w, h, w_box, h_box, pil_image))
            self.cv.bms_wh.append(tk_image)
        self.cv.warehouse = []
        index = 0
        for q in self.warehouseCentre:
            t = self.cv.create_image(q[0], q[1], image=self.cv.bms_wh[index])
            self.cv.warehouse.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1
        # arrival picture
        pil_image = Image.open('warehouse3_car.jpg')
        w, h = pil_image.size
        tk_image = ImageTk.PhotoImage(self.resize(w, h, w_box, h_box, pil_image))
        self.cv.arriveImage.append(tk_image)

    def create_agentItems(self):
        self.cv.bms = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT
        # load the agent (car) images, each resized to one grid cell
        for name in ('car9.jpg', 'car2.jpg', 'car8.jpg'):
            pil_image = Image.open(name)
            w, h = pil_image.size
            tk_image = ImageTk.PhotoImage(self.resize(w, h, w_box, h_box, pil_image))
            self.cv.bms.append(tk_image)
        self.cv.car = []
        index = 0
        for q in self.agentCentre:
            t = self.cv.create_image(q[0], q[1], image=self.cv.bms[index])
            self.cv.car.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1

    def setItemsPositionList(self, itemCentre):
        npTemp = np.hstack((itemCentre, itemCentre))
        h_u = self.grid_UNIT / 2
        npHalfUnit = np.array([-h_u, -h_u, h_u, h_u])
        hs = npHalfUnit
        for diam in range(1, itemCentre.shape[0]):
            hs = np.vstack((npHalfUnit, hs))
        return (npTemp - hs).tolist()

    def button_reset(self):
        time.sleep(self.timeDelay)
        if self.createMark is not None:
            self.cv.delete(self.createMark)
        for line in self.created_line:
            self.cv.delete(line)
        self.cv.coords(self.agent, self.selected_agent_position)
        return self.cv.coords(self.agent)

    def reset(self):
        """Reset the agent to a random valid location."""
        if self.lines != []:
            for line in self.lines:
                self.cv.delete(line)
        Obs_list = self.ObstaclePosition_list
        while True:
            new_loc = [
                random.randrange(self.grid_origx_center,
                                 self.grid_rowNum * self.grid_UNIT + self.grid_origx_center,
                                 self.grid_UNIT),
                random.randrange(self.grid_origy_center,
                                 self.grid_columnNum * self.grid_UNIT + self.grid_origy_center,
                                 self.grid_UNIT)
            ]
            if new_loc not in Obs_list:
                break
        self.cv.coords(self.selected_agent[0], new_loc)
        return self.cv.coords(self.selected_agent[0])

    def choose_best_action(self, state, terminal):
        """Choose the best action from the Q-table."""
        if terminal == self.cv.coords(self.target):
            state_action = self.q_table.loc[state]
            action = np.random.choice(
                state_action[state_action == np.max(state_action)].index)
            return int(action)

    def run(self):
        """Main function for running tests."""
        test = 0
        rewards = []
        action = -1
        observation = self.cv.coords(self.agent)
        done = 0
        total_reward = 0
        terminal = self.cv.coords(self.target)
        visited = [observation]
        # enhance_list = []
        win_count = 0
        while True:
            self.labelHello = Label(self.cv, text="Test:%s" % str(test),
                                    font=("Helvetica", 10), width=10,
                                    fg="blue", bg="white")
            self.labelHello.place(x=self.agentCentre[0][0] - 150,
                                  y=self.agentCentre[0][1] + 500, anchor=NW)
            time.sleep(self.timeDelay)
            action = self.choose_best_action(str(observation), terminal)
            observation_ = self.calcu_next_state(observation, action)
            reward = self.new_reward(observation_, observation)
            if observation_ in visited:
                reward -= 0.5
            else:
                visited.append(observation_)
            if done:
                observation_ = self.cv.coords(self.target)
            self.cv.coords(self.selected_agent[0], observation_)
            total_reward += reward
            if total_reward < -1:
                done = 1
            if done != 1:
                line = self.cv.create_line(
                    observation[0], observation[1],
                    observation_[0], observation_[1],
                    fill='red',                        # red arrow
                    arrow=LAST, arrowshape=(10, 20, 8),
                    dash=(4, 4)                        # dashed line
                )
                self.lines.append(line)
            observation = observation_
            if self.cv.coords(self.agent) == self.cv.coords(self.target):
                done = 1
            if done:
                action = -1
                visited = []
                total_reward += 1
                if total_reward == 1:
                    win_count += 1
                rewards.append(total_reward)
                total_reward = 0
                self.reset()
                done = 0
                observation = self.cv.coords(self.agent)
                test += 1
                if test > self.tests:
                    self.labelHello = Label(self.cv, text="running end!!",
                                            font=("Helvetica", 10), width=10,
                                            fg="red", bg="white")
                    self.labelHello.place(x=250, y=750, anchor=NW)
                    break
        print("win_count", win_count)
        plt.figure()
        plt.title('Score per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Score')
        plt.plot(rewards)
        plt.show()

    def render(self):
        time.sleep(self.timeDelay)

    def format_time(self, seconds):
        if seconds < 400:
            return "%.1f seconds" % float(seconds)
        elif seconds < 4000:
            return "%.2f minutes" % (seconds / 60.0)
        else:
            return "%.2f hours" % (seconds / 3600.0)

    def reward(self, s_, s):
        """Rewarding scheme for training."""
        self.target = self.selected_targets[0]
        if s_ == self.cv.coords(self.selected_targets[0]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMarkA = t
            reward = 1
            done = True
        elif s_ in self.selected_Obstacles_position:
            reward = -0.75
            done = False
        else:
            reward = -0.04
            done = False
        return reward, done

    def calcu_next_state(self, loc, action):
        """Calculate the next state from a location and an action."""
        UNIT = self.grid_UNIT
        ss = loc
        np_s = np.array(ss)
        dissS = np.array([self.grid_origx, self.grid_origy])
        s = (np_s - dissS).tolist()
        base_action = np.array([0, 0])
        if action == 0:    # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:  # down
            if s[1] < (self.grid_rowNum - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:  # right
            if s[0] < (self.grid_columnNum - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:  # left
            if s[0] > UNIT:
                base_action[0] -= UNIT
        return [ss[0] + base_action[0], ss[1] + base_action[1]]

    def new_reward(self, s_, s):
        """Rewarding scheme for testing."""
        if s_ == self.cv.coords(self.selected_targets[0]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMark = t
            reward = 0
        elif s_ in self.selected_Obstacles_position:
            reward = -2
        else:
            reward = 0
        return reward

    def update(self):
        """Main function for training."""
        self.RL = QLearningTable(actions=list(range(self.n_actions)),
                                 e_greedy=self.epsilon)
        episode = 0
        action = -1
        stepCount = 0
        total_reward_list = []
        avg_reward_list = []
        win_history = []
        observation = self.cv.coords(self.agent)
        visited = set()
        total_reward = 0
        start_time = datetime.datetime.now()
        self.labelHello = Label(self.cv, text="start training!",
                                font=("Helvetica", 10), width=10,
                                fg="red", bg="white")
        self.labelHello.place(x=200, y=750, anchor=NW)
        while True:
            self.labelHello = Label(self.cv, text="episode: %s" % str(episode),
                                    font=("Helvetica", 10), width=10,
                                    fg="blue", bg="white")
            self.labelHello.place(x=200, y=550, anchor=NW)
            self.render()
            visited.add(tuple(observation))
            stepCount += 1
            action = self.RL.choose_action(str(observation))
            observation_ = self.calcu_next_state(observation, action)
            reward, done = self.reward(observation_, observation)
            self.cv.coords(self.selected_agent[0], observation_)
            if tuple(observation_) in visited:
                reward -= 0.25
            if observation == observation_:
                reward = reward - 0.8
            if done:
                win_history.append(1)
            total_reward += reward
            if total_reward < -0.5 * 64:
                done = True
                win_history.append(0)
            self.RL.learn(str(observation), action, reward, str(observation_))
            # swap observation
            observation = observation_
            # break while loop when end of this episode
            if done:
                if episode > self.episode:
                    break
                else:
                    observation = self.reset()
                    dt = datetime.datetime.now() - start_time
                    t = self.format_time(dt.total_seconds())
                    total_reward_list.append(total_reward)
                    if len(total_reward_list) > 100:
                        avg_reward = sum(total_reward_list[-100:]) / 100
                        avg_reward_list.append(avg_reward)
                        template = ("Episode: {:03d}/{:d} | StepCount: {:d} | Win rate: {:.3f} | "
                                    "Total rewards: {:.3f} | Average rewards: {:.3f} | time: {}")
                        print(template.format(episode, self.episode, stepCount,
                                              sum(win_history) / len(win_history),
                                              total_reward, avg_reward, t))
                    else:
                        template = ("Episode: {:03d}/{:d} | StepCount: {:d} | Win rate: {:.3f} | "
                                    "Total rewards: {:.3f} | time: {}")
                        print(template.format(episode, self.episode, stepCount,
                                              sum(win_history) / len(win_history),
                                              total_reward, t))
                    episode += 1
                    stepCount = 0
                    total_reward = 0
                    visited = set()
                    done = 0
        # end of training
        print('training over!')
        self.labelHello = Label(self.cv, text="training end!",
                                font=("Helvetica", 10), width=10,
                                fg="red", bg="white")
        self.labelHello.place(x=200, y=750, anchor=NW)
        print("total_win_rate", sum(win_history) / len(win_history))
        print("total_time", t)
        print("average rewards per episode",
              sum(total_reward_list) / len(total_reward_list))
        self.learning = False
        self.reset()
        plt.figure()
        plt.title('Rewards per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(total_reward_list)
        plt.show()
        plt.figure()
        plt.title('Average Rewards over 100 Episodes')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(avg_reward_list)
        plt.show()

    def create_items(self):
        self.AllItemsOrigPosition_list.append([0, 0, 0, 0])
        self.create_agentItems()
        self.agentItemIndex = [1, len(self.agentPosition_list)]
        self.create_targetItems()
        self.WarehouseItemIndex = [
            self.agentItemIndex[1] + 1,
            self.agentItemIndex[1] + len(self.warehousePostition_list)
        ]
        self.create_ObsItems()
        self.ObstacleItemIndex = [
            self.WarehouseItemIndex[1] + 1,
            self.WarehouseItemIndex[1] + len(self.ObstaclePosition_list)
        ]

    def create_grids(self, origx, origy, column, row, UNIT):
        # draw the vertical and horizontal grid lines
        for c in range(origx, origx + (column + 1) * UNIT, UNIT):
            x0, y0, x1, y1 = c, origy, c, origy + row * UNIT
            self.cv.create_line(x0, y0, x1, y1, width=2)
        for r in range(origy, origy + (row + 1) * UNIT, UNIT):
            x0, y0, x1, y1 = origx, r, origx + row * UNIT, r
            self.cv.create_line(x0, y0, x1, y1, width=2)

    def choose_type(self, i):
        """Handle clicks on the different buttons."""
        for b in self.bns:
            b['relief'] = RAISED
        self.bns[i]['relief'] = SUNKEN
        self.item_type = i
        if self.item_type == 1:    # start training
            self.start_learning()
            self.bns[i]['relief'] = RAISED
        elif self.item_type == 2:  # close simulation tool
            os._exit(0)
        elif self.item_type == 3:  # save q_table
            temp_s = str(self.cv.coords(self.target)) + str(self.selected_Obstacles_position)
            self.RL.q_table.to_csv("single_qtable_%s.csv" % temp_s,
                                   index_label="index_label")
            print("SAVED!!!")
            self.labelHello = Label(self.cv, text="table saved!!",
                                    font=("Helvetica", 10), width=10,
                                    fg="red", bg="white")
            self.labelHello.place(x=350, y=750, anchor=NW)
        elif self.item_type == 0:
            self.button_reset()
        elif self.item_type == 4:  # start running tests
            self.start_running()
        elif self.item_type == 5:
            self.restart()

    def start_learning(self):
        """Initialization for the training process."""
        self.selected_agent = []
        self.selected_targets = []
        self.selected_Obstacles = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        for item in range(1, self.itemsNum + 1):
            p = self.cv.coords(item)
            if p[0] >= self.grid_origx and p[1] >= self.grid_origy:
                if item in range(self.agentItemIndex[0], self.agentItemIndex[1] + 1):
                    self.selected_agent.append(item)
                    self.selected_agent_position = p
                elif item in range(self.WarehouseItemIndex[0], self.WarehouseItemIndex[1] + 1):
                    self.selected_targets.append(item)
                elif item in range(self.ObstacleItemIndex[0], self.ObstacleItemIndex[1] + 1):
                    self.selected_Obstacles.append(item)
                    self.selected_Obstacles_position.append(p)
        if len(self.selected_agent) != 1:
            tkinter.messagebox.showinfo("INFO", "Please choose ONE agent for training!")
        elif len(self.selected_targets) != 1:
            tkinter.messagebox.showinfo("INFO", "Please choose ONE target for training!")
        else:
            self.agent = self.selected_agent[0]
            self.target = self.selected_targets[0]
            self.t = threading.Timer(self.timeDelay, self.update)
            self.t.start()
            self.learning = True

    def start_running(self):
        """Initialization for testing."""
        self.selected_agent = []
        self.selected_targets = []
        self.selected_Obstacles = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        self.selected_targets_position = []
        for item in range(1, self.itemsNum + 1):
            p = self.cv.coords(item)
            if p[0] >= self.grid_origx and p[1] >= self.grid_origy:
                if item in range(self.agentItemIndex[0], self.agentItemIndex[1] + 1):
                    self.selected_agent.append(item)
                    self.selected_agent_position = p
                elif item in range(self.WarehouseItemIndex[0], self.WarehouseItemIndex[1] + 1):
                    self.selected_targets.append(item)
                    self.selected_targets_position = p
                elif item in range(self.ObstacleItemIndex[0], self.ObstacleItemIndex[1] + 1):
                    self.selected_Obstacles.append(item)
                    self.selected_Obstacles_position.append(p)
        if len(self.selected_agent) != 1:
            tkinter.messagebox.showinfo("INFO", "Please place ONE agent on the map!")
        elif len(self.selected_targets) != 1:
            tkinter.messagebox.showinfo("INFO", "Please choose ONE terminal!")
        else:
            self.agent = self.selected_agent[0]
            self.target = self.selected_targets[0]
            # load Q table
            terminal_str = (str(self.selected_targets_position)
                            + str(self.selected_Obstacles_position) + 'episode3000')
            self.q_table = pd.read_csv("table terminal%s.csv" % terminal_str, index_col=0)
            self.t = threading.Timer(self.timeDelay, self.run)
            self.t.start()
            self.learning = True

    def rightClick_handler(self, event):
        self.start_learning()

    def leftClick_handler(self, event):
        """Handle selecting an item on the canvas."""
        if self.learning:
            print("Learning in progress!")
        else:
            for item in range(1, self.itemsNum + 1):
                position = self.cv.coords(item)
                R = self.grid_UNIT / 2
                p = [position[0] - R, position[1] - R,
                     position[0] + R, position[1] + R]
                if p[0] <= event.x <= p[2] and p[1] <= event.y <= p[3]:
                    self.choose_item_handler(event, item)

    def choose_item_handler(self, event, t):
        self.choose_item = t
        self.itemOrigPosition = self.cv.coords(t)

    def move(self, event):
        if self.choose_item is not None:
            self.cv.coords(self.choose_item, event.x, event.y)

    def adjust_items_into_grids(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            position = self.cv.coords(t)
            centerX = position[0]
            centerY = position[1]
            Grids_X0 = self.grid_origx
            Grids_X1 = self.grid_origx + (self.grid_columnNum + 1) * self.grid_UNIT
            Grids_Y0 = self.grid_origy
            Grids_Y1 = self.grid_origy + (self.grid_rowNum + 1) * self.grid_UNIT
            if (centerX in range(Grids_X0, Grids_X1)) and (centerY in range(Grids_Y0, Grids_Y1)):
                columnIndex = math.floor((centerX - Grids_X0) / self.grid_UNIT)
                rowIndex = math.floor((centerY - Grids_Y0) / self.grid_UNIT)
                adjustedX0 = Grids_X0 + columnIndex * self.grid_UNIT + self.grid_UNIT / 2
                adjustedY0 = Grids_Y0 + rowIndex * self.grid_UNIT + self.grid_UNIT / 2
                self.cv.coords(t, adjustedX0, adjustedY0)
            else:
                # return to the original position if not dropped near the grid
                self.cv.coords(t, self.AllItemsOrigPosition_list[t])
                self.itemOrigPosition = []

    def move_end(self, event):
        if self.choose_item is not None:
            self.adjust_items_into_grids(event)
            self.choose_item = None

    def delete_item(self, event):
        if self.choose_item is not None:
            self.cv.delete(self.choose_item)
        # break while loop when end of this episode
        if done:
            reward_list.append(r)
            break
    # end of game
    print('game over')
    # env.destroy()


if __name__ == "__main__":
    env = MDP_env()
    n_actions = 2
    n_features = 12
    reward_list = []
    RL = QLearningTable(n_features, actions=list(range(n_actions)))
    episode_memories = defaultdict(list)
    update()
    av_reward = [np.mean(reward_list[0:i + 1]) for i in range(len(reward_list))]
    plt.plot(np.arange(len(reward_list)), av_reward)
    plt.show()
    np.set_printoptions(suppress=True)
    RL.q_table['a'] = RL.q_table.idxmax(axis=1)
    print(np.hstack([
        np.arange(6).reshape(6, 1),
        RL.q_table.sort_index(axis=0, ascending=True).values
    ]))
    print("")
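# The list comprehension above recomputes a mean per element, which is
# O(n^2) in the episode count. Assuming reward_list holds plain numbers, the
# same running-average curve can be produced in O(n) with a cumulative sum:
import numpy as np

rewards = np.asarray(reward_list, dtype=float)
av_reward = np.cumsum(rewards) / np.arange(1, len(rewards) + 1)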
for i in range(0, int(num_days) * 10):
    use_new_table = (i == 0)  # only the very first run starts from a fresh table
    Text_Table = Text
    if day % 10 == 0:
        learn_single_day = False
        epsilon = 0.1
        start_day = very_start_day
        end_day = day
    else:
        learn_single_day = True
        epsilon = 1
        start_day = day - 5
        end_day = start_day + 10
    Text = ''.join(str(elem) for elem in Text_list)
    print("Day: ", str(day / 10),
          ", Exp name: ", Text,
          ", Use new Table: ", use_new_table,
          ", epsilon: ", epsilon,
          ", Train(False)/Test(True): ", learn_single_day)
    time.sleep(5)
    RL = QLearningTable(actions=list(range(tot_action)),
                        Text=Text,
                        Text_Table=Text_Table,
                        use_new_table=use_new_table,
                        epsilon=epsilon)
    update(start_day, end_day)
    day += 5
    Text_list[-1] = day
        else:
            is_hell = False
        step_num += 1
        # RL learn from this transition
        RL.learn(str(observation), action, reward, str(observation_))
        # swap observation
        observation = observation_
        # break while loop when end of this episode
        if done:
            break
    # end of game
    print('game over')
    env.destroy()  # after the final episode, tear down the environment
    # related reference articles (in Chinese):
    # https://cloud.tencent.com/developer/article/1148483
    # https://blog.csdn.net/duanyajun987/article/details/78614902


if __name__ == "__main__":
    env = Maze()  # initialize and create the environment with tkinter
    RL = QLearningTable(actions=list(range(env.n_actions)))  # define the RL agent and initialize its parameters
    env.after(100, update)  # call update() after 100 ms
    env.mainloop()
    # update()  # calling update() here, after mainloop(), would not work
if __name__ == "__main__":
    df_re = pd.read_csv(os.path.dirname(os.getcwd()) + "/dataset/" + configuration.CITY
                        + '_public_node_relation.csv', encoding='utf-8')
    df_co = pd.read_csv(os.path.dirname(os.getcwd()) + "/dataset/" + configuration.CITY
                        + '_node&tel.csv', encoding='utf-8')
    x = df_co['lon'].round(decimals=6).tolist()
    y = df_co['lat'].round(decimals=6).tolist()
    cross_relation = tools.get_cross_info(df_re)
    cross_info = df_co.values.tolist()
    next_state_list, distance_list, action_list, tel_list = tools.get_details(cross_relation)
    # TODO: Start_Point & End_Point still need to be supplied as inputs
    for i in range(166, 288):
        np.random.seed(i)
        start_point = np.random.randint(0, 800)
        end_point = np.random.randint(801, 1725)
        RL = QLearningTable(ACTIONS)
        env = Cross(next_state_list, action_list, distance_list,
                    start_point, end_point, cross_info)
        q_table = update(env, start_point, end_point)
        q_table.to_csv(os.getcwd() + '/table/' + configuration.CITY + '_'
                       + str(start_point) + '_' + str(end_point) + '_' + 'q_table.csv',
                       encoding="utf-8")
    # df_q_tb = pd.read_csv(os.getcwd() + '/table/0_363_q_table.csv', encoding='utf-8')
    # q_table = df_q_tb.values.tolist()
def update():
    for episode in range(100):
        # initial observation (left blank in the original source; the line
        # below is an assumption that the model exposes a reset())
        observation = Model.reset()
        while True:
            # RL choose action based on observation
            action = RL.choose_action(str(observation))
            # RL take action and get next observation and reward
            observation_, reward, done = handle.ApplyForceOnJoint(action)
            # RL learn from this transition
            RL.learn(str(observation), action, reward, str(observation_))
            # swap observation
            observation = observation_
            # break while loop when end of this episode
            if done:
                break
    # end of game
    print('Model_ready')


if __name__ == "__main__":
    Model = handle()
    RL = QLearningTable(actions=list(range(Model.n_actions)))
    update()
import numpy as np

import environment
from RL_brain import QLearningTable

env = Maze()
# the original constructed the agent twice with identical arguments;
# one construction suffices
RL = QLearningTable(actions=list(range(env.n_actions)))

N = 20
dt = 2 * np.pi / N
ep_max = 500
fidelity = np.zeros(ep_max)
fid_10 = 0

for episode in range(ep_max):
    observation = env.reset()
    while True:
        action = RL.choose_action(str(observation))
        observation_, reward, done, fid = env.step(action)
        RL.learn(str(observation), action, reward, str(observation_))
        observation = observation_
        if done:
            # keep the best fidelity seen over the last ~10 episodes
            if episode >= ep_max - 11:
                fid_10 = max(fid_10, fid)
            break

print('Final_fidelity=', fid_10)
# -*- coding: utf-8 -*-
from maze_env import Maze
from RL_brain import QLearningTable


def update():
    for episode in range(100):
        observation = env.reset()
        while True:
            env.render()
            action = RL.choose_action(str(observation))
            observation_, reward, done = env.step(action)
            RL.learn(str(observation), action, reward, str(observation_))
            observation = observation_
            if done:
                break
    print('game over')  # Python 3 print; the original used the Python 2 statement form
    env.destroy()


env = Maze()
RL = QLearningTable(actions=list(range(env.n_actions)))
env.after(100, update)
env.mainloop()
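# Every snippet in this section drives the same kind of tabular agent through
# choose_action/learn and a pandas q_table. For reference, a minimal sketch
# of a QLearningTable consistent with those calls; this is an illustrative
# reconstruction of a Morvan-style tabular Q-learning agent, not the
# original RL_brain implementation:
import numpy as np
import pandas as pd

class QLearningTable:
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions                      # list of action labels
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy                     # probability of exploiting
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def check_state_exist(self, state):
        # lazily add unseen states as all-zero rows
        if state not in self.q_table.index:
            self.q_table.loc[state] = [0.0] * len(self.actions)

    def choose_action(self, observation):
        self.check_state_exist(observation)
        if np.random.uniform() < self.epsilon:
            # exploit: pick among the max-valued actions at random
            state_action = self.q_table.loc[observation, :]
            action = np.random.choice(
                state_action[state_action == np.max(state_action)].index)
        else:
            # explore: pick a random action
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r                            # terminal state has no future value
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)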
from matplotlib import rcParams

rcParams['font.family'] = 'serif'
rcParams['font.serif'] = ['Times New Roman']

# ############################### read data ###############################
df = pd.read_csv('YearData.csv')
D = df['demand'].values
P = df['Price Data'].values

for BAR_POR in [1]:
    B = 8820 * BAR_POR
    XIN = 0.7
    FUL = 1
    W = 2
    LEVEL = np.array([20, 2000, 2000])
    RL = QLearningTable(actions=list(range(3)))
    BF = BFramework(XIN, FUL, LEVEL, B, W)
    socour_o, socrl_o, socmpc_o, socnos_o, socthb_o, socofl_o = BF.general_performance_sys(D, P)
    socour_ori = socour_o[0, :, :]
    socrl_ori = socrl_o[0, :, :]
    socmpc_ori = socmpc_o[0, :, :]
    socnos_ori = socnos_o[0, :, :]
    socthb_ori = socthb_o[0, :, :]
    socofl_ori = socofl_o[0, :, :]
    socour = np.zeros((731, 2))
    socrl = np.zeros((731, 2))
    socmpc = np.zeros((731, 2))
    socnos = np.zeros((731, 2))
    print('update')
    for episode in range(100):
        # initialize observation
        observation = env.reset()
        while True:
            # fresh env
            env.render()
            action = RL.choose_action(str(observation))
            observation_, reward, done = env.step(action)
            RL.learn(str(observation), action, reward, str(observation_))
            observation = observation_
            if done:
                break
    print('game over')
    env.destroy()


if __name__ == '__main__':
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)), e_greedy=0.9)
    print(RL.q_table)
    env.after(100, update)
    env.mainloop()