Example no. 1
    def pathplanning(self):
        global root
        global view
        RL = QLearningTable(actions=list(range(self.n_actions)),
                            learning_rate=self._learningrate,
                            reward_decay=self._discountfactor,
                            e_greedy=self._egreedy)
        # update qtable
        self.currentqtable = str(RL.q_table)
        for episode in range(self._maxepisode):
            # update episode
            self.currentepisode = episode + 1

            # reset
            self._robot = self._start.copy()
            # initialize observation
            observation = str(self._robot)
            time.sleep(1)

            while True:
                # record the final path
                if (episode == self._maxepisode - 1):
                    self.finalpath.append(
                        str("(" + str(int(self._robot[0])) + "," +
                            str(int(self._robot[1])) + ")"))

                # choose action
                action = RL.choose_action(observation)
                # get new observation
                next_observation, reward, done = self.step(action)
                # learn from this observation
                RL.learn(observation, action, reward, next_observation)
                # update observation
                observation = next_observation

                # update qtable
                self.currentqtable = str(RL.q_table)
                # sleep for qml's update
                time.sleep(0.2)
                # print("#######")
                if done:
                    break
            # print(self.finalpath)
        self.isfinalpath = True
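
All of these examples drive a tabular QLearningTable agent. Example no. 2 imports it from RL_brain, the module name used in Morvan Zhou's public Q-learning tutorials; the other examples construct project-specific variants (some pass a done flag to learn(), add save_q_table(), or put the environment's step() on the agent itself). The sketch below shows only the common epsilon-greedy, tabular-update core under that assumption; it is an illustration, not the exact class behind any one example.

import numpy as np
import pandas as pd


class QLearningTable:
    """Minimal tabular Q-learning agent (sketch only; the examples extend or vary it)."""

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions          # list of action labels (e.g. range(n_actions))
        self.lr = learning_rate         # alpha
        self.gamma = reward_decay       # discount factor
        self.epsilon = e_greedy         # probability of acting greedily
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def check_state_exist(self, state):
        # lazily add a zero-initialized row the first time a state is seen
        if state not in self.q_table.index:
            self.q_table.loc[state] = [0.0] * len(self.actions)

    def choose_action(self, observation):
        self.check_state_exist(observation)
        if np.random.uniform() < self.epsilon:
            # exploit: pick uniformly among the highest-valued actions
            state_action = self.q_table.loc[observation, :]
            action = np.random.choice(
                state_action[state_action == np.max(state_action)].index)
        else:
            # explore: pick a random action
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)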
Example no. 2
def main():
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))

    for episode in range(100):
        if episode % 200 == 0:
            RL.save_q_table()
        # initial observation
        observation = env.reset()
        counter = 0

        while True:
            # fresh env
            env.render()
            print("Round: " + str(counter))

            # RL choose action based on observation
            action = RL.choose_action(observation)

            # RL take action and get next observation & reward
            observation_, reward, done = env.step(action)

            # RL learn from this transition
            RL.learn(observation, action, reward, observation_, done)

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                # RL.save_q_table()
                break
            else:
                time.sleep(1)
                counter += 1

        # end game
        print("end game")

    # save q_table
    RL.save_q_table()
Example no. 3
def update():
    start = time.time()
    RL = QLearningTable(n_states=nodes_num,
                        each_services_nums=each_services_nums,
                        max_services_num=max_services_num,
                        nodeSet_file=nodeSet_file,
                        conf_file=conf_file,
                        learning_rate=ALPHA,
                        reward_decay=GAMMA,
                        e_greedy=EPSILON)
    max_reward = 0

    for episode in range(MAX_EPISODES):
        # initial observation
        state = 0
        # print("episode = {}".format(episode))

        while True:
            # RL choose action based on observation
            action = RL.choose_action(state)

            # RL take action and get next observation and reward
            state_, reward, done = RL.step(state, action)

            # print("s = {0}, a = {1}, s_ = {2}, reward = {3}".format(
            #     state, action, state_, reward
            # ))

            # RL learn from this transition
            RL.learn(state, action, reward, state_)

            # swap observation
            state = state_

            # break while loop when end of this episode
            if done:
                # print("services = {0}, reward = {1}, runtime = {2}, episode = {3} ".format
                #       (RL.choose_services, reward, time.time()-start, episode))
                if episode == 0:
                    max_reward = reward
                else:
                    if reward > max_reward:
                        max_reward = reward
                        print(
                            "services = {0}, reward = {1}, runtime = {2}, episode = {3} "
                            .format(RL.choose_services, reward,
                                    time.time() - start, episode))
                        line = [x for x in RL.choose_services]
                        line.append(reward)
                        line.append(time.time() - start)
                        line.append(episode)
                        # print(line)
                        fp = open(outfile, 'a+')
                        fp.write(str(line) + '\n')
                        fp.close()
                    else:
                        if episode % 100 == 0:
                            print("episode = {}".format(episode))
                break

        # termination condition
        if episode >= ERROR_COUNT:
            del judge_list[0]
        judge_list.append(reward)

        if episode >= 1000 and episode % ERROR_COUNT == 0:
            if max(judge_list) - min(judge_list) <= ERROR_RANGE:
                output = "\n  Convergence condition met; stopping the experiment early!\n"
                line = [x for x in RL.choose_services]
                line.append(reward)
                line.append(time.time() - start)
                line.append(episode)
                # print the convergence result
                print(output)
                print(line)
                # record the convergence result
                fp = open(outfile, 'a+')
                fp.write(output)
                fp.write(str(line) + '\n')
                fp.close()
                break

    print('game over')
Example no. 4
from environment import Maze
from RL_brain import QLearningTable
import numpy as np

env = Maze()
RL = QLearningTable(actions=list(range(env.n_actions)))

N = 20
dt = 2 * np.pi / N
ep_max = 500
fidelity = np.zeros(ep_max)

fid_10 = 0
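# train for ep_max episodes; fid_10 tracks the best fidelity reached over the final episodes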
for episode in range(ep_max):
    observation = env.reset()
    while True:

        action = RL.choose_action(str(observation))
        observation_, reward, done, fid = env.step(action)
        RL.learn(str(observation), action, reward, str(observation_))
        observation = observation_
        if done:
            if episode >= ep_max - 11:
                fid_10 = max(fid_10, fid)

            break

print('Final_fidelity=', fid_10)
Example no. 5
class App:
    def __init__(self, master):
        self.master = master

        #        grid map setting
        self.grid_origx = 500
        self.grid_origy = 20
        self.grid_columnNum = 8
        self.grid_rowNum = 8
        self.grid_UNIT = 90
        self.maze_size = self.grid_columnNum * self.grid_rowNum
        #        define total training episodes
        self.episode = 1000
        #        define number of tests to run
        self.tests = 100
        #        set a small delay (in seconds) to make sure tkinter works properly
        #        for a slower visualization during testing, set the delay to a larger value
        self.timeDelay = 0.005

        #       other initialization
        self.n_actions = 4
        self.outline = 'black'
        self.fill = None
        self.item_type = 0
        self.learning = False
        self.itemsNum = 0
        self.epsilon = 0.9
        self.Qtable_origx = self.grid_origx + 20 + (self.grid_columnNum +
                                                    1) * self.grid_UNIT
        self.Qtable_origy = self.grid_origy
        self.grid_origx_center = self.grid_origx + self.grid_UNIT / 2
        self.grid_origy_center = self.grid_origy + self.grid_UNIT / 2
        self.Qtable_gridIndex_dict = {}
        self.show_q_table = pd.DataFrame(columns=list(range(self.n_actions)),
                                         dtype=np.float64)
        self.origDist = 10
        self.agentCentre = np.array([[190, 180], [290, 180], [390, 180]])
        self.warehouseCentre = self.agentCentre+np.array([[0,self.grid_UNIT+self.origDist],\
                                [0,self.grid_UNIT+self.origDist],[0,self.grid_UNIT+self.origDist]])
        self.ObstacleCentre1 = np.array([[725, 515], [725, 335], [635, 695]])

        self.ObstacleCentre2 = np.array([[905, 245], [545, 245], [995, 605]])
        self.itemOrigPosition = []
        self.agentPosition_list = []
        self.warehousePostition_list = []
        self.ObstaclePosition_list = []
        self.WarehouseItemIndex = []
        self.agentItemIndex = []
        self.ObstacleItemIndex = []
        self.AllItemsOrigPosition_list = []
        self.createMark = None
        self.points = []
        self.cars_list = []
        self.selected_agent = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        self.selected_Obstacles = []
        self.selected_targets = []
        self.agent = 1
        self.target = 4
        self.hell1 = 7
        self.hell2 = 8
        self.init_widgets()
        self.temp_item = None
        self.temp_items = []
        self.choose_item = None
        self.created_line = []
        self.lines = []

    def resize(self, w, h, w_box, h_box, pil_image):
        """resize a pil_image to (w_box, h_box)"""
        return pil_image.resize((w_box, h_box), Image.ANTIALIAS)

    def init_widgets(self):

        self.cv = Canvas(root, background='white')
        self.cv.pack(fill=BOTH, expand=True)
        # bind events of dragging with mouse
        self.cv.bind('<B1-Motion>', self.move)
        self.cv.bind('<ButtonRelease-1>', self.move_end)
        self.cv.bind("<Button-1>", self.leftClick_handler)

        # bind right-click event
        self.cv.bind("<Button-3>", self.rightClick_handler)
        f = ttk.Frame(self.master)
        f.pack(fill=X)
        self.bns = []

        # initialize buttons
        for i, lb in enumerate(
            ('Reset', 'Start training', 'Close', 'Save', 'Start Running')):
            bn = Button(f, text=lb, command=lambda i=i: self.choose_type(i))
            bn.pack(side=LEFT, ipadx=8, ipady=5, padx=5)
            self.bns.append(bn)
        self.bns[self.item_type]['relief'] = SUNKEN

        #initialize agent, warehouses and obstacles positions
        self.agentPosition_list = self.setItemsPositionList(self.agentCentre)
        self.warehousePostition_list = self.setItemsPositionList(
            self.warehouseCentre)
        self.ObstaclePosition_list1 = self.setItemsPositionList(
            self.ObstacleCentre1)
        self.ObstaclePosition_list2 = self.setItemsPositionList(
            self.ObstacleCentre2)
        self.ObstaclePosition_list = self.ObstaclePosition_list1 + self.ObstaclePosition_list2
        self.create_items()
        self.itemsNum = self.warehouseCentre.shape[
            0] + self.ObstacleCentre1.shape[0] + self.ObstacleCentre2.shape[
                0] + self.agentCentre.shape[0]
        R = self.grid_UNIT
        self.cv.create_text(self.agentCentre[0][0]-R-20,self.agentCentre[0][1],\
                            text = "Agent:",font=('Courier',18))
        self.cv.create_text(self.warehouseCentre[0][0]-R-20,self.warehouseCentre[0][1],\
                            text = "Warehouse:",font=('Courier',18))
        self.cv.create_text(self.grid_origx+250,self.grid_origy-50, text = "Single agent Q-Learning Simulation",\
                            font=('Times',38),fill = 'red')
        self.cv.create_text(self.grid_origx+252,self.grid_origy-52, text = "Single agent Q-Learning Simulation",\
                            font=('Times',38),fill = 'green')

        #draw grids
        self.create_grids(self.grid_origx, self.grid_origy,
                          self.grid_columnNum, self.grid_rowNum,
                          self.grid_UNIT)

        for i in range(0, self.grid_rowNum):
            for j in range(0, self.grid_columnNum):
                x = i * self.grid_UNIT + self.grid_origx_center
                y = j * self.grid_UNIT + self.grid_origy_center
                rowIndex = (y - self.grid_origy_center) / self.grid_UNIT
                columnIndex = (x - self.grid_origx_center) / self.grid_UNIT
                self.Qtable_gridIndex_dict[(
                    x, y)] = rowIndex * self.grid_columnNum + columnIndex

        print(self.Qtable_gridIndex_dict)

    def create_ObsItems(self):
        self.cv.arriveObsImage = []
        self.cv.bms_obs = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT

        pil_image = Image.open('obs5.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image1 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image1)

        pil_image = Image.open('obs7.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image2 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image2)

        pil_image = Image.open('obs8.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image3 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image3)

        self.cv.bms_obs.append(tk_image1)
        self.cv.bms_obs.append(tk_image2)
        self.cv.bms_obs.append(tk_image3)

        self.cv.Obstacle = []
        index = 0
        for q in self.ObstacleCentre1:
            bm = self.cv.bms_obs[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.Obstacle.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1
        for q in self.ObstacleCentre2:
            bm = self.cv.bms_obs[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.Obstacle.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1

        #arriving picture
        pil_image = Image.open('obs5_car.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.arriveObsImage.append(tk_image)

    def create_targetItems(self):
        self.cv.arriveImage = []
        self.cv.bms_wh = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT

        pil_image = Image.open('warehouse4_1.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)

        pil_image = Image.open('warehouse3.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)

        pil_image = Image.open('warehouse4_2.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)

        self.cv.warehouse = []
        index = 0
        for q in self.warehouseCentre:
            bm = self.cv.bms_wh[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.warehouse.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1

        #arriving picture
        pil_image = Image.open('warehouse3_car.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.arriveImage.append(tk_image)

    def create_agentItems(self):
        self.cv.bms = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT

        pil_image = Image.open('car9.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)

        pil_image = Image.open('car2.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)

        pil_image = Image.open('car8.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)

        self.cv.car = []
        index = 0
        for q in self.agentCentre:
            bm = self.cv.bms[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.car.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1

    def setItemsPositionList(self, itemCentre):
        npTemp = np.hstack((itemCentre, itemCentre))
        #        print("npTemp=",npTemp)
        h_u = self.grid_UNIT / 2
        npHalfUnit = np.array([-h_u, -h_u, h_u, h_u])
        hs = npHalfUnit
        for diam in range(1, itemCentre.shape[0]):
            hsTemp = np.vstack((npHalfUnit, hs))
            hs = hsTemp


        # print("hs=", hs)
        return (npTemp - hs).tolist()

    def button_reset(self):
        time.sleep(self.timeDelay)
        if self.createMark is not None:
            self.cv.delete(self.createMark)
        for line in self.created_line:
            self.cv.delete(line)
        self.cv.coords(self.agent, self.selected_agent_position)

        coords = self.cv.coords(self.agent)
        return coords

    def reset(self):
        """
        reset the agent to a random valid location
        """
        if self.lines != []:
            for line in self.lines:
                self.cv.delete(line)
        Obs_list = self.ObstaclePosition_list
        while True:
            new_loc = [
                random.randrange(
                    self.grid_origx_center,
                    self.grid_rowNum * self.grid_UNIT + self.grid_origx_center,
                    self.grid_UNIT),
                random.randrange(
                    self.grid_origy_center,
                    self.grid_columnNum * self.grid_UNIT +
                    self.grid_origy_center, self.grid_UNIT)
            ]
            if new_loc not in Obs_list:
                break
        self.cv.coords(self.selected_agent[0], new_loc)
        coords = self.cv.coords(self.selected_agent[0])
        return coords

    def choose_best_action(self, state, terminal):
        """
        choose best action from Q_table
        """
        # the Q-table loaded in start_running() already corresponds to the selected terminal
        q_table = self.q_table
        state_action = q_table.loc[state]
        action = np.random.choice(
            state_action[state_action == np.max(state_action)].index)
        return int(action)

    def run(self):
        """
        main function for running tests
        """
        test = 0
        rewards = []
        action = -1
        observation = self.cv.coords(self.agent)
        done = 0
        total_reward = 0
        terminal = self.cv.coords(self.target)
        visited = [observation]
        #        enhance_list = []
        win_count = 0
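        # run greedy test episodes with the loaded Q-table until self.tests episodes finish,
        # then plot the score per episode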

        while True:
            self.labelHello = Label(self.cv,
                                    text="Test:%s" % str(test),
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="blue",
                                    bg="white")
            self.labelHello.place(x=self.agentCentre[0][0] - 150,
                                  y=self.agentCentre[0][1] + 500,
                                  anchor=NW)
            time.sleep(self.timeDelay)

            action = self.choose_best_action(str(observation), terminal)
            observation_ = self.calcu_next_state(observation, action)
            reward = self.new_reward(observation_, observation)

            if observation_ in visited:
                reward -= 0.5
            else:
                visited.append(observation_)

            if done:
                observation_ = self.cv.coords(self.target)

            self.cv.coords(self.selected_agent[0], observation_)
            total_reward += reward

            if total_reward < -1:
                done = 1

            if done != 1:
                line = self.cv.create_line(
                    observation[0],
                    observation[1],
                    observation_[0],
                    observation_[1],
                    fill='red',
                    arrow=LAST,
                    arrowshape=(10, 20, 8),  # red
                    dash=(4, 4)  # dashed line
                )
                self.lines.append(line)

            observation = observation_
            if self.cv.coords(self.agent) == self.cv.coords(self.target):
                done = 1

            if done:
                action = -1
                visited = []
                total_reward += 1
                if total_reward == 1:
                    win_count += 1
                rewards.append(total_reward)
                total_reward = 0
                self.reset()
                done = 0
                observation = self.cv.coords(self.agent)
                test += 1
            if test > self.tests:
                self.labelHello = Label(self.cv,
                                        text="running end!!",
                                        font=("Helvetica", 10),
                                        width=10,
                                        fg="red",
                                        bg="white")
                self.labelHello.place(x=250, y=750, anchor=NW)
                break
        print("win_count", win_count)
        plt.figure()
        plt.title('Score per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Score')
        plt.plot(rewards)
        plt.show()

    def render(self):
        time.sleep(self.timeDelay)

    def format_time(self, seconds):
        if seconds < 400:
            s = float(seconds)
            return "%.1f seconds" % (s, )
        elif seconds < 4000:
            m = seconds / 60.0
            return "%.2f minutes" % (m, )
        else:
            h = seconds / 3600.0
            return "%.2f hours" % (h, )

    def reward(self, s_, s):
        """
        rewarding scheme
        """
        self.target = self.selected_targets[0]
        if s_ == self.cv.coords(self.selected_targets[0]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMarkA = t
            reward = 1
            done = True

        elif s_ in self.selected_Obstacles_position:
            reward = -0.75
            done = False

        else:
            reward = -0.04
            done = False

        return reward, done

    def calcu_next_state(self, loc, action):
        """
        calculate next state based on location and action
        """
        UNIT = self.grid_UNIT
        ss = loc
        np_s = np.array(ss)
        dissS = np.array([self.grid_origx, self.grid_origy])
        s = (np_s - dissS).tolist()
        base_action = np.array([0, 0])
        if action == 0:  # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:  # down
            if s[1] < (self.grid_rowNum - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:  # right
            if s[0] < (self.grid_columnNum - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:  # left
            if s[0] > UNIT:
                base_action[0] -= UNIT
        s_ = []
        s_ = [ss[0] + base_action[0], ss[1] + base_action[1]]
        return s_

    def new_reward(self, s_, s):
        """
        rewarding scheme for testing
        """
        if s_ == self.cv.coords(self.selected_targets[0]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMark = t
            reward = 0
        elif s_ in self.selected_Obstacles_position:
            reward = -2
        else:
            reward = 0
        return reward

    def update(self):
        """
        main function for training
        """
        self.RL = QLearningTable(actions=list(range(self.n_actions)),
                                 e_greedy=self.epsilon)
        episode = 0
        action = -1
        stepCount = 0
        total_reward_list = []
        avg_reward_list = []
        win_history = []
        observation = self.cv.coords(self.agent)
        visited = set()
        total_reward = 0
        start_time = datetime.datetime.now()
        self.labelHello = Label(self.cv,
                                text="start training!",
                                font=("Helvetica", 10),
                                width=10,
                                fg="red",
                                bg="white")
        self.labelHello.place(x=200, y=750, anchor=NW)
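        # main training loop: one pass per step; an episode ends when `done` is set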
        while True:
            self.labelHello = Label(self.cv,
                                    text="episode: %s" % str(episode),
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="blue",
                                    bg="white")
            self.labelHello.place(x=200, y=550, anchor=NW)
            self.render()
            visited.add(tuple(observation))
            stepCount += 1
            action = self.RL.choose_action(str(observation))
            observation_ = self.calcu_next_state(observation, action)
            reward, done = self.reward(observation_, observation)
            self.cv.coords(self.selected_agent[0], observation_)

            if tuple(observation_) in visited:
                reward -= 0.25
            if observation == observation_:
                reward = reward - 0.8
            if done == True:
                win_history.append(1)
            total_reward += reward
            if total_reward < -0.5 * 64:
                done = True
                win_history.append(0)
            self.RL.learn(str(observation), action, reward, str(observation_))

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                if episode > self.episode:
                    break
                else:
                    observation = self.reset()
                    dt = datetime.datetime.now() - start_time
                    t = self.format_time(dt.total_seconds())
                    total_reward_list.append(total_reward)
                    if len(total_reward_list) > 100:
                        avg_reward = sum(total_reward_list[-100:]) / 100
                        avg_reward_list.append(avg_reward)
                        template = "Episode: {:03d}/{:d} | StepCount: {:d} | Win rate: {:.3f} | Total rewards: {:.3f} | Average rewards: {:.3f} | time: {}"
                        print(
                            template.format(
                                episode, self.episode, stepCount,
                                sum(win_history) / len(win_history),
                                total_reward, avg_reward, t))
                    else:
                        template = "Episode: {:03d}/{:d} | StepCount: {:d} | Win rate: {:.3f} | Total rewards: {:.3f} | time: {}"
                        print(
                            template.format(
                                episode, self.episode, stepCount,
                                sum(win_history) / len(win_history),
                                total_reward, t))
                    episode += 1
                    stepCount = 0
                    total_reward = 0
                    visited = set()
                    done = 0

        # end of training
        print('training over!')
        self.labelHello = Label(self.cv,
                                text="training end!",
                                font=("Helvetica", 10),
                                width=10,
                                fg="red",
                                bg="white")
        self.labelHello.place(x=200, y=750, anchor=NW)
        print("total_win_rate", sum(win_history) / len(win_history))
        print("total_time", t)
        print("average rewards per episode",
              sum(total_reward_list) / len(total_reward_list))
        self.learning = False
        self.reset()
        plt.figure()
        plt.title('Rewards per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(total_reward_list)

        plt.show()

        plt.figure()
        plt.title('Average Rewards over 100 Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(avg_reward_list)

        plt.show()

    def create_items(self):
        self.AllItemsOrigPosition_list.append([0, 0, 0, 0])
        self.create_agentItems()
        self.agentItemIndex = [1, len(self.agentPosition_list)]
        self.create_targetItems()
        self.WarehouseItemIndex = [
            self.agentItemIndex[1] + 1,
            self.agentItemIndex[1] + len(self.warehousePostition_list)
        ]
        self.create_ObsItems()
        self.ObstacleItemIndex = [
            self.WarehouseItemIndex[1] + 1,
            self.WarehouseItemIndex[1] + len(self.ObstaclePosition_list)
        ]

    def create_grids(self, origx, origy, column, row, UNIT):
        # create grids
        for c in range(origx, origx + (column + 1) * UNIT, UNIT):
            x0, y0, x1, y1 = c, origy, c, origy + row * UNIT
            self.cv.create_line(x0, y0, x1, y1, width=2)
        for r in range(origy, origy + (row + 1) * UNIT, UNIT):
            x0, y0, x1, y1 = origx, r, origx + column * UNIT, r
            self.cv.create_line(x0, y0, x1, y1, width=2)

    def choose_type(self, i):
        """
        handler for the toolbar buttons
        """
        for b in self.bns:
            b['relief'] = RAISED
        self.bns[i]['relief'] = SUNKEN
        self.item_type = i
        if self.item_type == 1:
            #            start training
            self.start_learning()
            self.bns[i]['relief'] = RAISED
        elif self.item_type == 2:
            #            close simulation tool
            os._exit(0)
        elif self.item_type == 3:
            #           save q_table
            temp_s = str(self.cv.coords(self.target)) + str(
                self.selected_Obstacles_position)
            self.RL.q_table.to_csv("single_qtable_%s.csv" % temp_s,
                                   index_label="index_label")
            print("SAVED!!!")
            self.labelHello = Label(self.cv,
                                    text="table saved!!",
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="red",
                                    bg="white")
            self.labelHello.place(x=350, y=750, anchor=NW)
        elif self.item_type == 0:
            self.button_reset()
        elif self.item_type == 4:
            #            start running tests
            self.start_running()
        elif self.item_type == 5:
            self.restart()

    def start_learning(self):
        """
        initialization for training process
        """
        self.selected_agent = []
        self.selected_targets = []
        self.selected_Obstacles = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []

        for item in range(1, self.itemsNum + 1):

            p = self.cv.coords(item)

            if p[0] >= self.grid_origx and p[1] >= self.grid_origy:
                if item in range(self.agentItemIndex[0],
                                 self.agentItemIndex[1] + 1):
                    self.selected_agent.append(item)
                    self.selected_agent_position = p
                elif item in range(self.WarehouseItemIndex[0],
                                   self.WarehouseItemIndex[1] + 1):
                    self.selected_targets.append(item)
                elif item in range(self.ObstacleItemIndex[0],
                                   self.ObstacleItemIndex[1] + 1):
                    self.selected_Obstacles.append(item)
                    self.selected_Obstacles_position.append(p)

        if len(self.selected_agent) == 0 or len(self.selected_agent) > 1:
            tkinter.messagebox.showinfo(
                "INFO", "Please choose ONE agent for training!")
        elif len(self.selected_targets) == 0 or len(self.selected_targets) > 1:
            tkinter.messagebox.showinfo(
                "INFO", "Please choose ONE target for training!")
        else:
            self.agent = self.selected_agent[0]
            self.target = self.selected_targets[0]

            self.t = threading.Timer(self.timeDelay, self.update)
            self.t.start()
            self.learning = True

    def start_running(self):
        """
        initialization for testing
        """
        self.selected_agent = []
        self.selected_targets = []
        self.selected_Obstacles = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        self.selected_targets_position = []

        for item in range(1, self.itemsNum + 1):
            p = self.cv.coords(item)
            if p[0] >= self.grid_origx and p[1] >= self.grid_origy:
                if item in range(self.agentItemIndex[0],
                                 self.agentItemIndex[1] + 1):
                    self.selected_agent.append(item)
                    self.selected_agent_position = p
                elif item in range(self.WarehouseItemIndex[0],
                                   self.WarehouseItemIndex[1] + 1):
                    self.selected_targets.append(item)
                    self.selected_targets_position = p
                elif item in range(self.ObstacleItemIndex[0],
                                   self.ObstacleItemIndex[1] + 1):
                    self.selected_Obstacles.append(item)
                    self.selected_Obstacles_position.append(p)

        if len(self.selected_agent) <= 0 or len(self.selected_agent) > 1:
            tkinter.messagebox.showinfo("INFO",
                                        "Please place ONE agent on map!")
        elif len(self.selected_targets) == 0 or len(self.selected_targets) > 1:
            tkinter.messagebox.showinfo("INFO", "Please choose ONE terminal!")
        else:
            self.agent = self.selected_agent[0]
            self.target = self.selected_targets[0]

            #            load Q table
            terminal_str = str(self.selected_targets_position) + str(
                self.selected_Obstacles_position) + 'episode3000'
            self.q_table = pd.read_csv("table terminal%s.csv" % terminal_str,
                                       index_col=0)
            self.t = threading.Timer(self.timeDelay, self.run)
            self.t.start()
            self.learning = True

    def rightClick_handler(self, event):
        self.start_learning()

    def leftClick_handler(self, event):
        """
        handle left-click selection of an item on the canvas
        """

        if self.learning:
            print("Learning in progress!")
        else:
            for item in range(1, self.itemsNum + 1):
                position = self.cv.coords(item)
                R = self.grid_UNIT / 2
                p = [
                    position[0] - R, position[1] - R, position[0] + R,
                    position[1] + R
                ]
                if event.x>=p[0] and event.x<=p[2] and \
                    event.y>=p[1] and event.y<=p[3]:
                    t = item

                    self.choose_item_handler(event, t)

    def choose_item_handler(self, event, t):

        self.choose_item = t

        self.itemOrigPosition = self.cv.coords(t)

    def move(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            self.cv.coords(t, event.x, event.y)

    def adjust_items_into_grids(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            position = self.cv.coords(t)
            centerX = position[0]
            centerY = position[1]
            Grids_X0 = self.grid_origx
            Grids_X1 = self.grid_origx + (self.grid_columnNum +
                                          1) * self.grid_UNIT
            Grids_Y0 = self.grid_origy
            Grids_Y1 = self.grid_origy + (self.grid_rowNum +
                                          1) * self.grid_UNIT
            if (centerX in range(Grids_X0, Grids_X1)) and (centerY in range(
                    Grids_Y0, Grids_Y1)):
                columnIndex = math.floor((centerX - Grids_X0) / self.grid_UNIT)
                rowIndex = math.floor((centerY - Grids_Y0) / self.grid_UNIT)
                adjustedX0 = Grids_X0 + columnIndex * self.grid_UNIT + self.grid_UNIT / 2
                adjustedY0 = Grids_Y0 + rowIndex * self.grid_UNIT + self.grid_UNIT / 2
                self.cv.coords(t, adjustedX0, adjustedY0)
            else:
                #return to original position if not drag near grids
                self.cv.coords(t, self.AllItemsOrigPosition_list[t])
                self.itemOrigPosition = []

    def move_end(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            self.adjust_items_into_grids(event)
            self.choose_item = None

    def delete_item(self, event):
        if self.choose_item is not None:
            self.cv.delete(self.choose_item)
Example no. 6
    r = '50000'
    save_list = [100, 50000]
    # ,10000,50000,100000,200000,300000,400000,500000,600000,700000,800000,900000,1000000
    train = True
    env = envR(show=False)
    RL = QLearningTable(env.action_space, learning_rate=0.1)
    # step = 0
    # succ = 0
    # start = time.time()
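    # train for int(r) episodes; call test() at the checkpoint episodes listed in save_list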
    for episode in range(int(r)):
        pre_maps = env.reset()

        for i in range(100):

            action = RL.choose_action(str(pre_maps), train)

            reward, done, action_ = env.step(action)

            RL.learn(str(pre_maps), action, reward, str(env.get_maps()), done)

            pre_maps = env.get_maps()

            if done:
                break

            # step += 1
        print((episode + 1))
        if (episode + 1) in save_list:
            print("This is", episode + 1)
            test(RL)
    print('Training Over!')
Example no. 7
    def update(self):
        # TODO Start_Point & End_Point to be provided
        for i in range(166, 288):
            np.random.seed(i)
            start_point = np.random.randint(0, 800)
            end_point = np.random.randint(801, 1725)
            RL = QLearningTable(self.actions)
            env = Cross(self.next_state_list, self.action_list,
                        self.distance_list, start_point, end_point,
                        self.cross_info)
            # update block
            time_start = time.time()
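            # run 100 training episodes for this start/end pair, annealing epsilon with tools.SA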
            for episode in range(100):
                # simulated-annealing schedule for epsilon
                T = 1000
                epsilon, T = tools.SA(T, episode, 100, 0.95)
                RL.epsilon = epsilon
                if epsilon > 1:
                    print("yes")
                print(epsilon)
                episode_start_time = time.time()
                plt.ion()
                observation = env.start_point
                prior_state = observation
                while True:
                    index = RL.choose_action(observation, env, 1)

                    observation_, reward, done = env.step(
                        observation, index, prior_state)

                    # print("observation_:", observation_, "observation:", observation, "prior_state:", prior_state)

                    # plotting / visualization
                    # plt.clf()
                    # plt.scatter(self.x[start_point], self.y[start_point], marker='o', s=100, label='start_point', c='yellow')
                    # plt.scatter(self.x[end_point], self.y[end_point], marker='^', s=100, label='end_point', c='yellow')
                    # plt.scatter(self.x, self.y, s=15, alpha=0.3, c='green')
                    # if observation_ == 'end_point':
                    #     plt.scatter(self.x[end_point], self.y[end_point], s=15, c='red')
                    # elif observation_ == 'terminal':
                    #     plt.scatter(self.x[observation], self.y[observation], s=15, c='yellow')
                    # else:
                    #     plt.scatter(self.x[observation_], self.y[observation_], s=15, c='red')
                    # plt.pause(0.01)
                    # plt.ioff()

                    q_table = RL.learn(observation, index, reward,
                                       observation_, 1)
                    # print(q_table.loc[observation_])

                    prior_state = observation
                    observation = observation_
                    current_time = time.time()
                    if current_time - episode_start_time > 60:
                        break
                    if done:
                        break
                episode_end_time = time.time()
                print('==========================================')
                print(episode + 1, "th episode is completed, time cost:",
                      episode_end_time - episode_start_time)
                print('==========================================')
                print(q_table)

            time_end = time.time()
            print('training completed, total time cost:', time_end - time_start)
            if not os.path.exists(os.getcwd() + '/table_' +
                                  str(configuration.Omega)):
                os.makedirs(os.getcwd() + '/table_' + str(configuration.Omega))
            q_table.to_csv(os.getcwd() + '/table_' + str(configuration.Omega) +
                           '/' + configuration.CITY + '_' + str(start_point) +
                           '_' + str(end_point) + '_' + 'q_table.csv',
                           encoding="utf-8")
Example no. 8
        action = RL.choose_action(observation)
        # a = crossover_1(a1,b1)
        child = globals()[action](a1, b1)

        solution3[counter] = child

        observation_ = str([int(i) for i in solution3])

        reward = 0

        fitness1 = 0 - sum(function1(i) for i in solution3)
        fitness2 = 0 - sum(function2(i) for i in solution3)

        reward = fitness1 + fitness2

        RL.learn(observation, action, reward, observation_)

        counter += 1

    solution2 = solution + solution3
    '''
    generate new parents
    '''
    function1_values2 = [
        function1(solution2[i]) for i in range(0, 2 * POPULATION_SIZE)
    ]
    function2_values2 = [
        function2(solution2[i]) for i in range(0, 2 * POPULATION_SIZE)
    ]
    non_dominated_sorted_solution2 = fast_non_dominated_sort(
        function1_values2[:], function2_values2[:])
Example no. 9
    def update_realtime(self):
        # error_point = [256, 512, 768, 3, 5, 778, 138, 779, 655, 786, 789, 793, 155, 34, 675, 420, 293, 424, 169, 428, 301,
        #                173, 431, 49, 306, 182, 439, 701, 189, 65, 322, 199, 456, 457, 461, 725, 599, 345, 732, 734, 351,
        #                98, 485, 742, 104, 490, 620, 750, 240, 753, 626, 116, 380]
        # error_point = [750, 240, 189, 155, 199, 485, 306, 457, 380, 626, 116, 461]
        error_point = [
            512, 5, 138, 779, 280, 155, 34, 675, 420, 424, 301, 430, 306, 439,
            701, 189, 317, 63, 322, 199, 457, 461, 589, 725, 215, 599, 345,
            732, 351, 609, 485, 620, 240, 626, 380
        ]
        # time_start = time.time()
        error_list = []
        # TODO Start_Point & End_Point to be provided
        # delay_col = {'s_e', 'start_point', 'end_point', 'transfer', 'queue', 'process'}
        delay_df = pd.DataFrame(columns=('s_e', 'start_point', 'end_point',
                                         'transfer', 'queue', 'process'))
        # delay_df = delay_df.append({'s_e': 'TASK_SIZE:'+str(configuration.TASK_SIZE),
        #                             'start_point:': 'CPU_CLOCK'+str(configuration.CPU_CLOCK),
        #                             'end_point:': 'VEHICLE_POWER'+str(configuration.VEHICLE_POWER),
        #                             'transfer': 000,
        #                             'queue': 000,
        #                             'process': 000},
        #                            ignore_index=True)
        # x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
        cost_list = []
        # for z in range(10):
        time_start = time.time()
        count = 0
        e_count = 0
        for i in range(166, 288):
            flag = False
            # random seed, to keep the runs identical to the first training pass
            np.random.seed(i)
            start_point = np.random.randint(0, 800)
            if start_point in error_point:
                continue
            count += 1
            end_point = np.random.randint(801, 1725)
            print(start_point, '-->', end_point)

            # load the Q-table already saved locally
            df_q_table = pd.read_csv(
                os.getcwd() + '/table_' + str(self.omega) + '/' +
                configuration.CITY + '_' + str(start_point) + '_' +
                str(end_point) + '_' + 'q_table.csv',
                encoding="utf-8")
            # print(os.getcwd() + '/table_' + str(self.omega) + '/' + configuration.CITY + '_' +
            #       str(start_point) + '_' + str(end_point) + '_' + 'q_table.csv')
            df_q_table = df_q_table.set_index(['Unnamed: 0'])
            df_q_table = df_q_table[['1', '2', '3', '4']].astype(np.float64)

            RL = QLearningTable(self.actions)

            RL.gamma = configuration.VEHICLE_POWER
            # print(self.omega)
            # greedy policy set to 1
            # RL.epsilon = 0.95

            # swap in the loaded Q-table
            RL.q_table = df_q_table

            env = Cross_2th(self.next_state_list, self.action_list,
                            self.distance_list, start_point, end_point,
                            self.cross_info, self.tel_list, self.df_tel,
                            self.omega)
            # update block
            # counter for the for loop
            index_for = 0
            # delay totals accumulated in the for loop, used to compute averages
            delay_for_sum = 0
            transfer_for_sum = 0
            queue_for_sum = 0
            process_for_sum = 0
            for episode in range(10):
                # simulated-annealing schedule for epsilon
                T = 1000
                epsilon, T = tools.SA(T, episode, 10, 0.95)
                RL.epsilon = epsilon
                if epsilon > 1:
                    print("yes")
                # print(epsilon)
                one_episode_start_time = time.time()
                # plotting
                # plt.ion()
                observation = env.start_point
                prior_state = observation
                # counter for the while loop
                index_while = 0
                # delay totals accumulated in the while loop, used to compute averages
                delay_while_sum = 0
                transfer_while_sum = 0
                queue_while_sum = 0
                process_while_sum = 0
                while True:
                    index = RL.choose_action(observation, env, 2)
                    observation_, reward, done, tel_delay, transfer_time, queue_time, process_time = \
                        env.step_2th(observation, index, prior_state)

                    # print("observation_:", observation_, "observation:", observation, "prior_state:", prior_state)

                    index_while += 1
                    delay_while_sum += tel_delay
                    transfer_while_sum += transfer_time
                    queue_while_sum += queue_time
                    process_while_sum += process_time

                    # break out if stuck in a local optimum
                    current_time = time.time()
                    if current_time - one_episode_start_time > 10:
                        flag = True
                        e_count += 1
                        print('error:', start_point, 'x--x', end_point)
                        # if observation not in error_list:
                        #     error_list.append(start_point)
                        break

                    # plotting section
                    # plt.clf()
                    # plt.scatter(self.x[start_point], self.y[start_point], marker='o', s=100, label='start_point',
                    #             c='yellow')
                    # plt.scatter(self.x[end_point], self.y[end_point], marker='^', s=100, label='end_point', c='yellow')
                    # plt.scatter(self.x, self.y, s=15, alpha=0.3, c='green')
                    # if observation_ == 'end_point':
                    #     plt.scatter(self.x[end_point], self.y[end_point], s=15, c='red')
                    # elif observation_ == 'terminal':
                    #     plt.scatter(self.x[observation], self.y[observation], s=15, c='yellow')
                    # else:
                    #     plt.scatter(self.x[observation_], self.y[observation_], s=15, c='red')
                    # plt.pause(0.1)
                    # plt.ioff()
                    #
                    df_q_table = RL.learn(observation, index, reward,
                                          observation_, 2)
                    # print(q_table[
                    #       q_table.index.values.tolist().index(str(29)):q_table.index.values.tolist().index(
                    #           str(29)) + 1])
                    # print(q_table[
                    #       q_table.index.values.tolist().index(str(77)):q_table.index.values.tolist().index(
                    #           str(77)) + 1])

                    prior_state = observation
                    observation = observation_
                    current_time = time.time()
                    if done:
                        break

                delay_while_avg = delay_while_sum / index_while
                transfer_while_avg = transfer_while_sum / index_while
                queue_while_avg = queue_while_sum / index_while
                process_while_avg = process_while_sum / index_while

                index_for += 1
                delay_for_sum += delay_while_avg
                transfer_for_sum += transfer_while_avg
                queue_for_sum += queue_while_avg
                process_for_sum += process_while_avg
                one_episode_end_time = time.time()
                # print('==========================================')
                # print(episode + 1, "th episode is completed, time cost:", one_episode_end_time - one_episode_start_time)
                # print('==========================================')
                # print(q_table)
                if flag:
                    break
            delay_avg = delay_for_sum / index_for
            transfer_avg = transfer_for_sum / index_for
            queue_avg = queue_for_sum / index_for
            process_avg = process_for_sum / index_for
            # print('transfer_avg is:', transfer_avg, 'queue_avg is:', queue_avg, 'process_avg is:', process_avg)
            delay_df = delay_df.append(
                {
                    's_e': str(start_point) + '_' + str(end_point),
                    'start_point': start_point,
                    'end_point': end_point,
                    'transfer': transfer_avg,
                    'queue': queue_avg,
                    'process': process_avg
                },
                ignore_index=True)
            # print('======================================================================')
            # print(delay_df)
            dir_path = os.getcwd() + '/table_realtime_Ω_' + str(
                self.omega) + '_ts_' + str(
                    configuration.TASK_SIZE) + '_cc_' + str(
                        configuration.CPU_CLOCK) + '_vp_' + str(
                            configuration.VEHICLE_POWER)
            # print(dir_path)
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
                os.makedirs(dir_path + '/time_cost/')
            df_q_table.to_csv(dir_path + '/' + configuration.CITY + '_' +
                              str(start_point) + '_' + str(end_point) +
                              '_realtime_q_table.csv',
                              encoding="utf-8")
            delay_df.to_csv(dir_path + '/time_cost/' + 'TASK_SIZE_' +
                            str(configuration.TASK_SIZE) + '_CPU_CLOCK_' +
                            str(configuration.CPU_CLOCK) + '_VEHICLE_POWER_' +
                            str(configuration.VEHICLE_POWER) +
                            '_time_cost.csv',
                            encoding="utf-8")
        #     break out of the z loop
        #     if count - e_count == 5*(z+1):
        #         break
        time_end = time.time()
        time_cost = time_end - time_start - e_count * 10
        c_minus = count - e_count
        # cost_pre = time_cost*(round(10/(count-e_count), 3))
        print('training completed, total time cost:', time_cost)
        # print(c_minus)
        # print(cost_pre)
        print('==========================================')
        cost_list.append(time_cost)
        print(cost_list)
class App:
    def __init__(self, master):
        self.master = master

        #        grid map setting
        self.grid_origx = 500
        self.grid_origy = 20
        self.grid_columnNum = 8
        self.grid_rowNum = 8
        self.grid_UNIT = 90
        self.maze_size = self.grid_columnNum * self.grid_rowNum
        #        define total training episodes
        self.episode = 5000
        #        define number of tests to run
        self.tests = 100
        #        set a small delay (in seconds) to make sure tkinter works properly
        #        for a slower visualization during testing, set the delay to a larger value
        self.timeDelay = 0.0005

        #       other initialization
        self.n_actions = 4
        self.outline = 'black'
        self.fill = None
        self.item_type = 0
        self.learning = False
        self.itemsNum = 0
        self.epsilon = 0.9
        self.Qtable_origx = self.grid_origx + 20 + (self.grid_columnNum +
                                                    1) * self.grid_UNIT
        self.Qtable_origy = self.grid_origy
        self.grid_origx_center = self.grid_origx + self.grid_UNIT / 2
        self.grid_origy_center = self.grid_origy + self.grid_UNIT / 2
        self.grid_endx = self.grid_origx + self.grid_columnNum * self.grid_UNIT
        self.grid_endy = self.grid_origy + self.grid_rowNum * self.grid_UNIT
        self.Qtable_gridIndex_dict = {}
        self.show_q_table = pd.DataFrame(columns=list(range(self.n_actions)),
                                         dtype=np.float64)
        self.origDist = 10
        self.agentCentre = np.array([[190, 180], [290, 180], [390, 180]])
        self.warehouseCentre = self.agentCentre+np.array([[0,self.grid_UNIT+self.origDist],\
                                [0,self.grid_UNIT+self.origDist],[0,self.grid_UNIT+self.origDist]])
        self.ObstacleCentre1 = np.array([[725, 515], [725, 335], [635, 695]])

        self.ObstacleCentre2 = np.array([[905, 245], [545, 245], [995, 605]])
        self.itemOrigPosition = []
        self.agentPosition_list = []
        self.warehousePostition_list = []
        self.ObstaclePosition_list = []
        self.WarehouseItemIndex = []
        self.agentItemIndex = []
        self.ObstacleItemIndex = []
        self.AllItemsOrigPosition_list = []
        self.createMark = None
        self.points = []
        self.cars_list = []
        self.selected_agent = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        self.selected_Obstacles = []
        self.selected_targets = []
        self.agent = 1
        self.target = 4
        self.hell1 = 7
        self.hell2 = 8
        self.init_widgets()
        self.temp_item = None
        self.temp_items = []
        self.choose_item = None
        self.createMarkA = None
        self.createMarkB = None
        self.linesA = []
        self.linesB = []

    def resize(self, w, h, w_box, h_box, pil_image):
        """resize a pil_image to (w_box, h_box)"""
        return pil_image.resize((w_box, h_box), Image.ANTIALIAS)

    def init_widgets(self):
        self.cv = Canvas(root, background='white')
        self.cv.pack(fill=BOTH, expand=True)
        # bind events of dragging with mouse
        self.cv.bind('<B1-Motion>', self.move)
        self.cv.bind('<ButtonRelease-1>', self.move_end)
        self.cv.bind("<Button-1>", self.leftClick_handler)

        # bind event of right-click
        self.cv.bind("<Button-3>", self.rightClick_handler)
        f = ttk.Frame(self.master)
        f.pack(fill=X)
        self.bns = []

        # initialize buttons
        for i, lb in enumerate(
            ('Reset', 'Start training', 'Close', 'Save', 'Start Running')):
            bn = Button(f, text=lb, command=lambda i=i: self.choose_type(i))
            bn.pack(side=LEFT, ipadx=8, ipady=5, padx=5)
            self.bns.append(bn)
        self.bns[self.item_type]['relief'] = SUNKEN

        # initialize agent, warehouse and obstacle positions
        self.agentPosition_list = self.setItemsPositionList(self.agentCentre)
        self.warehousePostition_list = self.setItemsPositionList(
            self.warehouseCentre)
        self.ObstaclePosition_list1 = self.setItemsPositionList(
            self.ObstacleCentre1)
        self.ObstaclePosition_list2 = self.setItemsPositionList(
            self.ObstacleCentre2)
        self.ObstaclePosition_list = self.ObstaclePosition_list1 + self.ObstaclePosition_list2
        self.create_items()
        self.itemsNum = self.warehouseCentre.shape[
            0] + self.ObstacleCentre1.shape[0] + self.ObstacleCentre2.shape[
                0] + self.agentCentre.shape[0]
        R = self.grid_UNIT
        self.cv.create_text(self.agentCentre[0][0]-R-20,self.agentCentre[0][1],\
                            text = "Agent:",font=('Courier',18))
        self.cv.create_text(self.warehouseCentre[0][0]-R-20,self.warehouseCentre[0][1],\
                            text = "Warehouse:",font=('Couried',18))
        self.cv.create_text(self.grid_origx+250,self.grid_origy-50, text = "Single agent Q-Learning Simulation",\
                            font=('Times',38),fill = 'red')
        self.cv.create_text(self.grid_origx+252,self.grid_origy-52, text = "Single agent Q-Learning Simulation",\
                            font=('Times',38),fill = 'green')

        #draw grids
        self.create_grids(self.grid_origx, self.grid_origy,
                          self.grid_columnNum, self.grid_rowNum,
                          self.grid_UNIT)

        for i in range(0, self.grid_rowNum):
            for j in range(0, self.grid_columnNum):
                x = i * self.grid_UNIT + self.grid_origx_center
                y = j * self.grid_UNIT + self.grid_origy_center
                rowIndex = (y - self.grid_origy_center) / self.grid_UNIT
                columnIndex = (x - self.grid_origx_center) / self.grid_UNIT
                self.Qtable_gridIndex_dict[(
                    x, y)] = rowIndex * self.grid_columnNum + columnIndex

        print(self.Qtable_gridIndex_dict)

    def create_ObsItems(self):
        self.cv.arriveObsImage = []
        self.cv.bms_obs = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT

        pil_image = Image.open('obs5.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image1 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image1)

        pil_image = Image.open('obs7.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image2 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image2)

        pil_image = Image.open('obs8.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image3 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image3)

        self.cv.bms_obs.append(tk_image1)
        self.cv.bms_obs.append(tk_image2)
        self.cv.bms_obs.append(tk_image3)

        self.cv.Obstacle = []
        index = 0
        for q in self.ObstacleCentre1:
            bm = self.cv.bms_obs[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.Obstacle.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1
        for q in self.ObstacleCentre2:
            bm = self.cv.bms_obs[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.Obstacle.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1

        #arriving picture
        pil_image = Image.open('obs5_car.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.arriveObsImage.append(tk_image)

    def create_targetItems(self):
        self.cv.arriveImage = []
        self.cv.bms_wh = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT

        pil_image = Image.open('warehouse4_1.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)

        pil_image = Image.open('warehouse3.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)

        pil_image = Image.open('warehouse4_2.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)

        self.cv.warehouse = []
        index = 0
        for q in self.warehouseCentre:
            bm = self.cv.bms_wh[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.warehouse.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1

        #arriving picture
        pil_image = Image.open('warehouse3_car.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.arriveImage.append(tk_image)

    def create_agentItems(self):
        self.cv.bms = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT

        pil_image = Image.open('car9.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)

        pil_image = Image.open('car2.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)

        pil_image = Image.open('car8.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)

        self.cv.car = []
        index = 0
        for q in self.agentCentre:
            bm = self.cv.bms[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.car.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1

    def setItemsPositionList(self, itemCentre):
        npTemp = np.hstack((itemCentre, itemCentre))
        h_u = self.grid_UNIT / 2
        npHalfUnit = np.array([-h_u, -h_u, h_u, h_u])
        hs = npHalfUnit
        for diam in range(1, itemCentre.shape[0]):
            hsTemp = np.vstack((npHalfUnit, hs))
            hs = hsTemp
        return (npTemp - hs).tolist()
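        # Worked example (illustrative): with grid_UNIT = 90 the half-unit is 45, so a
        # centre such as [190, 180] from self.agentCentre becomes the 4-element box
        # [190+45, 180+45, 190-45, 180-45] = [235.0, 225.0, 145.0, 135.0]
        # (lower-right corner followed by upper-left corner), one box per row of itemCentre.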

    def button_reset(self):
        time.sleep(self.timeDelay)

        for line in self.created_line:
            self.cv.delete(line)

        self.cv.coords(self.agentA, self.selected_agent_position[0])
        self.cv.coords(self.agentB, self.selected_agent_position[1])

    def reset(self, agentIndex):
        """
        reset the agent to a random valid location
        """
        if agentIndex == 0:
            if self.linesA != []:
                for line in self.linesA:
                    self.cv.delete(line)
            if self.createMarkA is not None:
                self.cv.delete(self.createMarkA)

        if agentIndex == 1:
            if self.linesB != []:
                for line in self.linesB:
                    self.cv.delete(line)
            if self.createMarkB is not None:
                self.cv.delete(self.createMarkB)

        if agentIndex != 0 and agentIndex != 1:
            ex = Exception("agentIndex Error in reset()!")
            raise ex
        Obs_list = [[725.0, 515.0], [725.0, 335.0], [635.0, 695.0],
                    [905.0, 245.0], [545.0, 245.0], [995.0, 605.0]]
        while True:
            new_loc = [
                random.randrange(
                    self.grid_origx_center,
                    self.grid_rowNum * self.grid_UNIT + self.grid_origx_center,
                    self.grid_UNIT),
                random.randrange(
                    self.grid_origy_center,
                    self.grid_columnNum * self.grid_UNIT +
                    self.grid_origy_center, self.grid_UNIT)
            ]
            if new_loc not in Obs_list:
                break
        self.cv.coords(self.selected_agent[agentIndex], new_loc)
        coords = self.cv.coords(self.selected_agent[agentIndex])
        return coords

    def reward_a(self, s_, B_s_, s, s_B):
        """
        rewarding scheme for agentA
        """
        self.targetA = self.selected_targets[0]
        if s_ == self.cv.coords(self.selected_targets[0]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMarkA = t
            reward = 1
            done = True

        elif s_ in self.selected_Obstacles_position:
            reward = -0.75
            done = False

        elif s_ == B_s_:
            reward = -0.75
            done = False

        elif s_ == s_B and B_s_ == s:
            reward = -0.75
            done = False

        else:
            reward = -0.04
            done = False
        return reward, done
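        # Training reward scheme used by reward_a/reward_b (as read from the branches above):
        #   +1.00 and the episode ends when the agent reaches its own target cell,
        #   -0.75 for stepping into an obstacle, onto the other agent's next cell,
        #         or for a head-on swap of cells with the other agent,
        #   -0.04 as a small per-step cost for every other move.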

    def reward_b(self, s_, A_s_, s, s_A):
        """
        rewarding scheme for agentB
        """
        self.targetB = self.selected_targets[1]
        if s_ == self.cv.coords(self.selected_targets[1]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMarkB = t
            reward = 1
            done = True

        elif s_ in self.selected_Obstacles_position:
            reward = -0.75
            done = False

        elif s_ == A_s_:
            reward = -0.75
            done = False

        elif s_ == s_A and A_s_ == s:
            reward = -0.75
            done = False
        else:
            reward = -0.04
            done = False
        return reward, done

    def real_step(self, A_s_, B_s_):
        self.cv.coords(self.selected_agent[0], A_s_)  # move agent
        self.cv.coords(self.selected_agent[1], B_s_)  # move agent
        return

    # This is a small utility for printing readable time strings:
    def format_time(self, seconds):
        if seconds < 400:
            s = float(seconds)
            return "%.1f seconds" % (s, )
        elif seconds < 4000:
            m = seconds / 60.0
            return "%.2f minutes" % (m, )
        else:
            h = seconds / 3600.0
            return "%.2f hours" % (h, )

    def update(self):
        """
        main function for training
        """
        self.RL_A = QLearningTable(actions=list(range(self.n_actions)),
                                   e_greedy=self.epsilon)
        self.RL_B = QLearningTable(actions=list(range(self.n_actions)),
                                   e_greedy=self.epsilon)
        episodeA = 0
        episodeB = 0
        action_B = -1
        action_A = -1
        UNIT = self.grid_UNIT
        stepCountA = 0
        stepCountB = 0
        total_reward_listA = []
        total_reward_listB = []
        avg_reward_listA = []
        avg_reward_listB = []
        win_historyA = []  # history of win/lose game
        win_historyB = []  # history of win/lose game
        # initial observation
        observation_A = self.cv.coords(self.agentA)
        observation_B = self.cv.coords(self.agentB)
        visitedA = set()
        visitedB = set()
        total_rewardA = 0
        total_rewardB = 0
        start_time = datetime.datetime.now()
        while True:

            self.labelHello = Label(self.cv,
                                    text="episodeA: %s" % str(episodeA),
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="red",
                                    bg="white")
            self.labelHello.place(x=200, y=550, anchor=NW)
            self.labelHello = Label(self.cv,
                                    text="episodeB: %s" % str(episodeB),
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="blue",
                                    bg="white")
            self.labelHello.place(x=200, y=580, anchor=NW)
            # fresh env
            self.render()
            visitedA.add(tuple(observation_A))
            visitedB.add(tuple(observation_B))
            stepCountA += 1
            stepCountB += 1

            distance = (observation_A[0] - observation_B[0])**2 + (
                observation_A[1] - observation_B[1])**2
            if distance == UNIT**2 or distance == 2 * (UNIT**2):

                observation_A_new = []
                observation_A_new.append(action_B)
                observation_A_new.append(observation_B)
                observation_A_new.append(observation_A)

                observation_B_new = []
                observation_B_new.append(action_A)
                observation_B_new.append(observation_A)
                observation_B_new.append(observation_B)

            else:
                observation_A_new = observation_A
                observation_B_new = observation_B
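            # State encoding note (as implemented above): when the two agents are within
            # one cell of each other (squared distance of UNIT^2 or 2*UNIT^2, i.e. adjacent
            # orthogonally or diagonally), each agent's observation is augmented to
            # [other agent's last action, other agent's position, own position]; otherwise
            # the observation is just the agent's own canvas coordinates.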

            action_A = self.RL_A.choose_action(str(observation_A_new))
            action_B = self.RL_B.choose_action(str(observation_B_new))

            A_observation_ = self.calcu_next_state(observation_A, action_A)
            B_observation_ = self.calcu_next_state(observation_B, action_B)
            reward_A, done_A = self.reward_a(A_observation_, B_observation_,
                                             observation_A, observation_B)
            reward_B, done_B = self.reward_b(B_observation_, A_observation_,
                                             observation_B, observation_A)
            self.real_step(A_observation_, B_observation_)

            if tuple(A_observation_) in visitedA:
                reward_A -= 0.25
            if tuple(B_observation_) in visitedB:
                reward_B -= 0.25
            if observation_A == A_observation_:
                reward_A = reward_A - 0.8
            if observation_B == B_observation_:
                reward_B = reward_B - 0.8
            if done_A:
                win_historyA.append(1)
            if done_B:
                win_historyB.append(1)

            total_rewardA += reward_A
            if total_rewardA < -0.5 * self.maze_size:
                done_A = True
                win_historyA.append(0)

            total_rewardB += reward_B
            if total_rewardB < -0.5 * self.maze_size:
                done_B = True
                win_historyB.append(0)

            distance = (A_observation_[0] - B_observation_[0])**2 + (
                A_observation_[1] - B_observation_[1])**2
            if distance == UNIT**2 or distance == 2 * (UNIT**2):

                observation_A_new_ = []
                observation_A_new_.append(action_B)
                observation_A_new_.append(observation_B)
                observation_A_new_.append(A_observation_)

                observation_B_new_ = []
                observation_B_new_.append(action_A)
                observation_B_new_.append(observation_A)
                observation_B_new_.append(B_observation_)

            else:
                observation_A_new_ = A_observation_
                observation_B_new_ = B_observation_

            self.RL_A.learn(str(observation_A_new), action_A, reward_A,
                            str(observation_A_new_))
            self.RL_B.learn(str(observation_B_new), action_B, reward_B,
                            str(observation_B_new_))
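            # Each learn() call is assumed to apply the standard tabular Q-learning update
            # (as in the QLearningTable used throughout these examples):
            #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
            # with s and s' being the stringified observations passed in above.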

            observation_A = A_observation_
            observation_B = B_observation_

            # break while loop when end of this episode
            if done_A:
                if episodeA > self.episode and episodeB > self.episode:
                    break
                else:

                    observation_A = self.reset(0)
                    dt = datetime.datetime.now() - start_time
                    t = self.format_time(dt.total_seconds())
                    total_reward_listA.append(total_rewardA)
                    if len(total_reward_listA) > 100:
                        avg_rewardA = sum(total_reward_listA[-100:]) / 100
                        avg_reward_listA.append(avg_rewardA)
                        template = "Episode(A): {:03d}/{:d} | StepCount: {:d} | Win rate: {:.3f} | Total rewards: {:.3f} | Average rewards: {:.3f} | time: {}"
                        print(
                            template.format(
                                episodeA, self.episode, stepCountA,
                                sum(win_historyA) / len(win_historyA),
                                total_rewardA, avg_rewardA, t))
                    else:
                        template = "Episode(A): {:03d}/{:d} | StepCount: {:d} | Win rate: {:.3f} | Total rewards: {:.3f} | time: {}"
                        print(
                            template.format(
                                episodeA, self.episode, stepCountA,
                                sum(win_historyA) / len(win_historyA),
                                total_rewardA, t))
                    episodeA += 1
                    stepCountA = 0
                    total_rewardA = 0
                    visitedA = set()
                    done_A = 0

            if done_B:
                if episodeA > self.episode and episodeB > self.episode:
                    break
                else:
                    observation_B = self.reset(1)
                    dt = datetime.datetime.now() - start_time
                    t = self.format_time(dt.total_seconds())
                    total_reward_listB.append(total_rewardB)
                    if len(total_reward_listB) > 100:
                        avg_rewardB = sum(total_reward_listB[-100:]) / 100
                        avg_reward_listB.append(avg_rewardB)
                        template = "Episode(B): {:03d}/{:d} | StepCount: {:d} | Win rate: {:.3f} | Total rewards: {:.3f} | Average rewards: {:.3f} | time: {}"
                        print(
                            template.format(
                                episodeB, self.episode, stepCountB,
                                sum(win_historyB) / len(win_historyB),
                                total_rewardB, avg_rewardB, t))
                    else:
                        template = "Episode(B): {:03d}/{:d} | StepCount: {:d} | Win rate: {:.3f} | Total rewards: {:.3f} | time: {}"
                        print(
                            template.format(
                                episodeB, self.episode, stepCountB,
                                sum(win_historyB) / len(win_historyB),
                                total_rewardB, t))
                    episodeB += 1
                    stepCountB = 0
                    total_rewardB = 0
                    visitedB = set()
                    done_B = 0

        # end of game
        print('game over')
        self.learning = False
        self.reset(0)
        self.reset(1)

        print("total_time", t)
        print("total_win_rate_A", sum(win_historyA) / len(win_historyA))
        print("average rewards per episode_A",
              sum(total_reward_listA) / len(total_reward_listA))
        print("total_win_rate_B", sum(win_historyB) / len(win_historyB))
        print("average rewards per episode_B",
              sum(total_reward_listB) / len(total_reward_listB))
        plt.figure()
        plt.title('Rewards per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(total_reward_listA, label='agentA')
        plt.plot(total_reward_listB, label='agentB')
        plt.legend(loc='upper right')
        plt.show()

        plt.figure()
        plt.title('Average Rewards over 100 Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(avg_reward_listA, label='agentA')
        plt.plot(avg_reward_listB, label='agentB')
        plt.legend(loc='upper right')
        plt.show()

    def choose_best_action(self, state, terminal):
        if terminal == self.cv.coords(self.targetA):
            q_table = self.q_tableA

        if terminal == self.cv.coords(self.targetB):

            q_table = self.q_tableB

        state_action = q_table.loc[state]

        action = np.random.choice(
            state_action[state_action == np.max(state_action)].index)
        return int(action)
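        # Greedy action selection: the saved q_table row for `state` is scanned and,
        # if several actions share the maximal Q-value, np.random.choice breaks the tie
        # uniformly at random among them.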

    def new_reward_a(self, s_, B_s_, s, s_B):
        # reward function
        self.targetA = self.selected_targets[0]
        if s_ == self.cv.coords(self.selected_targets[0]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMarkA = t
            reward = 0

        elif s_ in self.selected_Obstacles_position:
            reward = -2

        elif s_ == B_s_:
            reward = -2

        elif s_ == s_B and B_s_ == s:
            reward = -2

        else:
            reward = 0

        return reward

    def new_reward_b(self, s_, A_s_, s, s_A):
        self.targetB = self.selected_targets[1]
        if s_ == self.cv.coords(self.selected_targets[1]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMarkB = t
            reward = 0

        elif s_ in self.selected_Obstacles_position:
            reward = -2

        elif s_ == A_s_:
            reward = -2

        elif s_ == s_A and A_s_ == s:
            reward = -2
        else:
            reward = 0
        return reward
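    # Test-time reward scheme (new_reward_a/new_reward_b above): reaching the target or
    # making an ordinary move costs 0, while obstacles, collisions and head-on swaps cost -2;
    # run() additionally subtracts 0.5 for revisited cells and aborts an agent's test
    # episode once its accumulated reward drops below -1.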

    def run(self):
        """
        main function for running tests
        """
        test = 0
        rewardsA = []
        rewardsB = []
        action_B = -1
        action_A = -1
        UNIT = self.grid_UNIT
        observation_A = self.cv.coords(self.agentA)
        observation_B = self.cv.coords(self.agentB)
        doneA = 0
        doneB = 0
        total_rewardA = 0
        total_rewardB = 0
        win_countA = 0
        win_countB = 0
        terminal_A = self.cv.coords(self.targetA)
        terminal_B = self.cv.coords(self.targetB)
        visitedA = [observation_A]
        visitedB = [observation_B]
        enhance_list = []
        win_listA = []
        win_listB = []
        while True:
            if self.cv.coords(self.agentA) == self.cv.coords(self.targetA):
                doneA = 1
            if self.cv.coords(self.agentB) == self.cv.coords(self.targetB):
                doneB = 1
            self.labelHello = Label(self.cv,
                                    text="Test:%s" % str(test),
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="blue",
                                    bg="white")
            self.labelHello.place(x=self.agentCentre[0][0] - 150,
                                  y=self.agentCentre[0][1] + 500,
                                  anchor=NW)
            time.sleep(self.timeDelay)
            distance = (observation_A[0] - observation_B[0])**2 + (
                observation_A[1] - observation_B[1])**2
            if distance == UNIT**2 or distance == 2 * (UNIT**2):
                if action_A == -1:
                    observation_B_new = observation_B
                else:
                    observation_B_new = []
                    observation_B_new.append(action_A)
                    observation_B_new.append(observation_A)
                    observation_B_new.append(observation_B)
                if action_B == -1:
                    observation_A_new = observation_A
                else:
                    observation_A_new = []
                    observation_A_new.append(action_B)
                    observation_A_new.append(observation_B)
                    observation_A_new.append(observation_A)
            else:
                observation_B_new = observation_B
                observation_A_new = observation_A

            if doneA != 1:
                try:
                    action_A = self.choose_best_action(str(observation_A_new),
                                                       terminal_A)
                    A_observation_ = self.calcu_next_state(
                        observation_A, action_A)
                    if A_observation_ == self.cv.coords(self.targetA):
                        win_listA.append(1)
                except KeyError:
                    doneA = 1
                    self.reset(0)
                    pass
            if doneB != 1:
                try:
                    action_B = self.choose_best_action(str(observation_B_new),
                                                       terminal_B)
                    B_observation_ = self.calcu_next_state(
                        observation_B, action_B)
                    if B_observation_ == self.cv.coords(self.targetB):
                        win_listB.append(1)
                except KeyError:
                    doneB = 1
                    self.reset(1)
                    pass

            reward_A = self.new_reward_a(A_observation_, B_observation_,
                                         observation_A, observation_B)
            reward_B = self.new_reward_b(B_observation_, A_observation_,
                                         observation_B, observation_A)

            if B_observation_ in visitedB:
                reward_B -= 0.5
            else:
                visitedB.append(B_observation_)
            if A_observation_ in visitedA:
                reward_A -= 0.5
            else:
                visitedA.append(A_observation_)

            if doneA:
                A_observation_ = self.cv.coords(self.targetA)
            if doneB:
                B_observation_ = self.cv.coords(self.targetB)
            self.real_step(A_observation_, B_observation_)
            total_rewardA += reward_A
            total_rewardB += reward_B

            if total_rewardA < -1:
                doneA = 1
                enhance_list.append(visitedA[0])
                win_countA += 1
            if total_rewardB < -1:
                doneB = 1
                enhance_list.append(visitedB[0])
                win_countB += 1
            if doneA != 1:
                lineA = self.cv.create_line(
                    observation_A[0],
                    observation_A[1],
                    A_observation_[0],
                    A_observation_[1],
                    fill='red',
                    arrow=LAST,
                    arrowshape=(10, 20, 8),
                    dash=(4, 4)  # dashed red trace of agent A's path
                )
                self.linesA.append(lineA)
            if doneB != 1:
                lineB = self.cv.create_line(
                    observation_B[0],
                    observation_B[1],
                    B_observation_[0],
                    B_observation_[1],
                    fill='blue',
                    arrow=LAST,
                    arrowshape=(10, 20, 8),
                    dash=(4, 4)  # dashed blue trace of agent B's path
                )
                self.linesB.append(lineB)
            observation_A = A_observation_
            observation_B = B_observation_
            if doneA:
                action_A = -1
                visitedA = []
            if doneB:
                action_B = -1
                visitedB = []
            if doneA and doneB:
                total_rewardA += 1
                total_rewardB += 1
                rewardsA.append(total_rewardA)
                rewardsB.append(total_rewardB)
                total_rewardA = 0
                total_rewardB = 0
                self.reset(0)
                self.reset(1)
                doneA = 0
                doneB = 0
                observation_A = self.cv.coords(self.agentA)
                observation_B = self.cv.coords(self.agentB)
                test += 1
            if test > self.tests:
                break
        plt.figure()
        plt.title('Score per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Score')
        plt.plot(rewardsA, label='agentA')
        plt.plot(rewardsB, label='agentB')
        plt.legend(loc='upper right')
        #        print(rewardsA)
        #        print(rewardsB)
        plt.show()
        print("win_countA:", sum(win_listA))
        print("win_countB:", sum(win_listB))

    def start_learning(self):
        self.selected_agent = []
        self.selected_targets = []
        self.selected_Obstacles = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []

        for item in range(1, self.itemsNum + 1):
            p = self.cv.coords(item)
            if p[0]>=self.grid_origx and p[1]>=self.grid_origy \
                and p[0]<=self.grid_endx and p[1]<=self.grid_endy:
                if item in range(self.agentItemIndex[0],
                                 self.agentItemIndex[1] + 1):
                    self.selected_agent.append(item)
                    self.selected_agent_position.append(p)
                elif item in range(self.WarehouseItemIndex[0],
                                   self.WarehouseItemIndex[1] + 1):
                    self.selected_targets.append(item)
                elif item in range(self.ObstacleItemIndex[0],
                                   self.ObstacleItemIndex[1] + 1):
                    self.selected_Obstacles.append(item)
                    self.selected_Obstacles_position.append(p)

        if len(self.selected_agent) != 2:
            tkinter.messagebox.showinfo(
                "INFO", "Please choose TWO agents for training!")
        elif len(self.selected_targets) != 2:
            tkinter.messagebox.showinfo(
                "INFO", "Please choose TWO targets for training!")
        else:
            # only index into the selections after they have been validated
            self.agentA = self.selected_agent[0]
            self.agentB = self.selected_agent[1]
            self.targetA = self.selected_targets[0]
            self.targetB = self.selected_targets[1]
            self.t = threading.Timer(self.timeDelay, self.update)
            self.t.start()
            self.learning = True

    def restart(self):
        self.cv.coords(self.agentA, self.agentCentre[0])
        self.cv.coords(self.agentB, self.agentCentre[1])

    def calcu_next_state(self, loc, action):
        """
        calculate next state based on location and action
        """
        UNIT = self.grid_UNIT
        ss = loc
        np_s = np.array(ss)
        dissS = np.array([self.grid_origx, self.grid_origy])
        s = (np_s - dissS).tolist()
        base_action = np.array([0, 0])
        if action == 0:  # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:  # down
            if s[1] < (self.grid_rowNum - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:  # right
            if s[0] < (self.grid_columnNum - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:  # left
            if s[0] > UNIT:
                base_action[0] -= UNIT
        s_ = []
        s_ = [ss[0] + base_action[0], ss[1] + base_action[1]]
        return s_
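        # Worked example (using the grid values from __init__): for the top-left cell
        # centre [545, 65], s = [45, 45]; action 1 (down) is allowed and returns
        # [545, 155], while action 0 (up) is blocked at the border and returns
        # [545, 65] unchanged.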

    def render(self):
        time.sleep(self.timeDelay)

    def create_items(self):
        self.AllItemsOrigPosition_list.append([0, 0, 0, 0])
        self.create_agentItems()
        self.agentItemIndex = [1, len(self.agentPosition_list)]
        self.create_targetItems()
        self.WarehouseItemIndex = [
            self.agentItemIndex[1] + 1,
            self.agentItemIndex[1] + len(self.warehousePostition_list)
        ]
        self.create_ObsItems()
        self.ObstacleItemIndex = [
            self.WarehouseItemIndex[1] + 1,
            self.WarehouseItemIndex[1] + len(self.ObstaclePosition_list)
        ]

    def create_grids(self, origx, origy, column, row, UNIT):
        # draw the grid: vertical lines, then horizontal lines
        for c in range(origx, origx + (column + 1) * UNIT, UNIT):
            x0, y0, x1, y1 = c, origy, c, origy + row * UNIT
            self.cv.create_line(x0, y0, x1, y1, width=2)
        for r in range(origy, origy + (row + 1) * UNIT, UNIT):
            x0, y0, x1, y1 = origx, r, origx + column * UNIT, r
            self.cv.create_line(x0, y0, x1, y1, width=2)

    def choose_type(self, i):
        """
        function of clicking different button
        """
        for b in self.bns:
            b['relief'] = RAISED
        self.bns[i]['relief'] = SUNKEN
        self.item_type = i
        if self.item_type == 1:
            #            start training
            self.start_learning()
            self.bns[i]['relief'] = RAISED
        elif self.item_type == 2:
            #            close simulation tool
            os._exit(0)
        elif self.item_type == 3:
            #           save q_table
            temp_s = str(self.cv.coords(self.target)) + str(
                self.selected_Obstacles_position)
            self.RL.q_table.to_csv("single_qtable_%s.csv" % temp_s,
                                   index_label="index_label")
            print("SAVED!!!")
            self.labelHello = Label(self.cv,
                                    text="table saved!!",
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="red",
                                    bg="white")
            self.labelHello.place(x=350, y=750, anchor=NW)
        elif self.item_type == 0:
            self.button_reset()
        elif self.item_type == 4:
            #            start running tests
            self.start_running()
        elif self.item_type == 5:
            self.restart()

    def start_running(self):
        self.selected_agent = []
        self.selected_targets = []
        self.selected_Obstacles = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        self.task_list = []
        self.task_num_list = []
        for item in range(1, self.itemsNum + 1):
            p = self.cv.coords(item)
            if p[0] >= self.grid_origx and p[1] >= self.grid_origy:
                if item in range(self.agentItemIndex[0],
                                 self.agentItemIndex[1] + 1):
                    self.selected_agent.append(item)
                    self.selected_agent_position.append(p)
                elif item in range(self.WarehouseItemIndex[0],
                                   self.WarehouseItemIndex[1] + 1):
                    self.selected_targets.append(item)
                elif item in range(self.ObstacleItemIndex[0],
                                   self.ObstacleItemIndex[1] + 1):
                    self.selected_Obstacles.append(item)
                    self.selected_Obstacles_position.append(p)

        if len(self.selected_agent) != 2:
            tkinter.messagebox.showinfo("INFO", "Please place TWO agents on the map!")
        elif len(self.selected_targets) != 2:
            tkinter.messagebox.showinfo("INFO", "Please choose TWO terminals!")
        else:
            self.agentA = self.selected_agent[0]
            self.agentB = self.selected_agent[1]
            self.targetA = self.selected_targets[0]
            self.targetB = self.selected_targets[1]
            terminalA = self.cv.coords(self.targetA)
            terminalB = self.cv.coords(self.targetB)
            terminal_strA = str(terminalA) + str(self.episode)
            terminal_strB = str(terminalB) + str(self.episode)
            self.task_list = []
            self.q_tableA = pd.read_csv("table terminal%s.csv" % terminal_strA,
                                        index_col=0)
            self.q_tableB = pd.read_csv("table terminal%s.csv" % terminal_strB,
                                        index_col=0)
            self.labelHello = Label(self.cv,
                                    text="start running!!",
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="red",
                                    bg="white")
            self.labelHello.place(x=250, y=750, anchor=NW)
            self.t = threading.Timer(self.timeDelay, self.run)
            self.t.start()
            self.learning = True

    def rightClick_handler(self, event):
        self.start_learning()

    def leftClick_handler(self, event):
        if self.learning:
            print("Learing on going!")
        else:
            for item in range(1, self.itemsNum + 1):
                position = self.cv.coords(item)
                R = self.grid_UNIT / 2
                p = [
                    position[0] - R, position[1] - R, position[0] + R,
                    position[1] + R
                ]
                if event.x>=p[0] and event.x<=p[2] and \
                    event.y>=p[1] and event.y<=p[3]:
                    t = item
                    self.choose_item_handler(event, t)

    def choose_item_handler(self, event, t):
        self.choose_item = t
        self.itemOrigPosition = self.cv.coords(t)

    def move(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            self.cv.coords(t, event.x, event.y)

    def adjust_items_into_grids(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            position = self.cv.coords(t)
            centerX = position[0]
            centerY = position[1]
            Grids_X0 = self.grid_origx
            Grids_X1 = self.grid_origx + (self.grid_columnNum +
                                          1) * self.grid_UNIT
            Grids_Y0 = self.grid_origy
            Grids_Y1 = self.grid_origy + (self.grid_rowNum +
                                          1) * self.grid_UNIT
            if (centerX in range(Grids_X0, Grids_X1)) and (centerY in range(
                    Grids_Y0, Grids_Y1)):
                columnIndex = math.floor((centerX - Grids_X0) / self.grid_UNIT)
                rowIndex = math.floor((centerY - Grids_Y0) / self.grid_UNIT)
                adjustedX0 = Grids_X0 + columnIndex * self.grid_UNIT + self.grid_UNIT / 2
                adjustedY0 = Grids_Y0 + rowIndex * self.grid_UNIT + self.grid_UNIT / 2
                self.cv.coords(t, adjustedX0, adjustedY0)
            else:
                self.cv.coords(t, self.AllItemsOrigPosition_list[t])
                self.itemOrigPosition = []

    def move_end(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            self.adjust_items_into_grids(event)
            self.choose_item = None

    def delete_item(self, event):
        # if an item is currently selected, delete it from the canvas
        if self.choose_item is not None:
            self.cv.delete(self.choose_item)