Example 1
def main():
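    # Train a DQN agent to match ride requests in the RideHitch environment over 100 episodes.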
    env = RideHitch("data/norm1000.txt")
    print(env.requests_list)
    RL = DeepQNetwork(env.pool_size,
                      env.state_num,
                      learning_rate=0.01,
                      reward_decay=0.99,
                      e_greedy=1,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=False,
                      T=env.T_threshold,
                      D=env.D_threshold)
    step = 0
    matched_list = []
    for episode in range(100):
        # init
        observation = env.reset(reset_seq=False)
        # if episode % 100 == 0:
        #     print(episode)
        matched = 0
        print("seq size:", env.request_num, "pool size:", env.pool_size)
        while True:
            action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            if reward > 0:
                matched += 1
            RL.store_transition(observation, action, reward, observation_)
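            # warm-up: start learning after 200 environment steps, then learn every 5 steps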
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            observation = observation_
            if done:
                break
            step += 1
        matched_list.append(matched)
        print("eps", episode, "matching", matched)
    # print(matched_list)
    RL.plot_cost()
Example 2
        value = 0

        for step in range(STEP):
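            # act, compute the reward from the resulting state change, and store the transition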
            state = env.state.copy()
            action = RL.choose_action(state)
            env.step(action_space[action])
            state_ = env.state.copy()
            reward, dist = compute_reward(state, state_)

            RL.store_transition(state, action, reward, state_)
            value += reward
            if dist < DIST:
                break

            if RL.memory_counter > MEMORYCAPACITY:
                RL.learn()

        if (episode + 1) % 100 == 0:
            env.display2D()
            print(episode + 1)
            print(value)
            if (dist < DIST) and (dist > 0):
                print("Got Target")
            if dist < 0:
                print("Got Obstacle")
            if dist > DIST:
                print("Failed Target")
            print('*' * 40)

        if (episode + 1) % 10000 == 0:
            RL.save_model()
Example 3
            if i >= 1:
                RL_farm.store_transition(observation_1_last, action1,
                                         reward_stage1_old, observation_1_new)
                RL_server[farm_id].store_transition(observation_2_last,
                                                    action2, reward_stage2_old,
                                                    observation_2_new)

            observation_1_last = observation_1_new
            observation_2_last = observation_2_new
            reward_stage1_old = reward_stage1_new
            reward_stage2_old = reward_stage2_new

            RL_server_n[server_id] += 1
            last_input_time = curr_task.input_time

            if (i > 200) and (i % 5 == 0):
                RL_farm.learn()

            # per-server learning on the same warm-up/interval schedule as the farm-level agent above
            if (RL_server_n[server_id] > 200) and (RL_server_n[server_id] % 5 == 0):
                RL_server[server_id].learn()

            if drop:
                print("task " + str(i) + " drop")
            else:
                print("task " + str(i) + ": farm " + str(farm_id) +
                      " server: " + str(server_id))
            print("----------------------")
            if i == 999:
                print("total_cost")
                print(total_cost.total_price(it))
Example 4
                  child2=child2,
                  child3=child3)
kind = 3
DQN = DeepQNetwork(double_q=False, dueling_q=False, env=str(envs))
steps = 0
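# 30 independent runs: each run resets the environment, logs per-step data to CSV, and saves the model and graph outputs at the end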
for i in range(30):
    start = time.time()
    support.create_csv(Env.save_title, kind=kind, i=i + 1)
    s = Env.reset()
    while not Env.done:
        action = DQN.choose_actions(s)
        s_, r, done, advise = Env.step(action)
        DQN.store_transition(s, action, r, s_)
        s = s_
        if steps > 100:
            DQN.learn()
        Env.loss = DQN.cost
        steps += 1
        Env.steps += 1
        if Env.steps != 0:
            support.save_data2csv(Env.save_data, kind=kind, i=i + 1)
        if Env.steps >= max_steps:
            break
    support.save_data2csv_end(kind=kind, i=i + 1)
    support.save_fig(kind=kind, i=i + 1)
    DQN.store_results(
        support.root + support.child1[0] + support.child2[kind] +
        support.child3[1] + str(i + 1) + "th/" + "models", i + 1)
    DQN.store_graph(support.root + support.child1[0] + support.child2[kind] +
                    support.child3[2] + str(i + 1) + "th/")
    print(
Example 5
class LTDQN(Approach):
    def __init__(self,
                 budget,
                 times,
                 users,
                 n_scope,
                 r_interval=0.01,
                 isTrain=True):
        Approach.__init__(self, budget, times, users)
        self.n_scope = n_scope
        self.state_dim = 8
        self.action_dim = 9
        self.r_interval = r_interval
        if isTrain:
            self.dqn = DeepQNetwork(self.action_dim, self.state_dim)
        else:
            self.dqn = DeepQNetwork(self.action_dim,
                                    self.state_dim,
                                    e_greedy_increment=None)

    def generate_reward(self, action, user):
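        # Action encoding (mirrors the branches below):
        #   1/2: raise/lower default_single_r by r_interval, clamped to [0, 1]
        #   3/4: raise/lower default_num by 1, clamped to [1, n_scope]
        #   5-8: the four combinations of adjusting both values at once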
        if action == 1:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
        elif action == 2:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
        elif action == 3:
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 4:
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1
        elif action == 5:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 6:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1
        elif action == 7:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 8:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1

    def simulate(self):
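        # Evaluation pass: load the trained model and roll the policy out; transition storage and learning stay disabled (commented out below).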
        self.dqn.load()
        for ep in range(1):
            # self.users = self.init_users_list()
            total_benefits = 0.
            total_expense = 0.
            for time in range(self.times):
                total_affected_num = 0
                total_req_num = 0.
                for user in self.users:
                    if user.finished == 0:
                        if self.budget > 0:
                            output = self.dqn.choose_action(user.state)
                            self.generate_reward(output, user)
                            # if user.default_single_r >= 0.5:
                            user.receive_offer(user.default_single_r,
                                               user.default_num, output)
                            # else:
                            #     user.receive_offer(0, user.default_num, output)
                            self.budget -= user.r
                        else:
                            user.receive_offer(0., 0, 0)

                    total_req_num += user.req_num
                    action, benefits, reward, done = user.choose_action()

                    if done:
                        if user.finished == 0:
                            self.budget += user.r
                        user.reset_status()
                        # self.dqn.store_transition(user.state, action, reward, user.state_)
                        user.state = user.state_.copy()
                        # self.dqn.learn()
                    if user.action == len(user.preference) - 1:
                        total_affected_num += 1
                    if benefits > 0:
                        total_benefits += benefits
                    total_expense += benefits / (1. - benefits + 0.001)
                if (time + 1) % self.interval == 0:
                    self.affected_users_num.append(total_affected_num)
                    self.total_benefits.append(total_benefits)
                    self.average_req_num.append(total_req_num /
                                                len(self.users))
                    self.ratio.append(total_expense)
                print(
                    "\rEpisode: %d, Time step: %d, Budget: %f, Benefits: %f" %
                    (ep, time, self.budget, total_benefits),
                    end=' ')
            print()

    def init_users_list(self):
        user_list = []
        arr = np.loadtxt('../dataset/test.txt', delimiter=' ')
        # arr = arr[0:2000, :]  # train
        # print(arr)
        total_cost = 0.
        for row in range(arr.shape[0]):
            data = arr[row, :]
            # print(data[0])
            user = User(row, float(data[0]), data[1:])
            total_cost += user.preference[np.argmax(
                user.preference)] - user.preference[-1]
            user_list.append(user)
        print(len(user_list), total_cost)
        return user_list

    def train(self):
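        # Training: 500 episodes over the full user list with a fresh budget each episode; a transition is stored and learned whenever a user's offer episode terminates.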
        for ep in range(500):
            self.budget = 50000
            self.users = self.init_users_list()
            # self.users = self.init_users_list()
            self.dqn.epsilon = 0
            total_benefits = 0.
            for time in range(self.times):
                total_affected_num = 0
                total_req_num = 0.
                for user in self.users:
                    if user.finished == 0:
                        if self.budget > 0:
                            output = self.dqn.choose_action(user.state)
                            self.generate_reward(output, user)
                            user.receive_offer(user.default_single_r,
                                               user.default_num, output)
                            self.budget -= user.r
                        else:
                            user.receive_offer(0., 0, 0)

                    total_req_num += user.req_num
                    action, benefits, reward, done = user.choose_action()

                    if done:
                        if user.finished == 0:
                            self.budget += user.r
                        user.reset_status()
                        # print(user.state_, user.state)
                        self.dqn.store_transition(user.state, action, reward,
                                                  user.state_)
                        user.state = user.state_.copy()
                        self.dqn.learn()

                    if user.action == len(user.preference) - 1:
                        total_affected_num += 1
                    if benefits > 0:
                        total_benefits += benefits
                if (time + 1) % self.interval == 0:
                    self.affected_users_num.append(total_affected_num)
                    self.total_benefits.append(total_benefits)
                    self.average_req_num.append(total_req_num /
                                                len(self.users))
                print(
                    "\rEpisode: %d, Time step: %d, Budget: %f, Benefits: %f" %
                    (ep, time, self.budget, total_benefits),
                    end=' ')
                if self.budget <= 0:
                    break
            print()
            self.dqn.save()
Example 6
class view(tkinter.Tk):
    def __init__(self):
        self.gameStart = False
        self.status = False
        self.reward = 0
        super(view, self).__init__()
        self.n_actions = 361    # number of possible actions (one per board point)
        self.n_features = 361
        self.doneList = []
        self.allphoto = []
        self.initView()
        self.env = env()
        self.wobservation = None
        self.wobservation_ = None
        self.action1 = None
        self.RL = DeepQNetwork(self.n_actions, self.n_features)

    def callback(self,event):
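        # Double-click handler: the human plays Black, then the DQN replies as White; both transitions feed the replay memory.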
        if self.gameStart:
            mouse_x = event.x
            mouse_y = event.y
            if 590 > mouse_x > 20 and 590 > mouse_y > 20:
                # a is the column index, b is the row index
                a = round((mouse_x - 40) / 30)
                b = round((mouse_y - 40) / 30)
                action = b * 19 + a
                # self.env.qipan[b, a] = 2, i.e. the human (non-computer) side
                observation = self.getdouble(np.reshape(np.copy(self.env.qipan), [1, space]))
                bobservation = self.transfore(observation)
                qipan, observation_, reward, done = self.step(action, 'Black')
                bobservation_ = self.transfore(observation_)
                print('reward for the human move: %d' % reward)
                self.RL.store_transition(bobservation, action, reward * 1.5, bobservation_)  # assume the human's move is near-optimal, so its reward is weighted up
                if done:
                    tkinter.messagebox.showinfo(title='Notice', message='You win!')
                    self.RL.learn(flag=2)
                    self.RL.saveavarriable()
                    self.RL.plot_cost()
                    self.gameStart = False
                # self.status = True
                # let the computer choose its action
                self.bqipan = np.copy(self.env.qipan)
                wobservation = self.getdouble(np.reshape(self.bqipan, [1, space]))
                action1 = self.RL.choose_action(self.bqipan, wobservation)  # the computer picks its next move
                bqipan_, wobservation_, reward, done = self.step(action1, 'White')
                print('reward for the computer move: %d' % reward)
                # store the computer's transition in the same flattened form used for the human move
                self.RL.store_transition(self.transfore(wobservation), action1, reward, self.transfore(wobservation_))
                if done:
                    tkinter.messagebox.showinfo(title='Notice', message='You lose')
                    self.RL.saveavarriable()
                    self.RL.plot_cost()
                    self.gameStart = False

    def initView(self):
        def buttonCallBack():
            self.RL.getvarriable()
            self.gameStart = True
            if len(self.allphoto) > 0:

                for i in self.allphoto:
                    self.w.delete(i)

            self.allphoto.clear()
            self.doneList.clear()
            observation = self.env.reset()

        self.master = Tk()
        self.master.title("五子棋")
        self.master.resizable(width=False, height=False)
        self.w = Canvas(self.master, bg="#FFFFF0", width=700, height=630)
        for c in range(40, 610, 30):  # vertical grid lines
            x0, y0, x1, y1 = c, 40, c, 580
            self.w.create_line(x0, y0, x1, y1)
        for r in range(40, 610, 30):
            x0, y0, x1, y1 = 40, r, 580, r
            self.w.create_line(x0, y0, x1, y1)
        Label(self.w, text=1, bg="#FFFFF0").place(x=5, y=5)
        x1 = 60
        y1 = 5
        for i in range(2, 20):
            Label(self.w, text=i, bg="#FFFFF0").place(x=x1, y=y1)
            x1 += 30
        x1 = 5
        y1 = 60
        for i in range(2, 20):
            Label(self.w, text=i, bg="#FFFFF0").place(x=x1, y=y1)
            y1 += 30
        Button(self.w, text="开始游戏", bg="yellow", activebackground="Black", command=buttonCallBack).place(x=610, y=500)
        self.w.bind("<Double-Button-1>", self.callback)
        self.w.pack()
        #self.master.mainloop()


    def show(self,action,flag):
        y=(action//19)*30+40
        x=(action%19)*30+40
        if flag=='Black':
            a=self.w.create_oval(x-14,y-14,x+14,y+14,fill="Black")
        elif flag=='White':
            a = self.w.create_oval(x-14, y-14, x+14, y+14, fill="White")
        self.allphoto.append(a)
        self.update()

    def setPosition(self, action, flag):
        if action in self.doneList:
            tkinter.messagebox.showinfo(title='Notice', message='This position is already occupied')
        else:
            self.doneList.append(action)
            self.show(action, flag)

    def reset(self):
        if len(self.allphoto)>0:

            for i in self.allphoto:
                self.w.delete(i)
        self.allphoto.clear()
        self.doneList.clear()
        self.gameStart=False
        observation=self.env.reset()
        ob=self.getdouble(np.reshape(observation,[1,space]))
        return np.copy(self.env.qipan),ob


    #############################################
    def step(self, action, flag):
        # return the reward according to which side placed the stone
        # print(flag)
        # print('action: %d' % action)
        p1 = self.env.pwb(flag)
        p2 = self.env.pwn(action, flag)  # winning potential after the move

        # print('score before the move: %d' % p1)
        # print('score after the move: %d' % p2)
        s=p2-p1
        # if s<=0:
        #     self.reward=0
        # elif 0<s<150:
        #     self.reward=300
        # elif 150<=s<800:
        #     self.reward=500
        # elif 800<=s<3500:
        #     self.reward=2000
        # elif 3500<=s<4800:
        #     self.reward=4000
        # elif s>4800:
        #     self.reward=6000

        print("该步的回报值:%d"%s)

        self.setPosition(action,flag)
        if(s==-120):
            time.sleep(10000)
        qipan=self.getdouble(np.reshape(np.copy(self.env.qipan),[1,space]))
        return np.copy(self.env.qipan),qipan,s,self.env.done


    def tryPosition(self, Ob, action, flag):
        qipan = np.copy(Ob)
        if flag == 'White':
            qipan[0, action] = 1
        else:
            qipan[0, action] = 2
        return qipan


    def render(self):
        self.update()

    def transfore(self, observation):
        # print(np.shape(shape)[1])
        s1 = observation[0, :space]
        s2 = observation[0, space:]
        s = np.hstack((s1, s2))
        return s

    # convert the 1x361 board into a 1x722 representation
    def getdouble(self, qipan):
        w_qipan = np.zeros([1, space])
        b_qipan = np.zeros([1, space])
        w_array = np.where(qipan == 1)[1]
        b_array = np.where(qipan == 2)[1]
        w_qipan[0, w_array] = 1
        b_qipan[0, b_array] = 1
        s = np.hstack((w_qipan, b_qipan))  # 1x722 matrix: the first 361 entries encode the white stones, the last 361 the black stones
        return s
Example 7
class Trainer(object):
    def __init__(self):
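        # Load the test start/end pairs, train on randomly sampled maps that avoid them, and record the total training time.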
        start_table = dict()
        end_table = dict()
        self.RL = DeepQNetwork(n_actions,
                               n_features,
                               learning_rate=0.01,
                               reward_decay=0.9,
                               e_greedy=0.9,
                               replace_target_iter=200,
                               memory_size=2000,
                               output_graph=False,
                               testing=False)

        filename = "test_destinations.txt"
        f = open(filename, "r")

        for line in f:
            nums = line.split(';')
            start_ = nums[0].split(',')
            end_ = nums[1].split(',')

            start = [0, 0]
            end = [0, 0]
            start[0] = int(start_[0])
            start[1] = int(start_[1])
            end[0] = int(end_[0])
            end[1] = int(end_[1])

            start_table[start[0]] = start[1]
            end_table[end[0]] = end[1]

        # Training Time keeping
        total_time = 0
        start = time.time()

        # train on 150 random samples
        self.run_training(150, start_table, end_table)

        # Training Time keeping
        total_time = (time.time() -
                      start) / 60  # minutes spent training on the samples
        time_file = "trainTime.txt"
        f = open(time_file, "w+")
        f.write(str(total_time))
        f.close()

    def run_training(self, training_samples, start_table, end_table):
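        # Sample random start/end pairs that avoid the test cases and train on a fresh Map instance for each sample.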
        # Train over multiple instances
        map_file = np.loadtxt('map.txt', dtype=int)
        # bounding negative values for keeping it in bounds
        map_file[0, :] = MIN_VALUE
        map_file[:, 0] = MIN_VALUE
        map_file[:, len(map_file) - 1] = MIN_VALUE
        map_file[len(map_file) - 1, :] = MIN_VALUE

        for sample_x in range(training_samples):
            start = [
                random.randint(1, IMG_SIZE - 1),
                random.randint(1, IMG_SIZE - 1)
            ]
            end = [
                random.randint(1, IMG_SIZE - 1),
                random.randint(1, IMG_SIZE - 1)
            ]

            # query dictionary
            start_ = start_table.get(start[0], -1)
            end_ = end_table.get(end[0], -1)

            # ensure different than test cases
            while (start_ == start[1] and end_ == end[1]):
                start = [
                    random.randint(1, IMG_SIZE - 1),
                    random.randint(1, IMG_SIZE - 1)
                ]
                end = [
                    random.randint(1, IMG_SIZE - 1),
                    random.randint(1, IMG_SIZE - 1)
                ]
                start_ = start_table.get(start[0], -1)
                end_ = end_table.get(end[0], -1)

            total_epochs = 300

            # UAV map emulation
            env = Map(start, end, sample_x, map_file, False)
            self.run_map(str(sample_x), env, total_epochs)

            print("Finished training", sample_x)
        print("done training")

        # Save model here

    def run_map(self, i, env, epochs):
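        # Run the requested number of episodes on one map and save a plot of the per-episode step counts.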
        step = 0
        s = []
        for episode in range(epochs):
            print("starting epoch ", episode)
            # initial observation
            observation = env.reset(str(episode))
            count = 0
            while True:
                count += 1
                # RL choose action based on observation
                action = self.RL.choose_action(observation)

                # RL take action and get next observation and reward
                observation_, reward, done = env.step(action)

                self.RL.store_transition(observation, action, reward,
                                         observation_)

                if ((step > 200) and (step % 5 == 0)) or done:
                    self.RL.learn(done)

                # swap observation
                observation = observation_

                # break while loop when end of this episode
                if done:
                    break
                step += 1
            s.append(count)

        plt.plot(np.arange(len(s)), s)
        plt.ylabel('points to goal')
        plt.xlabel('training steps')

        folder = "../DQN_path/graphs/"
        figname = folder + i + "_figPtsv1.png"
        plt.savefig(figname)
        plt.clf()
Example 8
                HE_soc = []
                HP_soc = []
            else:
                print(time)
                Env_battery_update = Env_battery(final_data[0:time], HE_power_vector , HP_power_vector, HEcur, HPcur, speed[0:time])
                #RL = DeepQNetwork(Env_battery_update.n_actions, Env_battery_update.n_states, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, replace_target_iter=200, memory_size=2000)
                action = RL.choose_action(observation)

                # RL take action and get next observation and reward
                observation_, reward, HE_power_vector, HP_power_vector, HEcur, HPcur = Env_battery_update.step(action, time-1)  #####time stamp of action
                print("reward", reward)
                reward_total = reward_total + reward
                
                RL.store_transition(observation, action, reward, observation_)
                
                cost = RL.learn()
                cost_total = cost_total + cost
                # swap observation
                observation = observation_
                HE_soc.append(observation[0]) 
                HP_soc.append(observation[1])
                # break while loop when end of this episode
    # Env_battery.mainloop()
        #RL.plot_cost() 
        cost_final.append(cost_total)
        reward_final.append(reward_total)
		
    plt.plot(np.arange(len(cost_final)),cost_final)
    plt.ylabel('Cost')
    plt.xlabel('Epoch')
    plt.show()