def DQN():
    import tensorflow as tf
    from DQN import DeepQNetwork
    import numpy as np

    game.restart_game()
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    dqn = DeepQNetwork(sess, game)

    # stack four copies of the initial frame to build the first state
    game_state = game.current_state()
    start_state = np.concatenate(
        (game_state, game_state, game_state, game_state), axis=2)
    s_t = start_state

    while not game.game_end():
        # choose an action epsilon greedily
        _, action_index = dqn.choose_action(s_t)
        move = action_index
        game.do_move(move)
        pygame.event.pump()

        # push the newest frame into the state and drop the oldest one
        game_state = game.current_state()
        s_t = np.append(game_state, s_t[:, :, :-2], axis=2)

        # redraw the board
        screen.fill(black)
        game.snake.blit(rect_len, screen)
        game.strawberry.blit(screen)
        game.blit_score(white, screen)
        pygame.display.flip()
        fpsClock.tick(15)

    crash()
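# The s_t[:, :, :-2] slice above suggests each game frame contributes two
# channels, so the stacked state always keeps the four most recent frames.
# A minimal sketch of that shape bookkeeping with dummy arrays; the 20x20,
# 2-channel frame size is an assumption for illustration, not taken from the
# snippet above.
import numpy as np

frame = np.zeros((20, 20, 2))               # one (assumed) 2-channel frame
s_t = np.concatenate((frame, frame, frame, frame), axis=2)
print(s_t.shape)                            # (20, 20, 8): four stacked frames

new_frame = np.ones((20, 20, 2))
# prepend the newest frame and drop the oldest two channels, keeping depth 8
s_t = np.append(new_frame, s_t[:, :, :-2], axis=2)
print(s_t.shape)                            # still (20, 20, 8)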
def main():
    env = RideHitch("data/norm1000.txt")
    print(env.requests_list)
    RL = DeepQNetwork(env.pool_size, env.state_num,
                      learning_rate=0.01,
                      reward_decay=0.99,
                      e_greedy=1,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=False,
                      T=env.T_threshold,
                      D=env.D_threshold)
    step = 0
    matched_list = []
    for episode in range(100):
        # init
        observation = env.reset(reset_seq=False)
        # if episode % 100 == 0:
        #     print(episode)
        matched = 0
        print("seq size:", env.request_num, "pool size:", env.pool_size)
        while True:
            action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            if reward > 0:
                matched += 1
            RL.store_transition(observation, action, reward, observation_)
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            observation = observation_
            if done:
                break
            step += 1
        matched_list.append(matched)
        print("eps", episode, "matching", matched)
    # print(matched_list)
    RL.plot_cost()
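# The loop above collects per-episode match counts in matched_list but only
# prints them (and that print is commented out). A possible follow-up, sketched
# here as a hypothetical helper that is not part of the original script, is to
# plot the matching curve with matplotlib:
import matplotlib.pyplot as plt

def plot_matching_curve(matched_list):
    """Plot the number of matched requests per training episode."""
    plt.plot(range(len(matched_list)), matched_list)
    plt.xlabel('episode')
    plt.ylabel('matched requests')
    plt.show()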
    learning_rate=0.0001,
    reward_decay=0.9,
    e_greedy=0.75,
    replace_target_iter=2000,
    memory_size=MEMORYCAPACITY,
    batch_size=64
    # output_graph=True
)
RL.restore_model()

for episode in range(EPS):
    env.build_map()
    value = 0
    for step in range(STEP):
        state = env.state.copy()
        action = RL.choose_action(state)
        env.step(action_space[action])
        state_ = env.state.copy()
        reward, dist = compute_reward(state, state_)
        RL.store_transition(state, action, reward, state_)
        value += reward
        if dist < DIST:
            break
        if RL.memory_counter > MEMORYCAPACITY:
            RL.learn()
    if (episode + 1) % 100 == 0:
        env.display2D()
        print(episode + 1)
for it in range(250):
    for i in range(1000):
        # read data
        curr_task = read_task(i, df_task_usage)
        env.update_env(last_input_time, curr_task.input_time)
        # if str(job_ID) not in jobs:
        #     curr_job = env.Job(job_ID)
        #     jobs[str(job_ID)] = curr_job  # need a hashmap to match the id to the job
        # curr_job.tasks.append(curr_task)

        observation_1_new = get_observation_1(curr_task, env)

        # stage 1: pick a server farm
        action1 = RL_farm.choose_action(observation_1_new)
        farm_id, waiting_time = env.step_farm(action1)
        curr_server_farm = env.serverfarms[farm_id]

        # stage 2: pick a server inside the chosen farm
        observation_2_new = get_observation_2(curr_task, curr_server_farm)
        action2 = RL_server[farm_id].choose_action(observation_2_new)
        server_id = int(action2)
        curr_server = curr_server_farm.servers[server_id]

        # check hard deadline & CPU/memory
        curr_task.start_time = curr_task.input_time + waiting_time
        curr_task.end_time = curr_task.start_time + curr_task.execution_time
        print("----------------------")
        print("start_time : " + str(curr_task.start_time))
best_reward = 0
best_pp = None
reward_list = []
for episode in range(nEpisodes):
    # DQN
    observation, info = env.reset()
    # frame = info["frame"]
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(action_transform(observation))
        observation_, reward, done, info = env.step(action)
        RL.store_transition(action_transform(observation), action, reward,
                            action_transform(observation_))
        ep_r += reward
        if total_steps > 1000:
            RL.learn()
        if done:
            print('episode: ', episode,
                  'ep_r: ', round(ep_r, 2),
                  ' epsilon: ', round(RL.epsilon, 2))
            break
        # advance the state and the step counter so the loop makes progress
        # (assumed; these lines are not shown in the original fragment)
        observation = observation_
        total_steps += 1
    replace_target_iter=100,
    memory_size=2000,
    e_greedy_increment=0.001,
)

total_steps = 0
for episode in range(1000):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        # reshape the reward: larger when the cart is near the centre (r1)
        # and the pole is close to upright (r2)
        x, x_dot, theta, theta_dot = observation_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        reward = r1 + r2

        RL.store_transition(observation, action, reward, observation_)
        ep_r += reward
        if total_steps > 1000:
            RL.learn()

        if done:
            print("episode: ", episode,
                  "ep_r", round(ep_r, 2),
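# The block above replaces CartPole's constant +1 reward with a shaped reward
# that grows as the cart stays near the centre and the pole stays upright.
# A standalone sketch of the same formula; the threshold defaults below are the
# standard gym CartPole values (2.4 m and 12 degrees, about 0.2094 rad) and are
# hard-coded here only for illustration.
def shaped_cartpole_reward(x, theta, x_threshold=2.4,
                           theta_threshold_radians=0.2094):
    r1 = (x_threshold - abs(x)) / x_threshold - 0.8
    r2 = (theta_threshold_radians - abs(theta)) / theta_threshold_radians - 0.5
    return r1 + r2

print(shaped_cartpole_reward(0.0, 0.0))     # 0.7: centred and upright
print(shaped_cartpole_reward(2.3, 0.19))    # negative near the failure bounds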
class LTDQN(Approach):
    def __init__(self, budget, times, users, n_scope, r_interval=0.01, isTrain=True):
        Approach.__init__(self, budget, times, users)
        self.n_scope = n_scope
        self.state_dim = 8
        self.action_dim = 9
        self.r_interval = r_interval
        if isTrain:
            self.dqn = DeepQNetwork(self.action_dim, self.state_dim)
        else:
            self.dqn = DeepQNetwork(self.action_dim, self.state_dim,
                                    e_greedy_increment=None)

    def generate_reward(self, action, user):
        if action == 1:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
        elif action == 2:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
        elif action == 3:
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 4:
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1
        elif action == 5:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 6:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1
        elif action == 7:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 8:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1

    def simulate(self):
        self.dqn.load()
        for ep in range(1):
            # self.users = self.init_users_list()
            total_benefits = 0.
            total_expense = 0.
            for time in range(self.times):
                total_affected_num = 0
                total_req_num = 0.
                for user in self.users:
                    if user.finished == 0:
                        if self.budget > 0:
                            output = self.dqn.choose_action(user.state)
                            self.generate_reward(output, user)
                            # if user.default_single_r >= 0.5:
                            user.receive_offer(user.default_single_r, user.default_num, output)
                            # else:
                            #     user.receive_offer(0, user.default_num, output)
                            self.budget -= user.r
                        else:
                            user.receive_offer(0., 0, 0)
                        total_req_num += user.req_num
                    action, benefits, reward, done = user.choose_action()
                    if done:
                        if user.finished == 0:
                            self.budget += user.r
                        user.reset_status()
                    # self.dqn.store_transition(user.state, action, reward, user.state_)
                    user.state = user.state_.copy()
                    # self.dqn.learn()
                    if user.action == len(user.preference) - 1:
                        total_affected_num += 1
                    if benefits > 0:
                        total_benefits += benefits
                        total_expense += benefits / (1. - benefits + 0.001)
                if (time + 1) % self.interval == 0:
                    self.affected_users_num.append(total_affected_num)
                    self.total_benefits.append(total_benefits)
                    self.average_req_num.append(total_req_num / len(self.users))
                    self.ratio.append(total_expense)
                print("\rEpisode: %d, Time step: %d, Budget: %f, Benefits: %f"
                      % (ep, time, self.budget, total_benefits), end=' ')
            print()

    def init_users_list(self):
        user_list = []
        arr = np.loadtxt('../dataset/test.txt', delimiter=' ')
        # arr = arr[0:2000, :]  # train
        # print(arr)
        total_cost = 0.
        for row in range(arr.shape[0]):
            data = arr[row, :]
            # print(data[0])
            user = User(row, float(data[0]), data[1:])
            total_cost += user.preference[np.argmax(user.preference)] - user.preference[-1]
            user_list.append(user)
        print(len(user_list), total_cost)
        return user_list

    def train(self):
        for ep in range(500):
            self.budget = 50000
            self.users = self.init_users_list()
            # self.users = self.init_users_list()
            self.dqn.epsilon = 0
            total_benefits = 0.
            for time in range(self.times):
                total_affected_num = 0
                total_req_num = 0.
                for user in self.users:
                    if user.finished == 0:
                        if self.budget > 0:
                            output = self.dqn.choose_action(user.state)
                            self.generate_reward(output, user)
                            user.receive_offer(user.default_single_r, user.default_num, output)
                            self.budget -= user.r
                        else:
                            user.receive_offer(0., 0, 0)
                        total_req_num += user.req_num
                    action, benefits, reward, done = user.choose_action()
                    if done:
                        if user.finished == 0:
                            self.budget += user.r
                        user.reset_status()
                    # print(user.state_, user.state)
                    self.dqn.store_transition(user.state, action, reward, user.state_)
                    user.state = user.state_.copy()
                    self.dqn.learn()
                    if user.action == len(user.preference) - 1:
                        total_affected_num += 1
                    if benefits > 0:
                        total_benefits += benefits
                if (time + 1) % self.interval == 0:
                    self.affected_users_num.append(total_affected_num)
                    self.total_benefits.append(total_benefits)
                    self.average_req_num.append(total_req_num / len(self.users))
                print("\rEpisode: %d, Time step: %d, Budget: %f, Benefits: %f"
                      % (ep, time, self.budget, total_benefits), end=' ')
                if self.budget <= 0:
                    break
            print()
        self.dqn.save()
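# The nine-way if/elif chain in generate_reward adjusts the offered per-unit
# reward and the offered number by fixed deltas and then clamps them.
# A possible, more compact equivalent is a delta lookup table; this is only a
# sketch of the same mapping (assuming default_single_r already lies in [0, 1]
# and default_num in [1, n_scope]), not code from the original project.
ACTION_DELTAS = {
    1: (+1, 0), 2: (-1, 0),    # raise / lower the per-unit reward
    3: (0, +1), 4: (0, -1),    # raise / lower the offered number
    5: (+1, +1), 6: (+1, -1),  # adjust both at once
    7: (-1, +1), 8: (-1, -1),
}

def generate_reward_table(user, action, r_interval, n_scope):
    r_dir, n_dir = ACTION_DELTAS.get(action, (0, 0))  # action 0: no change
    user.default_single_r = min(1., max(0., user.default_single_r + r_dir * r_interval))
    user.default_num = min(n_scope, max(1, user.default_num + n_dir))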
class view(tkinter.Tk):
    def __init__(self):
        self.gameStart = False
        self.status = False
        self.reward = 0
        super(view, self).__init__()
        self.n_actions = 361  # number of possible actions
        self.n_features = 361
        self.doneList = []
        self.allphoto = []
        self.initView()
        self.env = env()
        self.wobservation = None
        self.wobservation_ = None
        self.action1 = None
        self.RL = DeepQNetwork(self.n_actions, self.n_features)

    def callback(self, event):
        if self.gameStart:
            mouse_x = event.x
            mouse_y = event.y
            if 590 > mouse_x > 20 and 590 > mouse_y > 20:
                # a is the horizontal index, b is the vertical index
                a = round((mouse_x - 40) / 30)
                b = round((mouse_y - 40) / 30)
                action = b * 19 + a
                # self.env.qipan[b, a] = 2, the human (non-computer) side
                observation = self.getdouble(np.reshape(np.copy(self.env.qipan), [1, space]))
                bobservation = self.transfore(observation)
                qipan, observation_, reward, done = self.step(action, 'Black')
                bobservation_ = self.transfore(observation_)
                print('Reward for the human move: %d' % reward)
                # assume the human plays optimally, so weight this reward by 1.5
                self.RL.store_transition(bobservation, action, reward * 1.5, bobservation_)
                if done:
                    tkinter.messagebox.showinfo(title='Notice', message='You win!')
                    self.RL.learn(flag=2)
                    self.RL.saveavarriable()
                    self.RL.plot_cost()
                    self.gameStart = False
                    # self.status = True

                # the computer chooses its action
                self.bqipan = np.copy(self.env.qipan)
                wobservation = self.getdouble(np.reshape(self.bqipan, [1, space]))
                action1 = self.RL.choose_action(self.bqipan, wobservation)  # let the computer pick the next move
                bqipan_, wobservation_, reward, done = self.step(action1, 'White')
                print('Reward for the computer move: %d' % reward)
                # store the computer's transition, with the state vectors flattened the
                # same way as the human's
                self.RL.store_transition(self.transfore(wobservation), action1, reward,
                                         self.transfore(wobservation_))
                if done:
                    tkinter.messagebox.showinfo(title='Notice', message='You lose.')
                    self.RL.saveavarriable()
                    self.RL.plot_cost()
                    self.gameStart = False

    def initView(self):
        def buttonCallBack():
            self.RL.getvarriable()
            self.gameStart = True
            if len(self.allphoto) > 0:
                for i in self.allphoto:
                    self.w.delete(i)
                self.allphoto.clear()
            self.doneList.clear()
            observation = self.env.reset()

        self.master = Tk()
        self.master.title("Gomoku")
        self.master.resizable(width=False, height=False)
        self.w = Canvas(self.master, bg="#FFFFF0", width=700, height=630)
        for c in range(40, 610, 30):
            # vertical grid lines
            x0, y0, x1, y1 = c, 40, c, 580
            self.w.create_line(x0, y0, x1, y1)
        for r in range(40, 610, 30):
            # horizontal grid lines
            x0, y0, x1, y1 = 40, r, 580, r
            self.w.create_line(x0, y0, x1, y1)
        Label(self.w, text=1, bg="#FFFFF0").place(x=5, y=5)
        x1 = 60
        y1 = 5
        for i in range(2, 20):
            Label(self.w, text=i, bg="#FFFFF0").place(x=x1, y=y1)
            x1 += 30
        x1 = 5
        y1 = 60
        for i in range(2, 20):
            Label(self.w, text=i, bg="#FFFFF0").place(x=x1, y=y1)
            y1 += 30
        Button(self.w, text="Start game", bg="yellow", activebackground="Black",
               command=buttonCallBack).place(x=610, y=500)
        self.w.bind("<Double-Button-1>", self.callback)
        self.w.pack()
        # self.master.mainloop()

    def show(self, action, flag):
        y = (action // 19) * 30 + 40
        x = (action % 19) * 30 + 40
        if flag == 'Black':
            a = self.w.create_oval(x - 14, y - 14, x + 14, y + 14, fill="Black")
        elif flag == 'White':
            a = self.w.create_oval(x - 14, y - 14, x + 14, y + 14, fill="White")
        self.allphoto.append(a)
        self.update()

    def setPosition(self, action, flag):
        if action in self.doneList:
            tkinter.messagebox.showinfo(title='Notice', message='This position is already taken')
        else:
            self.doneList.append(action)
            self.show(action, flag)

    def reset(self):
        if len(self.allphoto) > 0:
            for i in self.allphoto:
                self.w.delete(i)
            self.allphoto.clear()
        self.doneList.clear()
        self.gameStart = False
        observation = self.env.reset()
        ob = self.getdouble(np.reshape(observation, [1, space]))
        return np.copy(self.env.qipan), ob

    #############################################
    def step(self, action, flag):
        # return the reward for the side that just moved
        # print(flag)
        # print('action: %d' % action)
        p1 = self.env.pwb(flag)
        p2 = self.env.pwn(action, flag)
        # winning potential after the move
        # print('Score before the move: %d' % p1)
        # print('Score after the move: %d' % p2)
        s = p2 - p1
        # if s <= 0:
        #     self.reward = 0
        # elif 0 < s < 150:
        #     self.reward = 300
        # elif 150 <= s < 800:
        #     self.reward = 500
        # elif 800 <= s < 3500:
        #     self.reward = 2000
        # elif 3500 <= s < 4800:
        #     self.reward = 4000
        # elif s > 4800:
        #     self.reward = 6000
        print("Reward for this move: %d" % s)
        self.setPosition(action, flag)
        if s == -120:
            time.sleep(10000)
        qipan = self.getdouble(np.reshape(np.copy(self.env.qipan), [1, space]))
        return np.copy(self.env.qipan), qipan, s, self.env.done

    def tryPosition(self, Ob, ation, flag):
        qipan = np.copy(Ob)
        if flag == 'White':
            qipan[0, ation] = 1
        else:
            qipan[0, ation] = 2
        return qipan

    def render(self):
        self.update()

    def transfore(self, observation):
        # print(np.shape(shape)[1])
        s1 = observation[0, :space]
        s2 = observation[0, space:]
        s = np.hstack((s1, s2))
        return s

    # convert the 1*361 board into the 1*722 form
    def getdouble(self, qipan):
        w_qipan = np.zeros([1, space])
        b_qipan = np.zeros([1, space])
        w_array = np.where(qipan == 1)[1]
        b_array = np.where(qipan == 2)[1]
        w_qipan[0, w_array] = 1
        b_qipan[0, b_array] = 1
        # a 1*722 matrix: the first 361 entries mark the white stones,
        # the last 361 mark the black stones
        s = np.hstack((w_qipan, b_qipan))
        return s
class Trainer(object):
    def __init__(self):
        start_table = dict()
        end_table = dict()
        self.RL = DeepQNetwork(n_actions, n_features,
                               learning_rate=0.01,
                               reward_decay=0.9,
                               e_greedy=0.9,
                               replace_target_iter=200,
                               memory_size=2000,
                               output_graph=False,
                               testing=False)

        # load the held-out test start/end pairs so training never reuses them
        filename = "test_destinations.txt"
        with open(filename, "r") as f:
            for line in f:
                nums = line.split(';')
                start_ = nums[0].split(',')
                end_ = nums[1].split(',')
                start = [int(start_[0]), int(start_[1])]
                end = [int(end_[0]), int(end_[1])]
                start_table[start[0]] = start[1]
                end_table[end[0]] = end[1]

        # Training time keeping
        total_time = 0
        start = time.time()

        # train on 150 samples
        self.run_training(150, start_table, end_table)

        # record the minutes spent training
        total_time = (time.time() - start) / 60
        time_file = "trainTime.txt"
        f = open(time_file, "w+")
        f.write(str(total_time))
        f.close()

    def run_training(self, training_samples, start_table, end_table):
        # Train over multiple instances
        map_file = np.loadtxt('map.txt', dtype=int)

        # bounding negative values for keeping the agent in bounds
        map_file[0, :] = MIN_VALUE
        map_file[:, 0] = MIN_VALUE
        map_file[:, len(map_file) - 1] = MIN_VALUE
        map_file[len(map_file) - 1, :] = MIN_VALUE

        for sample_x in range(training_samples):
            start = [
                random.randint(1, IMG_SIZE - 1),
                random.randint(1, IMG_SIZE - 1)
            ]
            end = [
                random.randint(1, IMG_SIZE - 1),
                random.randint(1, IMG_SIZE - 1)
            ]

            # query dictionary
            start_ = start_table.get(start[0], -1)
            end_ = end_table.get(end[0], -1)

            # ensure the pair is different from the test cases
            while start_ == start[1] and end_ == end[1]:
                start = [
                    random.randint(1, IMG_SIZE - 1),
                    random.randint(1, IMG_SIZE - 1)
                ]
                end = [
                    random.randint(1, IMG_SIZE - 1),
                    random.randint(1, IMG_SIZE - 1)
                ]
                start_ = start_table.get(start[0], -1)
                end_ = end_table.get(end[0], -1)

            total_epochs = 300
            # UAV map emulation
            env = Map(start, end, sample_x, map_file, False)
            self.run_map(str(sample_x), env, total_epochs)
            print("Finished training", sample_x)
        print("done training")
        # Save model here

    def run_map(self, i, env, epochs):
        step = 0
        s = []
        for episode in range(epochs):
            print("starting epoch ", episode)
            # initial observation
            observation = env.reset(str(episode))
            count = 0
            while True:
                count += 1
                # RL choose action based on observation
                action = self.RL.choose_action(observation)
                # RL take action and get next observation and reward
                observation_, reward, done = env.step(action)
                self.RL.store_transition(observation, action, reward, observation_)
                if ((step > 200) and (step % 5 == 0)) or done:
                    self.RL.learn(done)
                # swap observation
                observation = observation_
                # break while loop when end of this episode
                if done:
                    break
                step += 1
            s.append(count)

        plt.plot(np.arange(len(s)), s)
        plt.ylabel('points to goal')
        plt.xlabel('training steps')
        folder = "../DQN_path/graphs/"
        figname = folder + i + "_figPtsv1.png"
        plt.savefig(figname)
        plt.clf()
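# Every training loop in this file drives its DeepQNetwork through the same
# small interface: choose_action, store_transition and learn (some variants
# also use plot_cost, save/load/restore_model helpers, an epsilon attribute or
# a memory_counter). A minimal stub of that assumed interface, useful as a
# checklist when wiring up a new environment; the bodies below are placeholders,
# not any of the original implementations.
import numpy as np

class DeepQNetworkStub:
    def __init__(self, n_actions, n_features, e_greedy=0.9, **kwargs):
        self.n_actions = n_actions
        self.n_features = n_features
        self.epsilon = e_greedy
        self.memory_counter = 0

    def choose_action(self, observation):
        # placeholder: a real implementation queries the online Q-network and
        # acts epsilon-greedily
        return np.random.randint(self.n_actions)

    def store_transition(self, s, a, r, s_):
        # placeholder: a real implementation writes (s, a, r, s_) into a
        # fixed-size replay memory
        self.memory_counter += 1

    def learn(self, *args):
        # placeholder: a real implementation samples a minibatch, takes a
        # gradient step on the TD error and periodically syncs the target net
        pass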