def reset(self):
    if self.gif:
        self.save_gif()
    # If pixel input, refill the frame buffer with fresh do-nothing frames
    if self.pixel_input:
        self.env = game.GameState(1, False)
        for i in range(4):
            frame, r, d = self.env.frame_step([1, 0], render=self.render)
            self.frame_buffer.append(self.process(frame))
        return np.transpose(self.frame_buffer, (1, 2, 0))
    else:
        self.env = game.GameState(1, False)
        s, r, d = self.env.frame_step([1, 0], render=self.render)
        return s
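# `process` is not shown in this snippet. Below is a minimal sketch of what it
# likely does, written as a standalone function and assuming the same 80x80
# grayscale/binarize pipeline the other snippets in this collection use (the
# name, size, and threshold value here are assumptions, not the original):
import cv2
import numpy as np

def process(frame, size=(80, 80)):
    """Resize, grayscale, and binarize a raw RGB game frame."""
    gray = cv2.cvtColor(cv2.resize(frame, size), cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
    return binary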
def train(self, episode, batch_size=64, freq=100):
    self.batch_size = batch_size
    tqdm_e = tqdm(range(episode))
    env = game.GameState()
    for i in tqdm_e:
        state = env.reset()
        cum_r = 0
        done = False
        STATUS = "explore"
        while not done:
            state_newaxis = state[np.newaxis, :]
            action = self.agent.e_greedy_action(state_newaxis)
            action_array = np.array([0, 0])
            action_array[action] = 1
            next_state, reward, done = env.step(action_array)
            action_onehot = to_categorical(action, self.n_action)
            ob = (state, reward, done, action_onehot, next_state)
            self.sampling_pool.add_to_buffer(ob)
            state = next_state
            cum_r += reward
            if self.sampling_pool.get_size() > self.batch_size:
                self.train_agent()
                STATUS = "train"
        if i % freq == 0:
            self.agent.transfer_weights()
            STATUS = "transfer weights"
        self.cum_r.append(cum_r)
        if (i > 10000) and not (i % 10000):
            self.save_model(f"{i}-eps-.h5")
        tqdm_e.set_description("Score: " + str(cum_r) + " | Status: " + STATUS)
        tqdm_e.refresh()
    self.save_model(f"final-{i}-eps-.h5")
def test_Network(self):
    # open up a game state to communicate with the emulator
    game_state = game.GameState()
    # get the first state and preprocess the image
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    # interact with the game once
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    # start the test loop (greedy policy, epsilon = 0)
    epsilon = 0
    t = 0
    while "flappy bird" != "angry bird":
        a_t = self.epsilon_greedy(s_t, 0.0)
        # run the selected action, interacting with the game environment once
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)
        # advance one step
        s_t = s_t1
def play_game(self):
    """
    This method trains the model to play Flappy Bird.
    TODO: Insert more docs
    """
    # 1. open up a game state to communicate with the emulator
    flappy_bird = game.GameState()
    # get the first state by doing nothing
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = flappy_bird.frame_step(do_nothing)
    self.current_state = self.pre_process_state(x_t)
    # run the selected action and observe the next state and reward
    while True:
        action, action_index = self.get_action()
        next_state, reward, terminal = flappy_bird.frame_step(action)
        next_state = self.scale_down_image(next_state)
        next_state = next_state.reshape(1, next_state.shape[0], next_state.shape[1], 1)  # 1x84x84x1
        self.experience_env(next_state, action_index, reward, terminal)
def train(self, episode, sampling_pool=sampling_pool):
    with graph.as_default():
        tqdm_e = tqdm(range(episode))
        for i in tqdm_e:
            env = game.GameState()
            state = env.reset()
            cum_r = 0
            done = False
            while not done:
                state = im_processor(state)
                state_newaxis = state[np.newaxis, :]
                action = self.actor.explore(state_newaxis)
                action_array = np.array([0, 0])
                action_array[action] = 1
                next_state, reward, done = env.step(action_array)
                action_onehot = to_categorical(action, self.n_action)
                ob = (state, reward, done, action_onehot, next_state)
                sampling_pool.add_to_buffer(ob)
                state = next_state
                cum_r += reward
            self.update(sampling_pool)
            self.cum_r.append(cum_r)
            tqdm_e.set_description("Score: " + str(cum_r))
            tqdm_e.refresh()
            if (i > 10000) and not (i % 10000):
                self.save_model(f"{i}-eps-.h5")
            del env
        self.save_model(f"final-{i}-eps-.h5")
def init_flappybird():
    env = game.GameState()
    x_t, r_0, terminal = env.step(0)
    x_t = x_t.reshape(x_t.shape[1], x_t.shape[2])
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])
    return env, s_t
def __init__(self, model, path):
    """
    self.net            instantiated model
    self.path           model save path
    self.game_state     game state
    self.batch_s_t      batch of image pixels at time t
    self.batch_s_t1     batch of image pixels at time t+1
    self.batch_a_t      batch of actions at time t
    self.batch_r        batch of rewards at time t
    self.y_batch        reward at time t plus the discounted maximum model output, i.e. the target Q-value
    self.s_t            image pixels at time t
    self.loss_data      loss value
    self.readout_t      model output at time t
    self.r_t            reward at time t
    self.s_t1           image pixels at time t+1
    self.action_index   action index
    self.t              step counter, recording how many steps have been run
    self.loss_function  loss function
    self.optimizers     optimizer
    self.epsilon        exploration coefficient
    self.D              replay data queue
    self.load(path)     loads the model
    self.observe        number of observation steps
    self.explore        number of exploration steps
    self.cuda           whether to use CUDA
    :param model: network class to instantiate
    """
    self.net = model()
    self.path = path
    self.game_state = game.GameState()
    self.batch_s_t = np.zeros([Batch, Channel, Width, High])
    self.batch_s_t1 = np.zeros([Batch, Channel, Width, High])
    self.batch_a_t = np.zeros([Batch, Actions])
    self.batch_r = np.zeros([Batch])
    self.s_t = self.get_state()
    self.loss_data = 0
    self.readout_t = None
    self.r_t = None
    self.s_t1 = None
    self.y_batch = np.zeros([Batch])
    self.action_index = 0
    self.t = 0
    # self.death = 0
    self.loss_function = nn.MSELoss()
    self.optimizers = optim.Adam(params=self.net.parameters(), lr=1e-8)
    self.epsilon = Initial_epsilon
    self.D = deque()
    self.load(path)
    self.observe = self.t + Observe
    self.explore = self.t + Explore
    self.cuda = False
    if torch.cuda.is_available():
        self.cuda = True
        self.net = self.net.cuda()
    print("begin time", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
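# `get_state` is not shown in this snippet. A minimal sketch consistent with
# the channel-first [Batch, Channel, Width, High] buffers above; the resize,
# threshold, and stacking details are assumptions, not the original code:
def get_state(self):
    """Step once with 'do nothing' and tile the frame into a channel-first stack."""
    a0 = np.zeros(Actions)
    a0[0] = 1
    x, _, _ = self.game_state.frame_step(a0)
    x = cv2.cvtColor(cv2.resize(x, (Width, High)), cv2.COLOR_BGR2GRAY)
    _, x = cv2.threshold(x, 1, 255, cv2.THRESH_BINARY)
    # repeat the first frame Channel times, channels first
    return np.stack([x] * Channel, axis=0)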
def train_Network(self, experience_buffer):
    # open up a game state to communicate with the emulator
    game_state = game.GameState()
    # get the first state and preprocess the image
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    # interact with the game once
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    # start training
    epsilon = INITIAL_EPSILON
    t = 0
    while "flappy bird" != "angry bird":
        a_t = self.epsilon_greedy(s_t, epsilon=epsilon)
        # anneal epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
        # run the selected action, interacting with the game environment once
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)
        # store the transition in the experience pool
        experience = np.reshape(np.array([s_t, a_t, r_t, s_t1, terminal]), [1, 5])
        print("experience", r_t, terminal)
        experience_buffer.add_experience(experience)
        # train once the observation phase is over
        if t > OBSERVE:
            # sample a minibatch
            train_s, train_a, train_r, train_s_, train_terminal = experience_buffer.sample(BATCH)
            target_q = []
            read_target_Q = self.sess.run(self.Q_, {self.obs_: train_s_})
            for i in range(len(train_r)):
                if train_terminal[i]:
                    target_q.append(train_r[i])
                else:
                    target_q.append(train_r[i] + GAMMA * np.max(read_target_Q[i]))
            print(target_q)
            # one training step
            self.sess.run(self.q_train_op,
                          feed_dict={self.obs: train_s, self.action: train_a, self.Q_target: target_q})
            # update the old target network
            # if t % 1000 == 0:
            self.sess.run(self.update_oldq_op)
        # advance one step
        s_t = s_t1
        t += 1
        # save the model every 10000 iterations
        if t % 10000 == 0:
            self.save_model('saved_networks/', global_step=t)
        if t <= OBSERVE:
            print("OBSERVE", t)
        else:
            print("train, steps", t, "/epsilon", epsilon, "/action_index", a_t, "/reward", r_t)
def play_game(options):
    """Play Flappy Bird with a pretrained DQN model.

    weight -- model file name containing the DQN weights
    best   -- whether the model is the best one or not
    """
    model = QNetwork()
    if options.ckpt_path is None:
        print('you should give a weight file name.')
        return
    print('load previous model weight: {}'.format(options.ckpt_path))
    episode, epsilon = load_checkpoint(options.ckpt_path, model)
    if options.cuda:
        model = model.cuda()
    algorithm = DQN(model, optim, epsilon, options)
    algorithm.set_eval()
    bird_game = game.GameState()
    bird_game.FPS = 480
    action = [1, 0]
    o, r, terminal = bird_game.frame_step(action)
    o = preprocess(o)
    rpm = ReplayMemory(1, options)
    rpm.append(o, action, r, terminal)
    start = time.time()
    fc = 0
    score = 0
    while True:
        prev_o, a, r, o, terminal = rpm.sample(1)
        # q = algorithm(o).cpu().detach().numpy()[0]
        score = max(score, bird_game.score)
        action = algorithm.get_optim_action(o)
        o, r, terminal = bird_game.frame_step(action)
        o = preprocess(o)
        # img = Image.fromarray((o*255).astype(np.uint8)).convert(mode='L')
        # img.save(f'{fc}-{r}-{q.argmax()}.png')
        # fc += 1
        if terminal or score > options.max_score * 2:
            break
        rpm.append(o, action, r, terminal)
    ela = time.time() - start
    print(f'Final Score {score}, FPS {bird_game.FPS}, {ela // 60:.0f}m{ela % 60:.0f}s')


# if __name__ == "__main__":
#     main()
def reset(self):
    self.env = game.GameState(1, False)
    # Refill the frame buffer with FRAME_BUFFER_SIZE do-nothing frames
    for _ in range(FRAME_BUFFER_SIZE):
        frame, r, done = self.env.frame_step(onehot(0))
        self.frame_buffer.append(frame)
    return self._convert_process_buffer()
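# Neither `onehot` nor `_convert_process_buffer` is shown here. Below is a
# minimal sketch of plausible implementations, assuming grayscale frames
# stacked channel-last; both names' internals and the shapes are assumptions:
import numpy as np

def onehot(index, n_actions=2):
    """Return a one-hot action vector, e.g. onehot(0) -> [1, 0]."""
    a = np.zeros(n_actions)
    a[index] = 1
    return a

# A method on the wrapper class that owns `self.frame_buffer`:
def _convert_process_buffer(self):
    """Stack the buffered frames into one (H, W, FRAME_BUFFER_SIZE) state."""
    return np.stack(self.frame_buffer, axis=2)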
def init():
    flappyBird = game.GameState()
    init_action = torch.IntTensor([1, 0])
    init_observation, _, _ = flappyBird.frame_step(init_action)
    init_observation = preprocess(init_observation)
    brain = RL_Brain(init_observation, INITIAL_EPSILON, TRAIN)
    return brain, flappyBird
def play1(rl, score_graph_path, IMAGE_WIDTH, IMAGE_HEIGHT, finish_episode):
    from game import wrapped_flappy_bird as fb
    import numpy as np

    env = fb.GameState()
    # first action is [1, 0]: do nothing
    do_nothing = np.zeros(rl.action_cnt)
    do_nothing[0] = 1
    img, r_0, terminal = env.frame_step(do_nothing)
    # image preprocessing
    img = resize_gray_binary(img, IMAGE_WIDTH, IMAGE_HEIGHT)
    s_t = np.stack((img, img, img, img), axis=2)
    episode = 0
    score_hist = []
    while True:
        # rl chooses an action based on the current state
        a_t = rl.choose_action(s_t)
        # rl takes the action and gets the next image and reward
        img, r_t, terminal = env.frame_step(a_t)
        if r_t == 1:
            rl.score_per_episode += 1
            print(rl.score_per_episode)
        if terminal:
            episode += 1
            rl.score_per_episode = round(rl.score_per_episode, 3)
            summary, summary_score = rl.sess.run(
                [rl.summary_score, rl.score],
                feed_dict={rl.score: rl.score_per_episode})
            rl.writer.add_summary(summary, episode)
            score_hist.append(rl.score_per_episode)
            rl.score_per_episode = 0.0
            if episode >= finish_episode:
                break
        img = resize_gray_binary(img, IMAGE_WIDTH, IMAGE_HEIGHT)
        img = np.reshape(img, (IMAGE_WIDTH, IMAGE_HEIGHT, 1))
        s_t1 = np.append(img, s_t[:, :, :3], axis=2)
        # swap observation
        s_t = s_t1
    max_score = max(score_hist)
    min_score = min(score_hist)
    aver_score = np.average(score_hist)
    std_deviation = np.std(score_hist)
    with open(score_graph_path + 'result.txt', 'w') as f:
        f.write('%s\n' % score_hist)
        f.write('max: %d\n' % max_score)
        f.write('min: %d\n' % min_score)
        f.write('average: %.3f\n' % aver_score)
        f.write('std deviation: %.3f\n' % std_deviation)
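# `resize_gray_binary` is defined elsewhere in this repo. A minimal sketch of
# what it presumably does, following the cv2 pipeline the other snippets use;
# the threshold value is an assumption, and square frames are assumed (note
# that cv2.resize takes its size argument as (cols, rows)):
import cv2

def resize_gray_binary(img, width, height):
    """Resize to (width, height), grayscale, then binarize to 0/255."""
    img = cv2.cvtColor(cv2.resize(img, (width, height)), cv2.COLOR_BGR2GRAY)
    _, img = cv2.threshold(img, 1, 255, cv2.THRESH_BINARY)
    return img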
def train(self, episode):
    with graph.as_default():
        tqdm_e = tqdm(range(episode))
        env = game.GameState()
        s = deque()
        a = deque()
        r = deque()
        d = deque()
        next_s = deque()
        for i in tqdm_e:
            state = env.reset()
            cum_r = 0
            done = False
            # state = np.squeeze(im_processor(state))
            state_stack = np.stack([state for _ in range(STACK_NUM)], axis=2)
            while not done:
                state_newaxis = state_stack[np.newaxis, :]
                action = self.actor.explore(state_newaxis)
                action_array = np.array([0, 0])
                action_array[action] = 1
                next_im, reward, done = env.step(action_array)
                # next_im = im_processor(next_im)
                next_state_stack = np.append(next_im, state_stack[..., :-1], axis=2)
                action_onehot = to_categorical(action, self.n_action)
                s.append(state_stack)
                a.append(action_onehot)
                r.append(reward)
                d.append(done)
                next_s.append(next_state_stack)
                state_stack = next_state_stack
                cum_r += reward
            self.cum_r.append(cum_r)
            tqdm_e.set_description("Score: " + str(cum_r))
            tqdm_e.refresh()
            # train on the episode, then clear the buffers
            self.update(s, r, d, a, next_s)
            s = deque()
            a = deque()
            r = deque()
            d = deque()
            next_s = deque()
            if (i > 10000) and not (i % 50000):
                self.save_model(f"{i}-eps-.h5")
        self.save_model(f"final-{i}-eps-.h5")
def __init__(self):
    self.env = game.GameState(1, False)
    self.pixel_input = hasattr(Settings, 'CONV_LAYERS')
    self.frame_buffer = deque(maxlen=4)
    self.render = False
    self.gif = False
    self.name_gif = 'save_'
    self.n_gif = {}
    self.images = []
def q_learning(mode, filename=None):
    if mode == 'test':
        TOTAL_OBSERVATION = 1_000
    else:
        TOTAL_OBSERVATION = 3_200
    observe = TOTAL_OBSERVATION
    epsilon = INITIAL_EPSILON
    # init network
    network = init_network(observe, epsilon, mode, filename)
    # open up a game state to communicate with the emulator
    game_state = game.GameState()
    # store the previous observations in replay memory
    queue = deque(maxlen=REPLAY_MEMORY)
    s_t0 = get_init_stack(game_state)
    t = 0
    time0 = time.time()
    total_loss = 0
    while True:
        action_index, r_t = 0, 0
        a_t = np.zeros([ACTIONS])
        action_index = chose_action(network, s_t0, a_t, t, epsilon)
        a_t[action_index] = 1
        # reduce epsilon gradually
        if epsilon > FINAL_EPSILON and t > observe:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / TOTAL_EXPLORE
        s_t1, r_t, terminal = get_next_stack(game_state, a_t, s_t0)
        queue.append((s_t0, action_index, r_t, s_t1, terminal))
        if t > observe:
            # only train once done observing
            loss, q_sa = train_network(queue, network)
        else:
            loss, q_sa = 0, 0
        total_loss += loss
        s_t0, t = s_t1, t + 1
        logging(mode, t, time0, network, observe, epsilon, action_index,
                r_t, q_sa, loss, total_loss, TOTAL_EXPLORE)
    print("Episode finished!")
    print("************************")
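# `get_init_stack` and `get_next_stack` are helpers defined elsewhere. A
# minimal sketch under the usual 4-frame-stack convention; the `preprocess`
# call and the 80x80 shape are assumptions carried over from the other
# snippets, not this repo's actual code:
import numpy as np

def get_init_stack(game_state):
    """Step once with 'do nothing' and tile that frame into a 4-deep stack."""
    a0 = np.zeros(ACTIONS)
    a0[0] = 1
    x0, _, _ = game_state.frame_step(a0)
    x0 = preprocess(x0)  # e.g. resize to 80x80, grayscale, binarize
    return np.stack((x0, x0, x0, x0), axis=2)

def get_next_stack(game_state, a_t, s_t0):
    """Step the game, preprocess the new frame, and shift it into the stack."""
    x1, r_t, terminal = game_state.frame_step(a_t)
    x1 = preprocess(x1).reshape(80, 80, 1)
    s_t1 = np.append(x1, s_t0[:, :, :3], axis=2)
    return s_t1, r_t, terminal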
def main(train=False, eval=False):
    game_state = game.GameState()
    bot = gamebot(get_model(), train)
    next_state, reward, terminal = game_state.frame_step(bot.NOTHING)
    next_state = bot.image_preprocessing(next_state)
    state = np.stack((next_state, next_state, next_state, next_state), axis=2)
    state = np.reshape(state, (1, *state.shape))
    if eval:
        results = []
        local_count = 0
    while True:
        action, action_index = bot.make_action(state)
        next_state, reward, terminal = game_state.frame_step(action)
        next_state = bot.image_preprocessing(next_state)
        next_state = next_state.reshape(1, *next_state.shape, 1)
        next_state = np.append(next_state, state[:, :, :, :3], axis=3)
        if train:
            bot.make_buffer(state, action_index, reward, next_state, terminal)
            train_index, loss = bot.make_train()
            print('Epoch: {} - loss: {}'.format(train_index, loss))
            if train_index == bot.EXPLORE:
                return
        if eval:
            if reward == 1:
                local_count += 1
            if reward == -1:
                results.append(local_count)
                print('{}: {} steps'.format(len(results), local_count))
                if len(results) == 100:
                    print('Min: {}'.format(np.min(results)))
                    print('Mean: {}'.format(np.mean(results)))
                    print('Max: {}'.format(np.max(results)))
                    return
                local_count = 0
        state = next_state
def get_game_state():
    game_state = game.GameState()
    a_file = open('logs_' + GAME + "/readout.txt", 'w')
    h_file = open('logs_' + GAME + "/hidden.txt", 'w')
    # initialization: turn the first image into an 80*80*4 tensor
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    # resize the image to 80*80 and convert it to grayscale
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_RGBA2GRAY)
    # binarize the image
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    # stack the frame into 4 channels
    s_current = np.stack((x_t, x_t, x_t, x_t), axis=2)
    return s_current, game_state
def playgame():
    dqn = Dqn()
    flappy_bird = game_interface.GameState()
    initial_action = np.array([1, 0])
    initial_frame, reward, terminal = flappy_bird.frame_step(initial_action)
    # preprocess the initial frame
    initial_frame = cv2.cvtColor(cv2.resize(initial_frame, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, initial_frame = cv2.threshold(initial_frame, 1, 255, cv2.THRESH_BINARY)
    dqn.set_initial_state(initial_frame)
    while True:
        action = dqn.get_action()
        frame, reward, terminal = flappy_bird.frame_step(action)
        sample = preprocess(frame)
        dqn.save_transition(sample, action, reward, terminal)
def dummy_play():
    def random_action():
        action = np.zeros(2)
        action[np.random.randint(2)] = 1
        return action

    # dummy play using random actions to see what happens
    game_state = game.GameState()
    action_t = np.zeros(2)
    action_t[0] = 1
    while True:
        a = random_action()
        print('get random action: ', a)
        frame, r, dead = game_state.frame_step(a)
        # the original frame shape is (288, 512, 3); it is later resized and
        # grayscaled to (72, 128, 1)
        print(f'frame: {frame.shape}, reward: {r}, dead: {dead}')
        if dead:
            print('game over.')
            break
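# The (288, 512, 3) -> (72, 128, 1) conversion mentioned in the comment above
# is not shown. A minimal sketch with cv2; the function name and the default
# interpolation are assumptions:
import cv2
import numpy as np

def to_gray_72x128(frame):
    """Downscale a (288, 512, 3) RGB frame to a (72, 128, 1) grayscale image."""
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # (288, 512)
    small = cv2.resize(gray, (128, 72))             # cv2.resize takes (cols, rows)
    return small[:, :, np.newaxis]                  # (72, 128, 1)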
def playFlappyBird():
    actions = 2
    brain = DeepQNetworks(actions)
    flappyBird = game.GameState()
    action0 = np.array([1, 0])
    observation0, reward0, terminal = flappyBird.frame_step(action0)
    observation0 = cv2.cvtColor(cv2.resize(observation0, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, observation0 = cv2.threshold(observation0, 1, 1, cv2.THRESH_BINARY)
    brain.setInitState(observation0)
    while True:
        action = brain.getAction()
        score = flappyBird.score
        next_observation, reward, terminal = flappyBird.frame_step(action)
        next_observation = preprocess(next_observation)
        brain.setPerception(next_observation, action, reward, terminal)
        if terminal:
            brain.log_score(score)
def playFlappyBird():
    # Step 1: init BrainDQN
    actions = 2
    brain = BrainDQN(actions)
    # Step 2: init the Flappy Bird game
    flappyBird = game.GameState()
    # Step 3: play the game
    # Step 3.1: obtain the initial state
    action0 = np.array([1, 0])  # do nothing
    observation0, reward0, terminal = flappyBird.frame_step(action0)
    observation0 = cv2.cvtColor(cv2.resize(observation0, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, observation0 = cv2.threshold(observation0, 1, 255, cv2.THRESH_BINARY)
    brain.setInitState(observation0)
    # Step 3.2: run the game
    while True:
        action = brain.getAction()
        nextObservation, reward, terminal = flappyBird.frame_step(action)
        nextObservation = preprocess(nextObservation)
        brain.setPerception(nextObservation, action, reward, terminal)
def init_or_restore_training_obj(savefile, x, sess, prediction):
    if os.path.exists(savefile):
        print("restore the game from savefile")
        save_obj = load_from_pickle(savefile)
        game_state = save_obj[0]
        replay = save_obj[1]
        curr_state = save_obj[2]
        epsilon = save_obj[3]
        print(len(replay))
    else:
        print("init the game")
        epsilon = config.ESPLION
        game_state = wrapped_flappy_bird.GameState()
        curr_state, replay = observation(game_state, epsilon, x, sess, prediction,
                                         step=config.OBSERVATION_STEP)
    return game_state, replay, curr_state, epsilon
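# `load_from_pickle` (and its save counterpart) are defined elsewhere in this
# repo; a minimal sketch of the obvious implementation, assuming plain pickle:
import pickle

def load_from_pickle(path):
    """Load the saved (game_state, replay, curr_state, epsilon) tuple."""
    with open(path, 'rb') as f:
        return pickle.load(f)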
def playFlappyBird():
    flappyBird = game.GameState()
    action0 = np.array([1, 0])  # do nothing
    observation0, reward0, terminal = flappyBird.frame_step(action0)
    observation0 = preprocess(observation0, shape=(80, 80))
    try:
        with open('replayMemory.pkl', 'rb') as f:
            brain = pickle.load(f)
        print('load saved brain')
    except FileNotFoundError:
        print('cannot find saved brain, create a new brain')
        brain = Brain()
    brain.setInitState(observation0)
    while True:
        action = brain.getAction()
        nextObservation, reward, terminal = flappyBird.frame_step(action)
        nextObservation = preprocess(nextObservation)
        brain.setPerception(nextObservation, action, reward, terminal)
def __init__(self):
    self.flappyBird = wrapped_flappy_bird.GameState()
    self.experience_pool = []
    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1  # one_hot: action 0
    obser, reward, done = self.flappyBird.frame_step(do_nothing)
    obser = cv2.cvtColor(cv2.resize(obser, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, obser = cv2.threshold(obser, 1, 255, cv2.THRESH_BINARY)
    observation = np.stack((obser, obser, obser, obser), axis=2)  # shape (80, 80, 4)
    # plt.ion()
    # pre-fill the experience pool: pick a random action every 5th step,
    # otherwise do nothing
    for i in range(MAX_MEMERY):
        if i % 5 == 0:
            index = np.random.randint(0, 2)
            action = np.zeros([2])
            action[index] = 1
        else:
            action = np.array([1, 0])
        next_obser, reward, done = self.flappyBird.frame_step(action)
        next_obser = self.preprocess(next_obser)
        next_observation = np.append(observation[:, :, 1:], next_obser, axis=2)
        # plt.clf()
        # plt.subplot(221)
        # plt.imshow(next_observation[:, :, 0])
        # plt.subplot(222)
        # plt.imshow(next_observation[:, :, 1])
        # plt.subplot(223)
        # plt.imshow(next_observation[:, :, 2])
        # plt.subplot(224)
        # plt.imshow(next_observation[:, :, 3])
        # plt.pause(0.01)
        self.experience_pool.append(
            [observation, reward, action, next_observation, done])
        observation = next_observation
def explore(self, act_police, episode=100):
    """The purpose of explore is to use a hand-crafted policy to collect
    some reliable data."""
    tqdm_e = tqdm(range(episode))
    env = game.GameState()
    print("explore")
    for i in tqdm_e:
        done = 0
        state = env.reset()
        act_police.reset()
        # state = np.squeeze(im_processor(state))
        state_stack = np.stack([state for _ in range(STACK_NUM)], axis=2)
        s = deque()
        a = deque()
        r = deque()
        d = deque()
        next_s = deque()
        while not done:
            action = act_police.step()
            action_array = np.array([0, 0])
            action_array[action] = 1
            next_im, reward, done = env.step(action_array)
            # next_im = im_processor(next_im)
            next_state_stack = np.append(next_im, state_stack[..., :-1], axis=2)
            action_onehot = to_categorical(action, self.n_action)
            s.append(state_stack)
            a.append(action_onehot)
            r.append(reward)
            d.append(done)
            next_s.append(next_state_stack)
            # advance the frame stack
            state_stack = next_state_stack
        self.update(s, r, d, a, next_s)
def main():
    begin_time = datetime.datetime.now()
    env = game.GameState()
    brain = DeepQNetwork(n_actions=N_ACTIONS,
                         memory_size=MEMORY_SIZE,
                         minibatch_size=MINIBATCH_SIZE,
                         gamma=GAMMA,
                         epsilon=INITIAL_EPSILON)
    step = 0
    for episode in range(MAX_EPISODE):
        # do nothing
        observation, _, _ = env.frame_step([1, 0])
        observation = preprocess(observation, False)
        brain.reset(observation)
        while True:
            action = brain.choose_action(observation)
            observation_, reward, done = env.frame_step(action)
            observation_ = preprocess(observation_, True)
            brain.store_transition(observation, action, reward, done, observation_)
            # start learning once enough transitions have been stored
            if step > 200:
                brain.learn()
            if done:
                break
            observation = observation_
            step += 1
        end_time = datetime.datetime.now()
        print("episode {} over. exec time:{} step:{}".format(
            episode, end_time - begin_time, step))
    env.exit("game over")
def trainNetwork(model, args):
    # open up a game state to communicate with the emulator
    game_state = game.GameState()
    # store the previous observations in replay memory
    D = deque()
    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = skimage.color.rgb2gray(x_t)
    x_t = skimage.transform.resize(x_t, (80, 80))
    x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))
    x_t = x_t / 255.0
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])  # 1x80x80x4
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        model.restore(sess)
        if args['mode'] == 'Run':
            OBSERVE = 999999999  # keep observing, never train
            epsilon = 0
        else:
            # training mode
            OBSERVE = OBSERVATION
            epsilon = INITIAL_EPSILON
        t = 0
        while True:
            episode_length = 0
            episode_reward = 0
            for iter_i in itertools.count():
                loss = 0
                Q_sa = 0
                action_index = 0
                r_t = 0
                a_t = np.zeros([ACTIONS])
                # choose an action epsilon-greedily
                if t % FRAME_PER_ACTION == 0:
                    if random.random() <= epsilon:
                        # print("----------Random Action----------")
                        action_index = random.randrange(ACTIONS)
                        a_t[action_index] = 1
                    else:
                        # input a stack of 4 images, get the prediction
                        q = model.predict(sess, s_t)
                        max_Q = np.argmax(q)
                        action_index = max_Q
                        a_t[max_Q] = 1
                # reduce epsilon gradually
                if epsilon > FINAL_EPSILON and t > OBSERVE:
                    epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
                # run the selected action and observe the next state and reward
                x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
                episode_length += 1
                episode_reward += r_t
                x_t1 = skimage.color.rgb2gray(x_t1_colored)
                x_t1 = skimage.transform.resize(x_t1, (80, 80))
                x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))
                x_t1 = x_t1 / 255.0
                x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  # 1x80x80x1
                s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)
                # store the transition in D
                D.append((s_t, action_index, r_t, s_t1, terminal))
                if len(D) > REPLAY_MEMORY:
                    D.popleft()
                # only train once done observing
                if t > OBSERVE:
                    # sample a minibatch and do experience replay
                    minibatch = random.sample(D, BATCH)
                    state_t, action_t, reward_t, state_t1, terminal_batch = zip(*minibatch)
                    state_t = np.concatenate(state_t)
                    state_t1 = np.concatenate(state_t1)
                    targets = model.predict(sess, state_t)
                    Q_sa = model.predict(sess, state_t1)
                    targets[range(BATCH), action_t] = reward_t + GAMMA * np.max(Q_sa, axis=1) * np.invert(terminal_batch)
                    loss += model.update(sess, state_t, targets)
                # save progress every 1000 iterations
                if t % 1000 == 0:
                    print("Now we save model")
                    model.save(sess)
                s_t = s_t1
                t = t + 1
                # print info
                if t <= OBSERVE:
                    state = "observe"
                elif t > OBSERVE and t <= OBSERVE + EXPLORE:
                    state = "explore"
                else:
                    state = "train"
                if t % 100 == 0:
                    print("TIMESTEP", t, "/ EPISODE_LENGTH", episode_length, "/ STATE", state,
                          "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t,
                          "/ Q_MAX ", np.max(Q_sa), "/ Loss ", loss)
                if terminal:
                    break
            # add per-episode summaries to TensorBoard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=episode_reward,
                                      node_name="episode_reward", tag="episode_reward")
            episode_summary.value.add(simple_value=episode_length,
                                      node_name="episode_length", tag="episode_length")
            model.train_writer.add_summary(episode_summary, sess.run(tf.train.get_global_step()))
            model.train_writer.flush()
        model.train_writer.close()
        model.validation_writer.close()
    print("Episode finished!")
    print("************************")
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with the emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # log files
    a_file = open(os.path.normpath(os.path.join(os.path.dirname(
        os.path.abspath(__file__)), "logs_" + GAME + "/readout.txt")), 'w')
    h_file = open(os.path.normpath(os.path.join(os.path.dirname(
        os.path.abspath(__file__)), "logs_" + GAME + "/hidden.txt")), 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    epsilon = INITIAL_EPSILON
    t = 0
    while "flappy bird" != "angry bird":
        # choose an action epsilon-greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                action_index = np.argmax(readout_t)
                a_t[action_index] = 1
        else:
            a_t[0] = 1  # do nothing

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe the next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        # s_t1 = np.append(x_t1, s_t[:, :, 1:], axis=2)
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # only train once done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                terminal = minibatch[i][4]
                # if terminal, the target is just the reward
                if terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform a gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        # print info
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print("TIMESTEP", t, "/ STATE", state,
              "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))
        # write info to files
def train():
    game = flappy.GameState()
    # frame_step expects an action; [1, 0] is the "do nothing" one-hot,
    # following the convention of the other snippets in this collection
    game.frame_step([1, 0])
def trainNetwork(s, out, sess, istrain):
    # define the loss function
    x = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    # tf.reduce_sum sums the elements of a tensor across dimensions
    # api: reduce_sum(input_tensor, axis=None, keep_dims=False, name=None, reduction_indices=None)
    out_action = tf.reduce_sum(tf.multiply(out, x), reduction_indices=1)  # Q estimate
    # mean squared error between the target and the prediction
    loss = tf.reduce_mean(tf.square(y - out_action))  # Q target minus Q estimate
    # backpropagation: the learning rate (1e-6) decides the step size of each update
    train_step = tf.train.AdamOptimizer(1e-6).minimize(loss)  # Adam optimizer

    # initialize the game session
    game_state = game.GameState()

    # a double-ended queue holding each round's training data; observations are
    # stored in D and minibatches are drawn from it at random, breaking the
    # temporal correlation of consecutive frames and providing the randomness
    # the network needs for training
    D = deque()

    # initialize the state and preprocess the image; four consecutive frames form one input
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1  # initially the bird does not flap
    # feed the initial action to the game and get the feedback:
    # the game image x_t, the action's reward r_0, and the game-over flag terminal
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    # use cv2's resize, cvtColor, and threshold to turn the game frame into an
    # 80*80 binary image; binarization sets each pixel to 0 or 255, giving a
    # clear black-and-white image that greatly reduces the amount of data while
    # keeping the target's outline visible
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    # stack 4 consecutive frames as the network input
    # (np.stack adds a new dimension at index 2: four stacked frames)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # load saved network parameters
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    epsilon = EPSILON  # initialize the greedy-policy coefficient
    t = 0  # initialize the step counter
    while True:
        # choose an action a_t given the input s_t
        out_t = out.eval(feed_dict={s: [s_t]})[0]  # feed the current input into the network
        a_t = np.zeros([ACTIONS])  # the chosen action
        action_index = 0
        # the bird chooses an action every FRAME_PER_ACTION frames
        if t % FRAME_PER_ACTION == 0:
            # epsilon-greedy: explore with a random action with probability
            # epsilon, otherwise take the action with the largest Q value
            if random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                action_index = np.argmax(out_t)  # index of the maximum Q value
                a_t[action_index] = 1
        else:
            a_t[0] = 1  # do nothing

        # feed the chosen action to the game; get the next frame x_t1_colored,
        # the reward r_t, and the result terminal
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

        # store this step's observation (input image s_t, executed action a_t,
        # obtained reward r_t, next image s_t1, and result terminal) in queue D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        # if D is full, replace the oldest data
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # once past the observation phase, and if istrain is true, start training
        if t > OBSERVE and istrain:
            # randomly sample BATCH transitions from the replay memory
            minibatch = random.sample(D, BATCH)
            # get the BATCH variables
            s_j_batch = [d[0] for d in minibatch]   # images
            a_batch = [d[1] for d in minibatch]     # actions
            r_batch = [d[2] for d in minibatch]     # rewards
            s_j1_batch = [d[3] for d in minibatch]  # next images

            # estimated targets
            y_batch = []
            out_j1_batch = out.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                terminal = minibatch[i][4]
                # if terminal is true the game ended and the target is r_batch[i];
                # otherwise the target is r_batch[i] plus GAMMA times the maximum
                # Q value (the Q-value recursion)
                if terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(out_j1_batch[i]))

            # feed the targets y_batch, actions a_batch, and images s_j_batch
            # into train_step for one training step
            train_step.run(feed_dict={
                y: y_batch,   # estimated targets
                x: a_batch,   # actions
                s: s_j_batch
            })

        # update the state
        s_t = s_t1
        t += 1

        # save the network every 1000 steps
        if t % 1000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        # print info: step count, reward r_t, and max Q value
        print("TIMESTEP ", t, "| ACTION ", ACTION_NAME[action_index],
              " | REWARD ", r_t, " | Q_MAX %e" % np.max(out_t))