import numpy as np

# ReplayBuffer, Net, and linear_interp are project helpers defined elsewhere.


class Agent:
    def __init__(self, env, config, wt):
        self.C = config
        self.n_state = list(env.observation_space.shape)
        self.n_action = env.action_space.n
        self.epsilon = 0.99
        self.lr = 1e-3
        self.wt = wt
        self.buffer = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.buffer2 = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.net = Net(self.n_state, self.n_action, self.C, self.wt)

    # Random action during practice
    def act_pre(self):
        return np.random.randint(self.n_action)

    # Epsilon-greedy action selection
    def act(self, s):
        if np.random.random() > self.epsilon:
            return self.greedy_act(s)
        return np.random.randint(self.n_action)

    def greedy_act(self, s):
        return self.net.action(s)

    # Practice without recording experiences
    def practice(self):
        self.lr = 1e-3
        self.net.pre_train(self.buffer, self.lr)

    # Records experiences and calls the training functions.
    # `pre` differentiates practice (pre-training) from RL training.
    def record(self, s, a, r, d, it, pre):
        if pre:
            self.buffer.append(s, a, r, d)
            if it > self.C['pre_training_start']:
                if it % self.C['pre_train_freq'] == 0:
                    self.lr = 1e-3
                    self.net.pre_train(self.buffer, self.lr)
        else:
            self.buffer.append(s, a, r, d)
            # Anneal epsilon from 1.0 to 0.1 over the first 5e5 steps,
            # then toward 0.01 with a floor of 0.01.
            if it <= 5e5:
                self.epsilon = linear_interp(0, 5e5, it, 0.1, 1.0)
            else:
                self.epsilon = max(linear_interp(5e5, 10e6, it, 0.01, 0.1), 0.01)
            if it > self.C['training_start']:
                if it % self.C['train_freq'] == 0:
                    self.lr = 1e-4  # Learning rate for RL training
                    self.net.train(self.buffer, self.lr)
                if it % self.C['update_target_freq'] == 0:
                    self.net.update_target_network()
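# Note: `linear_interp` above is not shown in this snippet. A minimal sketch
# consistent with how it is called (assumed signature: evaluate at x the line
# passing through (x0, y0) and (x1, y1), with y0 as the last argument) is:
def linear_interp(x0, x1, x, y1, y0):
    """Linearly interpolate between y0 (at x=x0) and y1 (at x=x1)."""
    frac = (x - x0) / (x1 - x0)
    return y0 + (y1 - y0) * frac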
# Build the replay tuple: the successor state and the actions available from it.
n_state = getState(game)
n_actions = []
acts = []
for el in processActions(game, state):
    n_actions.append(el[2])
    acts.append(el[0])

if action == 'E':
    game.playerUpdate()

# Reward: the score of the targeted vertex for 'S'/'C' moves, a fixed value otherwise.
if action == 'S' or action == 'C':
    r = game.board.vertices[spec[0]][spec[1]].score
else:
    r = rewards[action]

rBuffer.append((state, r + win * winnings, n_actions, acts))
rBuffer.pop()

# Once enough experience has accumulated, fit the principal network on a
# sampled batch of bootstrapped targets and soft-update the target network.
if count > wait:
    states = []
    targets = []
    for s_0, r, s_1, a in rBuffer.sample(batch_size):
        states.append(s_0)
        q_t = np.max(Qtarget.evaluate(s_1))
        targets.append(r + gamma * q_t)
    loss += Qprincipal.train(states, targets).history['loss'][0]
    softUpdate(Qprincipal, Qtarget, alpha)
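# Note: `softUpdate` is not defined in this snippet. Assuming Qprincipal and
# Qtarget wrap Keras models (suggested by the .train(...).history['loss'] usage)
# exposed through a hypothetical `.model` attribute, a Polyak soft update could
# be sketched as:
def softUpdate(principal, target, alpha):
    """Move target weights toward principal weights:
    theta_target <- alpha * theta_principal + (1 - alpha) * theta_target."""
    new_weights = [alpha * pw + (1.0 - alpha) * tw
                   for pw, tw in zip(principal.model.get_weights(),
                                     target.model.get_weights())]
    target.model.set_weights(new_weights)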
def train(self):
    """Train."""
    logs_path = self.args.logs_path
    video_path = self.args.video_path
    restore = self.args.restore
    train = self.args.train

    # Initialize the PLE environment (headless rendering)
    os.putenv('SDL_VIDEODRIVER', 'fbcon')
    os.environ["SDL_VIDEODRIVER"] = "dummy"

    # Design the reward
    reward_values = {
        "positive": 1,
        "tick": 0.1,
        "loss": -1,
    }

    # Create the FlappyBird game env
    env = PLE(FlappyBird(), display_screen=False, reward_values=reward_values)

    # Get the actions FlappyBird supports
    action_set = env.getActionSet()

    replay_buffer = ReplayBuffer(self.hparams.replay_buffer_size)
    agent = Agent(action_set, self.hparams)

    # Restore a saved model if requested
    if restore:
        agent.restore(restore)

    reward_logs = []
    loss_logs = []

    for episode in range(1, self.hparams.total_episode + 1):
        # Reset the env and build the initial 4-frame state stack
        env.reset_game()
        env.act(0)
        obs = convert(env.getScreenGrayscale())
        state = np.stack([[obs for _ in range(4)]], axis=0)
        t_alive = 0
        total_reward = 0

        if episode % self.hparams.save_video_frequency == 0 and \
                episode > self.hparams.initial_observe_episode:
            agent.stop_epsilon()
            frames = [env.getScreenRGB()]

        while not env.game_over():
            action = agent.take_action(state)
            reward = env.act(action_set[action])
            if episode % self.hparams.save_video_frequency == 0 and \
                    episode > self.hparams.initial_observe_episode:
                frames.append(env.getScreenRGB())
            obs = convert(env.getScreenGrayscale())
            obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]])
            state_new = np.append(state[:, 1:, ...], obs, axis=1)
            action_onehot = np.zeros(len(action_set))
            action_onehot[action] = 1
            t_alive += 1
            total_reward += reward
            replay_buffer.append(
                (state, action_onehot, reward, state_new, env.game_over()))
            state = state_new

        # Save video
        if episode % self.hparams.save_video_frequency == 0 and \
                episode > self.hparams.initial_observe_episode:
            os.makedirs(video_path, exist_ok=True)
            clip = make_video(frames, fps=60).rotate(-90)
            clip.write_videofile(
                os.path.join(video_path, 'env_{}.mp4'.format(episode)), fps=60)
            agent.restore_epsilon()
            print('Episode: {} t: {} Reward: {:.3f}'.format(
                episode, t_alive, total_reward))

            # danger: display the most recent saved video inline (notebook only)
            mp4list = glob.glob('./video_XXX/*.mp4')
            if len(mp4list) > 0:
                latest = mp4list[0]
                latest_timestamp = os.path.getmtime(mp4list[0])
                for mp4 in mp4list:
                    ts = os.path.getmtime(mp4)
                    if ts > latest_timestamp:
                        latest_timestamp = ts
                        latest = mp4
                video = io.open(latest, 'r+b').read()
                encoded = base64.b64encode(video)
                ipythondisplay.display(HTML(data='''<video alt="test" autoplay loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                    </video>'''.format(encoded.decode('ascii'))))
            else:
                print("Could not find video")
            # end danger

        if episode > self.hparams.initial_observe_episode and train:
            # Save the model and logs
            if episode % self.hparams.save_logs_frequency == 0:
                agent.save(episode, logs_path)
                np.save(os.path.join(logs_path, 'loss.npy'), np.array(loss_logs))
                np.save(os.path.join(logs_path, 'reward.npy'), np.array(reward_logs))

            # Update the target network
            if episode % self.hparams.update_target_frequency == 0:
                agent.update_target_network()

            # Sample a batch from the replay buffer
            batch_state, batch_action, batch_reward, batch_state_new, batch_over = \
                replay_buffer.sample(self.hparams.batch_size)

            # Update the policy network
            loss = agent.update_Q_network(batch_state, batch_action, batch_reward,
                                          batch_state_new, batch_over)

            loss_logs.extend([[episode, loss]])
            reward_logs.extend([[episode, total_reward]])

            # Print reward and loss
            if episode % self.hparams.show_loss_frequency == 0:
                print('Episode: {} t: {} Reward: {:.3f} Loss: {:.3f}'.format(
                    episode, t_alive, total_reward, loss))

            agent.update_epsilon()
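# Note: the ReplayBuffer used by this trainer is not shown. A minimal sketch
# consistent with how it is used here (append a (state, action_onehot, reward,
# state_new, done) tuple; sample a uniform batch and return the five fields as
# stacked arrays) could look like this:
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Fixed-capacity experience buffer (assumed interface)."""

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def append(self, transition):
        # transition = (state, action_onehot, reward, state_new, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, states_new, dones = zip(*batch)
        return (np.concatenate(states), np.array(actions), np.array(rewards),
                np.concatenate(states_new), np.array(dones))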