def play(self):
    env = Env()
    if self.render:
        env.init_render()
    scores = 0
    steps = 0
    episodes = 10
    for e in range(episodes):
        step = 0
        done = False
        observe, _, _, _ = env.reset()
        state = preprocess(observe).reshape((1, RESIZE, RESIZE))
        state = np.float32(state / 255.)
        # Stack seq_size copies of the first frame to build the initial history.
        history = np.copy(state)
        for _ in range(self.seq_size - 1):
            history = np.append(history, state, axis=0)
            state = np.copy(state)
        history = np.reshape([history], (1, self.seq_size, RESIZE, RESIZE))
        while not done:
            # snap1 = history[0][0]
            # snap2 = history[0][1]
            # Image.fromarray(snap1 * 255.).show()
            # Image.fromarray(snap2 * 255.).show()
            step += 1
            if self.render:
                env.render()
            action, policy = self.get_action(history)
            pmax_action = np.argmax(policy)
            print(ACTION[action], policy, ACTION[pmax_action])
            # Let the user confirm the sampled action ('y') or override it.
            while True:
                key = input('Press y or action: ')
                if key in ['0', '1', '2', '3']:
                    action = int(key)
                    break
                elif key == 'y':
                    break
            # if np.random.uniform() > 0.5:
            #     action = pmax_action
            next_observe, reward, done, info = env.step(action + 1)
            next_state = preprocess(next_observe)
            next_state = np.reshape([next_state], (1, 1, RESIZE, RESIZE))
            next_state = np.float32(next_state / 255.)
            # Slide the frame window: newest frame first, oldest frame dropped.
            next_history = np.append(next_state, history[:, :(self.seq_size - 1), :, :], axis=1)
            history = next_history
        steps += step
        scores += env.game.score
        step = 0
    print('AvgScore: %s AvgStep: %s' % (scores / episodes, steps / episodes))
    return scores / episodes, steps / episodes
def play(self, episodes=10, delay=0, improve='policy', debug=False, SNAPSHOT=False):
    env = Env()
    scores = 0
    steps = 0
    print('Value\tRandom\tGreedy\tPolicy')
    for e in range(episodes):
        step = 0
        done = False
        observe, _, _, _ = env.reset()
        state = preprocess(observe).reshape((1, RESIZE, RESIZE, 1))
        state = np.float32(state / 255.)
        history = np.stack([state] * self.seq_size, axis=1)
        while not done:
            time.sleep(delay)
            step += 1
            if self.render:
                env.render()
            if SNAPSHOT:
                # Tile the stacked frames vertically and display them for inspection.
                snapshot = np.array([]).reshape([0, RESIZE])
                for snap in history[0]:
                    snapshot = np.append(snapshot, snap, axis=0)
                Image.fromarray(snapshot * 255.).show()
            action, policy = self.get_action(history)
            if improve == 'greedy':
                real_action = int(np.argmax(policy))
            elif improve == 'e-greedy':
                real_action = int(np.argmax(policy)) if np.random.uniform(low=0.0, high=1.0) > 0.1 else action
            else:
                real_action = action
            value = self.critic.predict(history)
            print(value, '\t', ACTION[action], '\t', ACTION[int(np.argmax(policy))], '\t', policy)
            if debug:
                # Let the user confirm the chosen action ('y') or override it.
                while True:
                    a = input('Press y or action(w(stay), a(left), d(right)):')
                    if a == 'y':
                        break
                    elif a == 'w':
                        real_action = 0
                        break
                    elif a == 'a':
                        real_action = 1
                        break
                    elif a == 'd':
                        real_action = 2
                        break
            next_observe, reward, done, info = env.step(real_action)
            next_state = preprocess(next_observe).reshape((1, RESIZE, RESIZE, 1))
            next_state = np.float32(next_state / 255.)
            next_history = np.append(history[0][1:], next_state, axis=0)
            next_history = np.float32([next_history])
            history = next_history
        steps += step
        scores += env.game.score
        print('Score: %s Step: %s' % (env.game.score, step))
    return scores / episodes, steps / episodes
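# The play() and run() loops in this section call a preprocess() helper that is
# not shown here. The following is only an illustrative sketch of what it
# plausibly does (grayscale conversion and resizing to RESIZE x RESIZE, with the
# callers normalizing by 255 afterwards); the RESIZE value and the actual
# implementation in the repo may differ.
import numpy as np
from PIL import Image

RESIZE = 84  # assumed frame size; the real constant is defined elsewhere


def preprocess(observe):
    # Convert the raw RGB frame to a grayscale RESIZE x RESIZE uint8 array.
    img = Image.fromarray(observe).convert('L').resize((RESIZE, RESIZE))
    return np.asarray(img, dtype=np.uint8)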
def run(self):
    global episode
    env = Env()
    while True:
        # print(self.tid, 'Still Training!')
        step = 0
        self.avg_p_max = 0
        self.reward_sum = 0
        self.actor_loss = 0
        self.critic_loss = 0
        done = False
        observe, _, _, _ = env.reset()
        state = preprocess(observe).reshape((1, RESIZE, RESIZE, 1))
        state = np.float32(state / 255.)
        history = np.stack([state] * self.seq_size, axis=1)
        while not done:
            step += 1
            self.t += 1
            if self.render:
                env.render()
            action, policy = self.get_action(history)
            real_action = action
            next_observe, reward, done, info = env.step(real_action)
            if REWARD_CLIP == 'clip':
                reward = np.clip(reward, -1.0, 1.0)
            next_state = preprocess(next_observe).reshape((1, RESIZE, RESIZE, 1))
            next_state = np.float32(next_state / 255.)
            next_history = np.append(history[0][1:], next_state, axis=0)
            next_history = np.float32([next_history])
            self.avg_p_max += np.amax(policy)
            self.reward_sum += reward
            self.append_sample(history, action, reward)
            history = next_history
            if self.t >= self.t_max or done:
                self.t = 0
                actor_loss, critic_loss = self.upload_sample(next_history, done)
                self.actor_loss += abs(actor_loss[0])
                self.critic_loss += abs(critic_loss[0])
        episode += 1
        avg_p_max = self.avg_p_max / float(step)
        train_num = step // self.t_max + 1
        avg_actor_loss = self.actor_loss / train_num
        avg_critic_loss = self.critic_loss / train_num
        stats = [
            episode, step, self.reward_sum, env.game.score,
            avg_p_max, avg_actor_loss, avg_critic_loss, info
        ]
        self.stats.append(stats)
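# get_action() is used throughout these loops but defined elsewhere. Below is a
# minimal sketch of the usual A3C-style stochastic selection, assuming the actor
# network ends in a softmax over actions; the attribute names (self.actor,
# self.action_size) are assumptions, not necessarily the repo's exact API.
import numpy as np


def get_action(self, history):
    # Sample an action from the actor's softmax policy for the current history.
    policy = self.actor.predict(history)[0]
    action_index = np.random.choice(self.action_size, p=policy)
    return action_index, policy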
def run(self):
    global episode
    env = Env()
    while True:
        # print(self.tid, 'Still Training!')
        step = 0
        avg_p_max = 0
        reward_sum = 0
        actor_loss = 0
        critic_loss = 0
        done = False
        observe, _, _, _ = env.reset()
        state = preprocess(observe)
        history = np.stack([state] * self.seq_size, axis=1)
        while not done:
            step += 1
            self.t += 1
            if self.render:
                env.render()
            action, policy = self.get_action(history)
            real_action = action
            next_observe, reward, done, info = env.step(real_action)
            if self.reward_clip:
                reward = np.clip(reward, -1.0, 1.0)
            next_state = preprocess(next_observe).reshape([1] + self.state_size)
            next_history = np.append(history[:, 1:], next_state, axis=1)
            avg_p_max += np.amax(policy)
            reward_sum += reward
            self.append_sample(history, action, reward)
            history = next_history
            if self.t >= self.t_max or done:
                self.t = 0
                # Accumulate each update's loss under separate names; reusing
                # actor_loss/critic_loss here would overwrite the running totals.
                a_loss, c_loss = self.train_model(next_history, done)
                actor_loss += a_loss
                critic_loss += c_loss
                self.update_local_model()
        episode += 1
        avg_p_max = avg_p_max / float(step)
        train_num = step // self.t_max + 1
        avg_actor_loss = actor_loss / train_num
        avg_critic_loss = critic_loss / train_num
        stats = [
            episode, step, reward_sum, env.game.score,
            avg_p_max, avg_actor_loss, avg_critic_loss, info
        ]
        with self.lock:
            self.stats.append(stats)
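# update_local_model() syncs this worker with the shared network after each
# update. The body below is a guess at the common Keras pattern (global weights
# copied into the worker's local copies); every attribute name here (self.actor,
# self.critic, self.local_actor, self.local_critic) is an assumption about the
# repo's structure, not confirmed by this section.
def update_local_model(self):
    # Copy the shared global weights into this worker's local networks.
    self.local_actor.set_weights(self.actor.get_weights())
    self.local_critic.set_weights(self.critic.get_weights())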
def run(self):
    global episode
    env = Env()
    while True:
        # print(self.tid, 'Still Training!')
        step = 0
        avg_p_max = 0
        reward_sum = 0
        actor_loss = 0
        critic_loss = 0
        done = False
        observe, _, _, _ = env.reset()
        state = preprocess(observe).reshape(self.state_shape)
        while not done:
            step += 1
            self.t += 1
            if self.render:
                env.render()
            action, policy = self.get_action(state)
            real_action = action
            next_observe, reward, done, info = env.step(real_action)
            if self.reward_clip:
                reward = np.clip(reward, -1.0, 1.0)
            next_state = preprocess(next_observe).reshape(self.state_shape)
            avg_p_max += np.amax(policy)
            reward_sum += reward
            self.append_sample(state, action, reward)
            state = next_state
            if self.t >= self.t_max or done:
                self.t = 0
                # If the episode ended by timeout, bootstrap the returns from the
                # critic's prediction for next_state instead of treating it as terminal.
                mask = False if done and info == 'timeout' else done
                # Accumulate each update's loss under separate names; reusing
                # actor_loss/critic_loss here would overwrite the running totals.
                a_loss, c_loss = self.train_model(next_state, mask)
                actor_loss += a_loss
                critic_loss += c_loss
                self.update_local_model()
        episode += 1
        avg_p_max = avg_p_max / float(step)
        train_num = step // self.t_max + 1
        avg_actor_loss = actor_loss / train_num
        avg_critic_loss = critic_loss / train_num
        stats = [
            episode, step, reward_sum, env.game.score,
            avg_p_max, avg_actor_loss, avg_critic_loss, info
        ]
        with self.lock:
            self.stats.append(stats)
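# train_model(next_state, mask) is called every t_max steps or at episode end.
# The standard A3C target it is expected to build is the n-step discounted
# return, bootstrapped from the critic's value of next_state unless the episode
# truly terminated (the mask above). The helper below is an illustrative,
# standalone sketch of that computation, not the repo's actual code.
import numpy as np


def n_step_returns(rewards, done, bootstrap_value, gamma=0.99):
    # Walk the collected rewards backwards, seeding the running return with the
    # critic's bootstrap value when the rollout was cut off mid-episode.
    running = 0.0 if done else float(bootstrap_value)
    returns = np.zeros(len(rewards), dtype=np.float32)
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        returns[i] = running
    return returns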
    return action_index, policy


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--delay', type=float, default=0.)
    parser.add_argument('--episode', type=int, default=1)
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--load_model', action='store_true')
    parser.add_argument('--seqsize', type=int, default=2)
    args = parser.parse_args()

    keymap = {'w': 0, 'a': 1, 'd': 2}
    env = Env()
    agent = Agent(state_size=env.state_size,
                  action_size=env.action_size,
                  seq_size=args.seqsize,
                  load_model=args.load_model,
                  verbose=args.verbose,
                  render=args.render)
    np.set_printoptions(precision=4, suppress=True)
    for e in range(args.episode):
        step = 0
        reward_sum = 0
        done = False
        observe, _, _, _ = env.reset()
        state = observe.reshape(agent.state_shape) / 20.
def train(self):
    global episode
    highscore = 0
    if os.path.exists('ppo_highscore.csv'):
        with open('ppo_highscore.csv', 'r') as f:
            read = csv.reader(f)
            # The second column of the last row holds the best score so far.
            highscore = float(next(reversed(list(read)))[1])
    print('Highscore: %.3f' % highscore)
    env = Env()
    self.states = np.zeros([0] + self.state_size)
    self.actions = np.zeros([0] + [self.action_size])
    self.rewards, self.old_pi, self.advantages, self.returns = [], [], [], []
    actor_loss = 0
    critic_loss = 0
    t = 0
    self.t = 0
    while True:
        done = False
        step = 0
        reward_sum = 0
        pmax = 0
        timeout = 0
        observe, _, _, _ = env.reset()
        state = observe.reshape(self.state_shape) / 20.
        while not done and timeout < self.timeout:
            if self.render:
                env.render()
            action, policy = self.get_action(state)
            if self.debug:
                print(state, policy)
                while True:
                    a = input('Press 0 1 2: ')
                    if a in ['0', '1', '2']:
                        action = int(a)
                        break
            next_observe, reward, done, info = env.step(action)
            next_state = next_observe.reshape(self.state_shape) / 20.
            self.append_sample(state, action, reward, policy[action])
            # Reset the timeout counter whenever the agent reaches a goal.
            timeout = 0 if info == 'goal' else timeout + 1
            step += 1
            t += 1
            pmax += np.amax(policy)
            reward_sum += reward
            state = next_state
        if not done:
            info = 'timeout'
        self.get_gae(next_state, done)
        self.t = t
        episode += 1
        avg_pmax = pmax / float(step)
        stats = [
            episode, step, reward_sum, env.game.score,
            avg_pmax, actor_loss, critic_loss, info
        ]
        self.stats.append(stats)
        if t >= self.horizon:
            actor_loss, critic_loss = self.train_model()
            t = 0
            self.t = 0
            # actor_loss += a_loss
            # critic_loss += c_loss
        if len(self.stats) >= self.save_rate:
            with open('ppo_output.csv', 'a', encoding='utf-8', newline='') as f:
                wr = csv.writer(f)
                for row in self.stats:
                    wr.writerow(row)
            self.save_model('./save_model/ppo')
            # Drop the trailing info column before averaging the numeric stats.
            mean = np.mean(np.float32(np.split(self.stats, [-1], axis=1)[0]), axis=0)
            if mean[3] > highscore:
                highscore = mean[3]
                with open('ppo_highscore.csv', 'a', encoding='utf-8', newline='') as f:
                    wr = csv.writer(f)
                    wr.writerow([episode, highscore, dt.now().strftime('%Y-%m-%d %H:%M:%S')])
                self.save_model('./save_model/ppo_high')
            print('%s: %s Episodes Trained! Reward:%.3f Score:%.3f Step:%.3f Pmax:%.3f'
                  % (dt.now().strftime('%Y-%m-%d %H:%M:%S'),
                     len(self.stats), mean[2], mean[3], mean[1], mean[4]))
            self.stats.clear()
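# get_gae(next_state, done) computes the advantages and returns consumed later by
# train_model(). A self-contained sketch of generalized advantage estimation
# (Schulman et al., 2016) is given below for reference; the function name,
# signature, and default gamma/lambd mirror the command-line defaults but are
# otherwise assumptions about the repo's implementation.
import numpy as np


def compute_gae(rewards, values, next_value, done, gamma=0.99, lambd=0.95):
    # values: critic predictions V(s_0..s_{T-1}); next_value: V(s_T), used to
    # bootstrap when the rollout was cut off rather than terminated.
    values = np.append(np.asarray(values, dtype=np.float32),
                       0.0 if done else float(next_value))
    advantages = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        running = delta + gamma * lambd * running
        advantages[t] = running
    returns = advantages + values[:-1]
    return advantages, returns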
parser.add_argument('--gamma', type=float, default=0.99, help='Discount factor')
parser.add_argument('--lambd', type=float, default=0.95, help='TD(lambda). The larger lambda is, the more weight is placed on future rewards')
parser.add_argument('--batch_size', type=int, default=16, help='Mini-batch size')
parser.add_argument('--horizon', type=int, default=256, help='Time horizon')
parser.add_argument('--seqsize', type=int, default=1, help='Length of sequence')
parser.add_argument('--epoch', type=int, default=3, help='Update epochs')
parser.add_argument('--clip', type=float, default=0.2, help='Clip ratio')
parser.add_argument('--timeout', type=int, default=400, help='Cut the episode off (timeout) after this many steps without progress')
parser.add_argument('--reward_clip', action='store_true', help='Clip rewards to [-1, 1]')
parser.add_argument('--render', action='store_true', help='Render the first agent')
parser.add_argument('--load_model', action='store_true', help='Load model from ./save_model/')
parser.add_argument('--verbose', action='store_true', help='Print a summary of the global network model')
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()

env = Env()
global_agent = PPOAgent(
    state_size=env.state_size,
    action_size=env.action_size,
    seq_size=args.seqsize,
    gamma=args.gamma,
    lambd=args.lambd,
    entropy=args.entropy,
    horizon=args.horizon,
    actor_lr=args.lr,
    critic_lr=args.lr,
    batch_size=args.batch_size,
    epoch=args.epoch,
    clip=args.clip,
    thread_num=args.threads,
    load_model=args.load_model,