import gym
import numpy as np
import tensorflow as tf

# REINFORCE is the policy-gradient agent class defined elsewhere in this repo.


def main(args):
    def preprocess(obs):
        """Preprocess a 210x160x3 uint8 frame into a 6400 (80x80) float vector."""
        obs = obs[35:195]       # crop to the 160x160 play area
        obs = obs[::2, ::2, 0]  # downsample by factor of 2 -> 80x80, keep one channel
        obs[obs == 144] = 0     # erase background (type 1)
        obs[obs == 109] = 0     # erase background (type 2)
        obs[obs != 0] = 1       # paddles and ball set to 1
        return obs.astype(np.float64).ravel()

    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6

    # load agent
    agent = REINFORCE(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # restore a saved model, or initialize a new one
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
    else:
        agent.init_var()

    # load env
    env = gym.make('Pong-v0')

    # evaluation loop
    for ep in range(args.ep):
        total_rewards = 0
        state = env.reset()
        while True:
            env.render()
            state = preprocess(state)
            # sample an action from the policy
            action = agent.sample_action(state[np.newaxis, :])
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            state = next_state
            if done:
                break
        print('Ep%s Reward: %s' % (ep + 1, total_rewards))
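# The evaluation main() above reads args.gpu, args.model_path and args.ep. A
# minimal command-line entry point consistent with those attributes might look
# like this; the flag names, defaults and help strings are assumptions for
# illustration, not code from this repo.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Evaluate a REINFORCE agent on Pong')
    parser.add_argument('--gpu', type=int, default=-1, help='GPU index, -1 for CPU')
    parser.add_argument('--model_path', type=str, default=None, help='saved model to restore')
    parser.add_argument('--ep', type=int, default=10, help='number of evaluation episodes')
    main(parser.parse_args())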
import os

import gym
import numpy as np
import tensorflow as tf


def main(args):
    MODEL_PATH = args.model_path
    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6
    MAX_EPISODES = 20000
    MAX_STEPS = 5000

    # load agent
    agent = REINFORCE(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # model saver; resume episode count and running reward from the checkpoint name
    saver = tf.train.Saver(max_to_keep=1)
    if MODEL_PATH is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = None

    # load env
    env = gym.make('Pong-v0')

    # main training loop
    for ep in range(MAX_EPISODES):
        total_rewards = 0
        state = env.reset()
        for step in range(MAX_STEPS):
            # preprocess the raw frame (same 80x80 helper as in the evaluation script)
            state = preprocess(state)
            # sample an action from the current policy
            action = agent.sample_action(state[np.newaxis, :])
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            agent.store_rollout(state, action, reward)
            state = next_state
            if done:
                break

        # update the policy once per episode
        agent.update_model()

        # logging: exponential moving average of episode reward
        if mean_rewards is None:
            mean_rewards = total_rewards
        else:
            mean_rewards = 0.99 * mean_rewards + 0.01 * total_rewards
        # a Pong game ends at 21 points: winner's 21 plus the loser's score
        rounds = (21 - np.abs(total_rewards)) + 21
        average_steps = (step + 1) / rounds
        print('Ep%s: %d rounds\nAvg_steps: %.2f Reward: %s Avg_reward: %.4f' %
              (ep + 1, rounds, average_steps, total_rewards, mean_rewards))

        # periodically save a checkpoint named <mean_reward>_<episode>
        if ep % 100 == 0:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = args.save_path + str(round(mean_rewards, 2)) + '_' \
                + str(ep_base + ep + 1)
            saver.save(agent.sess, save_name)
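# The REINFORCE agent's store_rollout()/update_model() pair is not shown in
# this file. As a rough illustration of what the update typically computes
# (an assumed sketch in the style of Karpathy's pg-pong, not this repo's
# actual implementation): accumulate per-step rewards, turn them into
# discounted returns, and normalize them before weighting the policy
# gradient. The function name and gamma default below are assumptions.

import numpy as np


def discount_and_normalize(rewards, gamma=0.99):
    """Compute normalized discounted returns for one Pong episode.

    In Pong a nonzero reward (+1/-1) marks the end of a point, so the running
    return is reset at those boundaries.
    """
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        if rewards[t] != 0:
            running = 0.0  # reset at a point boundary
        running = running * gamma + rewards[t]
        returns[t] = running
    # normalization reduces the variance of the gradient estimate
    returns -= returns.mean()
    returns /= returns.std() + 1e-8
    return returns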
import math
import os

import torch
import tqdm

# SimMTEnvironment, REINFORCE, verify_dir, log_str and format_seconds are
# provided elsewhere in this repo.


class Trainer:

    def __init__(self, args):
        # log directly to file, not to stdout
        self.logfile = os.path.join(args.rl_search_save_dir, "log")
        print("Log to %s" % self.logfile)
        self.log_buffer = []

        # assert args.rl_search_save_interval % args.rl_search_reward_interval == 0
        assert args.rl_search_learn_interval % args.rl_search_reward_interval == 0
        assert args.rl_search_learn_interval % args.rl_search_save_interval == 0

        # build env and agent
        self.env = SimMTEnvironment.build_env(args)
        self.agent = REINFORCE(self.env, args).to(args.rl_search_device)
        self.args = args
        self.step = 0
        self.best_student_metric = math.inf
        self.env.reset()
        self.agent.reset()

        # resume from the last checkpoint if one exists
        ckpt_last = os.path.join(args.rl_search_save_dir, "checkpoint_last.txt")
        if os.path.exists(ckpt_last):
            self.step = self.read_ckpt_info(ckpt_last)
            self.load(self.step)

    def print(self, *args):
        # buffer log lines instead of writing them immediately
        self.log_buffer.append(args)

    def clear_print_buffer(self):
        # flush buffered log lines to the logfile
        with open(self.logfile, "a") as f:
            for args in self.log_buffer:
                print(*args, file=f)
        self.log_buffer = []

    def checkpoint_name(self, step):
        return (os.path.join(self.args.rl_search_save_dir, "teacher",
                             "checkpoint{:d}.pt".format(step)),
                os.path.join(self.args.rl_search_save_dir, "student",
                             "checkpoint{:d}.pt".format(step)),
                os.path.join(self.args.rl_search_save_dir, "trainer",
                             "checkpoint{:d}.pt".format(step)))

    @staticmethod
    def write_ckpt_info(path, step):
        d = os.path.dirname(path)
        if not os.path.exists(d):
            os.makedirs(d)
        with open(path, "w") as f:
            f.write("{:d}\n".format(step))

    @staticmethod
    def read_ckpt_info(path):
        with open(path) as f:
            s = f.read().strip()
        return int(s)

    def save(self, student_metric):
        self.clear_print_buffer()
        teacher, student, trainer = self.checkpoint_name(self.step)
        self.agent.save(teacher)
        self.env.save(student)
        verify_dir(trainer)
        torch.save(dict(best_student_metric=self.best_student_metric), trainer)
        self.write_ckpt_info(
            os.path.join(self.args.rl_search_save_dir, "checkpoint_last.txt"),
            self.step)
        if student_metric < self.best_student_metric:
            print("= New best student! step %d, %r -> %r" %
                  (self.step, self.best_student_metric, student_metric))
            self.best_student_metric = student_metric
            self.write_ckpt_info(
                os.path.join(self.args.rl_search_save_dir, "best.txt"),
                self.step)

    def load(self, step):
        print("| Load agent and model at step %d" % step)
        teacher, student, trainer = self.checkpoint_name(step)
        self.agent.load(teacher)
        self.env.load(student)
        ckpt = torch.load(trainer)
        self.best_student_metric = ckpt["best_student_metric"]

    def train(self):
        env = self.env
        agent = self.agent
        args = self.args
        t = tqdm.tqdm()
        while not env.done():
            self.step += 1
            state = env.state()
            action = agent.get_action(state)
            _, log = env.step(action)
            self.print("> Environment step logging:\tstep %d\taction: %r\t%s" %
                       (self.step, env.wrap_action(action), log_str(log)))
            if self.step % args.rl_search_reward_interval == 0:
                reward, student_metric, log = env.reward()
                self.print(
                    "> Environment reward logging:\tstep %d\treward: %.4f, student metric: %.4f\t%s" %
                    (self.step, reward, student_metric, log_str(log)))
                agent.update_reward(reward)
                if self.step % args.rl_search_learn_interval == 0:
                    agent.learn()
                    agent.reset()
                    # learn steps are always save steps (see asserts in __init__)
                    assert self.step % args.rl_search_save_interval == 0
                    self.save(student_metric)
            elif self.step % args.rl_search_save_interval == 0:
                # save step that is not a reward step: validate to get the metric
                student_metric, log = env.validate()
                self.print(
                    "> Environment validate logging:\tstep %d\tstudent metric: %.4f\t%s" %
                    (self.step, student_metric, log_str(log)))
                self.save(student_metric)
            t.update(1)
        self.clear_print_buffer()
        print("Avg time: %s" % format_seconds(t.avg_time))
        print("Total: %d steps" % t.n)
        print("Total time: %s" % format_seconds(t.avg_time * t.n))
        t.close()
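# A minimal driver for the Trainer above might look like the following. The
# flag names mirror the rl_search_* attributes the class reads; the argparse
# wiring and the interval defaults are an assumed sketch, not code from this
# repo.

import argparse


def parse_rl_search_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--rl-search-save-dir", required=True)
    parser.add_argument("--rl-search-device", default="cpu")
    parser.add_argument("--rl-search-reward-interval", type=int, default=10)
    parser.add_argument("--rl-search-learn-interval", type=int, default=100)
    parser.add_argument("--rl-search-save-interval", type=int, default=50)
    return parser.parse_args()


if __name__ == "__main__":
    # the interval defaults satisfy the divisibility asserts in Trainer.__init__
    trainer = Trainer(parse_rl_search_args())
    trainer.train()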
# Fragment of the policy-dispatch script; env, envs, kwargs, file_name and
# max_episode_timesteps are set up earlier in the file. The leading
# `if args.policy == "A2C":` guard is restored here, as the elif chain and the
# "_A2C.json" result name imply it.
if args.policy == "A2C":
    policy = A2C.A2C(env.observation_space, env.action_space,
                     args.discount, args.tau, max_episode_timesteps)
    x, y = policy.run(envs, file_name, args)
    write_result(args.env + "_A2C.json", x, y)
elif args.policy == "DDPG":
    policy = DDPG.DDPG(**kwargs)
    x, y = policy.run(env, file_name, args)
    write_result(args.env + "_DDPG.json", x, y)
elif args.policy == "REINFORCE":
    args.n_steps = 5
    args.n_processes = 16
    envs = ParaEnv(args.env, args.n_processes, args.seed)
    policy = REINFORCE.REINFORCE(env.observation_space, env.action_space,
                                 args.discount, args.tau, args.n_steps,
                                 args.n_processes, max_episode_timesteps)
    x, y = policy.run(envs, file_name, args)
    write_result(args.env + "_REINFORCE.json", x, y)
else:
    x, y = None, None

print(x)
print(y)

# plot the learning curve
plt.figure()
plt.xlabel('Timesteps')
plt.ylabel('Reward')
plt.plot(x, y)
plt.show()
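# write_result is referenced but not defined in this fragment. A plausible
# minimal implementation (an assumed sketch, not the repo's code) simply dumps
# the timestep and reward series to JSON so the curve can be re-plotted later:

import json


def write_result(path, x, y):
    """Save timesteps (x) and episode rewards (y) to a JSON file."""
    with open(path, "w") as f:
        json.dump({"timesteps": list(x), "rewards": list(y)}, f)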