def main():
    """Train the PPO level-generator ('alg') model and checkpoint it periodically.

    If a checkpoint for `load_version` exists in `load_dir`, its parameters are
    loaded into a freshly constructed model (keeping the current env and
    hyperparameters); otherwise training starts from scratch. The model is
    saved to `save_dir` after every `timesteps_per_checkpoint` steps, for
    `num_checkpoints` rounds.
    """
    num_cpu = 1
    load_version = ''
    save_version = '1b_v0'
    load_dir = '../models'
    save_dir = '../models'
    timesteps_per_checkpoint = int(1e6)
    num_checkpoints = int(1e1)  # controlling performance level of agent

    # makedirs(exist_ok=True) replaces the original try/except around os.mkdir.
    os.makedirs(save_dir, exist_ok=True)

    alg_env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    print('created alg env')

    train_policy = 'MlpPolicy'
    load_path = '{}/alg_v{}.zip'.format(load_dir, load_version)
    # A PPO instance is needed in both branches; build it once.
    alg = PPO(train_policy, alg_env, verbose=0)
    if os.path.exists(load_path):
        # set_parameters (rather than PPO.load) keeps the current env/config
        # and only swaps in the checkpoint's weights.
        alg.set_parameters(load_path, exact_match=True)
        print('loaded alg checkpoint ' + load_path)
    else:
        print('created alg model')

    save_path = '{}/alg_v{}.zip'.format(save_dir, save_version)
    for _ in range(num_checkpoints):
        alg.learn(total_timesteps=timesteps_per_checkpoint)
        alg.save(save_path)
        print('saved alg checkpoint ' + save_path)
class AgentDemoWrapper(gym.Wrapper):
    """Gym wrapper pairing an environment with a pre-trained PPO solver agent.

    Can roll out the agent deterministically from a given initial map and
    record the episode as an animated GIF in `save_dir`.
    """

    def __init__(self, env, agent_path=None, tempdir_path=None):
        """
        Args:
            env: the Sokoban-style environment to wrap.
            agent_path: optional path to a PPO checkpoint whose parameters
                are loaded into the internal model.
            tempdir_path: directory where GIFs are written (default: 'temp').
        """
        self.alg = PPO('MlpPolicy', env, verbose=0)
        if agent_path is not None:
            self.alg.set_parameters(agent_path, exact_match=True)
        if tempdir_path is None:
            tempdir_path = 'temp'
        # Replaces the original bare `except: pass` around os.mkdir:
        # only an already-existing directory is silently accepted.
        os.makedirs(tempdir_path, exist_ok=True)
        self.save_dir = tempdir_path
        self.max_attempt = 1000
        super(AgentDemoWrapper, self).__init__(env)

    def reset(self):
        """Pass-through reset."""
        return self.env.reset()

    def step(self, action):
        """Pass-through step."""
        return self.env.step(action)

    def generate_episode_gif(self, init_map):
        """Roll out the loaded agent deterministically from `init_map` and
        save the rendered frames to '<save_dir>/agent_episode.gif'.

        Args:
            init_map: initial room state accepted by env.manual_reset.
        """
        images = []
        done = False
        obs = self.env.manual_reset(init_map)
        images.append(room_to_rgb(obs))
        while not done:
            action, _ = self.alg.predict(obs, deterministic=True)
            obs, _, done, _ = self.env.step(action)
            images.append(room_to_rgb(obs))
        im_name = '{}/agent_episode.gif'.format(self.save_dir)
        imageio.mimsave(im_name, images, 'GIF', fps=2)
# NOTE(review): this fragment references names defined elsewhere
# (room_utils, dim_room, num_gen_steps, num_boxes, version_li, load_dir,
# agent, soko_env, np, num_solved_li) — presumably it sits inside an
# evaluation loop similar to evaluate_agents(); confirm against the caller.

# Index of the single agent version that solved this level; -1 if none so far.
unique_solver_idx = -1
success = False
# Retry until the generator produces a valid room: generate_room can raise
# when it fails to construct a solvable layout.
while not success:
    success = True
    try:
        fix_room = room_utils.generate_room(dim=dim_room, num_steps=num_gen_steps, num_boxes=num_boxes, second_player=False)
        # generate_room returns a 3-tuple; the middle element is the room
        # state used to reset the env below.
        _, state, _ = fix_room
    except:  # NOTE(review): bare except also swallows KeyboardInterrupt
        success = False
for i in range(len(version_li)):
    version = version_li[i]
    load_path = '{}/agent_v{}.zip'.format(load_dir, version)
    # One PPO instance is reused; checkpoint weights are hot-swapped per version.
    agent.set_parameters(load_path, exact_match=True)
    # agent = agent_li[i]
    done = False
    # env_method / expand_dims / info[0]: soko_env appears to be a vectorized
    # env here, so obs needs a leading batch axis and info is a per-env list
    # — TODO confirm.
    obs = np.expand_dims(soko_env.env_method('manual_reset', state)[0], axis=0)
    while not done:
        action, _ = agent.predict(obs, deterministic=True)
        obs, _, done, info = soko_env.step(action)
    # solved
    if info[0]["all_boxes_on_target"]:
        num_solved_li[i] += 1
        # Flip-flop: first solver records its index, a second solver clears it.
        # NOTE(review): an odd number (>2) of solvers would leave a non -1
        # index here — verify this matches the intended "unique solver" logic.
        if unique_solver_idx == -1:
            unique_solver_idx = i
        else:
            unique_solver_idx = -1
class ALGDemoWrapper(gym.Wrapper):
    """Gym wrapper around a PPO level-generator ('alg') model.

    Can generate playable levels (final observations of successful generator
    episodes) and record generation episodes as animated GIFs.
    """

    def __init__(self, env, alg_path=None, alg_version=0, tempdir_path=None):
        """
        Args:
            env: the generator environment to wrap.
            alg_path: checkpoint path prefix; the full path loaded is
                alg_path + str(alg_version).
            alg_version: version suffix appended to alg_path and used in
                the GIF filename.
            tempdir_path: directory where GIFs are written (default: 'temp').
        """
        self.alg = PPO('MlpPolicy', env, verbose=0)
        if alg_path is not None:
            self.alg.set_parameters(alg_path + str(alg_version), exact_match=True)
        if tempdir_path is None:
            tempdir_path = 'temp'
        # Replaces the original bare `except: pass` around os.mkdir.
        os.makedirs(tempdir_path, exist_ok=True)
        self.save_dir = tempdir_path
        self.max_attempt = 1000
        self.version = alg_version
        super(ALGDemoWrapper, self).__init__(env)

    def reset(self):
        """Pass-through reset."""
        return self.env.reset()

    def step(self, action):
        """Pass-through step."""
        return self.env.step(action)

    def generate_level(self):
        """Run the generator greedily until an episode ends without failure
        (info['fail_type'] == -1) and return its final observation.

        NOTE(review): loops forever if the generator never succeeds.
        """
        while True:
            done = False
            obs = self.env.reset()
            while not done:
                action, _ = self.alg.predict(obs, deterministic=True)
                obs, _, done, info = self.env.step(action)
            if info['fail_type'] == -1:
                return obs

    def generate_episode_gif(self):
        """Record one successful generation episode (info['train_result'] == 0)
        to '<save_dir>/alg_episode_v<version>.gif'.

        Returns:
            (True, final_obs) on success; (False, None) after `max_attempt`
            failed episodes.
        """
        # Bounded for-loop replaces the original manual attempt counter;
        # the attempt budget (max_attempt episodes) is unchanged.
        for _ in range(self.max_attempt):
            images = []
            done = False
            obs = self.env.reset()
            images.append(room_to_rgb(obs))
            while not done:
                action, _ = self.alg.predict(obs, deterministic=True)
                obs, _, done, info = self.env.step(action)
                images.append(room_to_rgb(obs))
            if info['train_result'] == 0:
                im_name = '{}/alg_episode_v{}.gif'.format(
                    self.save_dir, self.version)
                imageio.mimsave(im_name, images, 'GIF', fps=2)
                return True, obs
        print('Time out. Wasn\'t able to generate good map.')
        return False, None
def evaluate_agents(version_li=('1b_0', '1b_1'), num_boxes=1, dim_room=(7, 7),
                    max_steps=20, num_tests=1000, train_mode='mlp',
                    load_dir='../demo_checkpoints', alg_path=None, alg_version=0):
    """Compare several agent checkpoints on the same set of generated levels.

    For each of `num_tests` levels (sampled either from the random room
    generator or from an ALG level-generator checkpoint), every agent version
    plays the level greedily; per-version solve counts and "uniquely solved"
    counts (exactly one version solved the level) are printed at the end.

    Args:
        version_li: agent checkpoint version suffixes to evaluate.
            (Default is a tuple — was a mutable-list default.)
        num_boxes, dim_room, max_steps, train_mode: SokobanEnv configuration.
        num_tests: number of levels to evaluate on.
        load_dir: directory holding 'agent_v<version>.zip' checkpoints.
        alg_path: optional ALG checkpoint prefix; when given, levels come
            from the generator model instead of the random generator.
        alg_version: version suffix for the ALG checkpoint.
    """
    num_gen_steps = int(1.7 * (dim_room[0] + dim_room[1]))
    soko_env = SokobanEnv(dim_room=dim_room, max_steps=max_steps,
                          num_boxes=num_boxes, train_mode=train_mode,
                          log_train_info=False)
    print('created soko env')

    train_policy = 'CnnPolicy' if train_mode == 'cnn' else 'MlpPolicy'
    # One PPO instance; each version's weights are hot-swapped in below.
    agent = PPO(train_policy, soko_env, verbose=0)

    alg_demo = None
    if alg_path is not None:
        alg_env = ALGEnv(dim_room=dim_room, num_boxes=num_boxes,
                         train_mode=train_mode, alg_version=0,
                         agent_lb_path=None, agent_ub_path=None,
                         init_probs=[0.5, 0.5, 0.5], log_interval=0)
        alg_demo = ALGDemoWrapper(alg_env, alg_path=alg_path,
                                  alg_version=alg_version)

    num_solved_li = [0] * len(version_li)
    num_unique_solved_li = [0] * len(version_li)

    for _ in tqdm(range(num_tests)):
        if alg_demo is None:
            # Random generator: retry until a valid (solvable) room comes out.
            # `except Exception` replaces the original bare except so that
            # KeyboardInterrupt still aborts the run.
            while True:
                try:
                    _, state, _ = room_utils.generate_room(
                        dim=dim_room, num_steps=num_gen_steps,
                        num_boxes=num_boxes, second_player=False)
                    break
                except Exception:
                    pass
        else:
            state = alg_demo.generate_level()

        solver_indices = []
        for i, version in enumerate(version_li):
            load_path = '{}/agent_v{}.zip'.format(load_dir, version)
            agent.set_parameters(load_path, exact_match=True)
            done = False
            obs = soko_env.manual_reset(state)
            while not done:
                action, _ = agent.predict(obs, deterministic=True)
                obs, _, done, info = soko_env.step(action)
            # solved
            if info["all_boxes_on_target"]:
                num_solved_li[i] += 1
                solver_indices.append(i)

        # Bug fix: the original flip-flop on unique_solver_idx would count a
        # level as "uniquely solved" whenever an ODD number (3, 5, ...) of
        # versions solved it. "Unique" now means exactly one solver.
        if len(solver_indices) == 1:
            num_unique_solved_li[solver_indices[0]] += 1

    for i, version in enumerate(version_li):
        print('{} solved {}, uniquely solved {}'.format(
            version, num_solved_li[i], num_unique_solved_li[i]))