def test_basics():
    env = TimeLimit(gym.make("CartPole-v0"), max_episode_steps=10)
    env = EnvDataset(env)
    env = EpisodeLimit(env, max_episodes=3)
    env.seed(123)

    for episode in range(3):
        obs = env.reset()
        done = False
        step = 0
        while not done:
            print(f"step {step}")
            obs, reward, done, info = env.step(env.action_space.sample())
            step += 1

    assert env.is_closed()

    with pytest.raises(gym.error.ClosedEnvironmentError):
        _ = env.reset()

    with pytest.raises(gym.error.ClosedEnvironmentError):
        _ = env.step(env.action_space.sample())

    with pytest.raises(gym.error.ClosedEnvironmentError):
        for _ in env:
            break
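# The test above relies on gym's TimeLimit contract: after max_episode_steps
# the wrapper forces done=True and records the cut-off in the info dict.
# A minimal self-contained sketch of that contract, assuming the pre-0.26
# gym API (4-tuple step, env.seed) used throughout these examples:
import gym
from gym.wrappers import TimeLimit

env = TimeLimit(gym.make("CartPole-v0").unwrapped, max_episode_steps=10)
env.seed(123)
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
# True when the wrapper, rather than the pole falling, ended the episode.
print(info.get("TimeLimit.truncated", False))
env.close()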
def func():
    env = gym.make(gym_id)
    env = TimeLimit(env, max_episode_steps=args.max_episode_len)
    env = EnvironmentWrapper(env.env, normOb=normOb,
                             rewardNormalization=rewardNormalization,
                             clipOb=clipOb, clipRew=clipRew, **kwargs)
    env.seed(args.seed)
    env.action_space.seed(args.seed)
    env.observation_space.seed(args.seed)
    return env
def _thunk():
    env = make_benchmarking_env(env_id)
    env = TimeLimit(env, max_episode_steps)
    env.seed(seed + rank)
    log_dir_ = os.path.join(log_dir, str(rank)) if log_dir is not None else log_dir
    env = Monitor(env, log_dir_, allow_early_resets=allow_early_resets)
    return env
def test_max_and_skip_env(self):
    # runnable test
    skip = 4
    env = gym.make(TEST_ENV_ID)
    env = TimeLimit(env, 20)
    env = atari.MaxAndSkipEnv(env, skip=skip)
    env.seed(1)
    ub_utils.set_seed(1)
    env.reset()
    for i in range(20):
        obs, rew, done, info = env.step(env.action_space.sample())
        if done:
            break
    # Each outer step advances the inner env `skip` times, so the 20-step
    # TimeLimit fires on the 5th outer step, i.e. at i == 4.
    self.assertEqual(4, i)
def process_env(args, bot):
    parsing_metric = DictList()
    for episode_id in tqdm.tqdm(range(args.episodes)):
        env = TimeLimit(gym.make(random.choice(args.envs)), 100)
        if args.seed is not None:
            env.seed(args.seed + episode_id)
        demo_bot = demo.DemoBot(env=env)
        while True:
            try:
                ret, _demo_traj, viz = demo.generate_one_traj(
                    demo_bot, env, render_mode='ansi')
                if ret < len(env.sketchs):
                    continue
                demo_traj = DictList(_demo_traj)
                demo_traj.done = [False] * (len(demo_traj) - 1) + [True]
                demo_traj.action = [a.value for a in _demo_traj['action']]
                demo_traj.env_id = [env.env_id] * len(demo_traj)
                demo_traj.apply(lambda _t: torch.tensor(_t).unsqueeze(0)
                                if not isinstance(_t[0], str) else _t)
                break
            except demo.PlanningError:
                pass
        with torch.no_grad():
            traj = teacher_force(bot, demo_traj)
        traj.viz = viz
        ps = traj.p
        ps[0, :-1] = 0
        ps[0, -1] = 1

        # Compute F1
        use_ids = (traj.action.reshape(-1)[1:-1] == Actions.USE.value
                   ).nonzero().view(-1).cpu().numpy()
        target = use_ids.tolist()
        p_vals = torch.arange(bot.nb_slots + 1)
        avg_p = (p_vals * ps[1:-1]).sum(-1)
        for k in [2, 3, 4, 5, 6]:
            _, inds = (-avg_p).topk(k)
            preds = inds.tolist()
            for tol in [1]:
                result = f1(target, preds, tol, with_detail=True)
                for name in result:
                    parsing_metric.append(
                        {'{}_tol{}_k{}'.format(name, tol, k): result[name]})
    parsing_metric.apply(lambda _t: np.mean(_t))
    return parsing_metric
def _thunk():
    env = gym.make(id)
    if not timelimit:
        env = env.env
    elif timelimit_maxsteps:
        env = TimeLimit(env.env, timelimit_maxsteps)
    assert 'NoFrameskip' in env.spec.id
    if noop:
        env = atari_wrappers.NoopResetEnv(env, noop_max=30)
    env = atari_wrappers.MaxAndSkipEnv(env, skip=frameskip)
    env = StepOnEndOfLifeEnv(env)
    env = EpisodeInfo(env)
    env.seed(seed + rank)
    env = atari_wrappers.wrap_deepmind(
        env, episode_life=episode_life, clip_rewards=clip_rewards,
        frame_stack=False, scale=scale)
    env = ImageTranspose(env)
    return env
class BaseTestRotMAB:
    """Base test class for RotMAB environment."""

    def __init__(self, winning_probs, max_steps):
        """Initialize test class."""
        self.winning_probs = winning_probs
        self.max_steps = max_steps
        self.env = TimeLimit(
            NonMarkovianRotatingMAB(winning_probs=self.winning_probs),
            max_episode_steps=self.max_steps,
        )

    def test_action_space(self):
        """Test action spaces."""
        assert self.env.action_space == Discrete(len(self.winning_probs))

    def test_observation_space(self):
        """Test observation spaces."""
        assert self.env.observation_space == Discrete(2)

    def test_interaction(self):
        """Test interaction with Rotating MAB."""
        self.env.seed()
        state = self.env.reset()
        assert state == 0

        def assert_consistency(obs, reward):
            """Assert obs == 1 iff reward > 0."""
            positive_reward = reward > 0.0
            positive_obs = obs == 1
            assert (positive_reward and positive_obs
                    or (not positive_reward and not positive_obs))

        for _i in range(self.max_steps - 1):
            action = self.env.action_space.sample()
            obs, reward, done, info = self.env.step(action)
            assert_consistency(obs, reward)
            assert not done

        # last action
        obs, reward, done, info = self.env.step(0)
        assert_consistency(obs, reward)
        assert done
# Guard inferred from the else branch below (the snippet begins mid-conditional):
# if the env is already wrapped in a TimeLimit, just adjust its horizon.
if isinstance(env, TimeLimit):
    env._max_episode_steps = int(args.episode_length)
    args.episode_length = env._max_episode_steps
else:
    env = TimeLimit(env, int(args.episode_length))
env = NormalizedEnv(env.env, ob=args.norm_obs, ret=args.norm_returns,
                    clipob=args.obs_clip, cliprew=args.rew_clip,
                    gamma=args.gamma)
env = TimeLimit(env, int(args.episode_length))
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
env.seed(args.seed)
env.action_space.seed(args.seed)
env.observation_space.seed(args.seed)
if args.capture_video:
    env = Monitor(env, f'videos/{experiment_name}')

# ALGO LOGIC: initialize agent here:
class CategoricalMasked(Categorical):
    def __init__(self, probs=None, logits=None, validate_args=None, masks=[]):
        self.masks = masks
        if len(self.masks) == 0:
            super(CategoricalMasked, self).__init__(probs, logits, validate_args)
        else:
            self.masks = masks.type(torch.BoolTensor).to(device)
            # Assumed completion of the truncated snippet: invalid actions
            # get a large negative logit so they are never sampled.
            logits = torch.where(self.masks, logits,
                                 torch.tensor(-1e8).to(device))
            super(CategoricalMasked, self).__init__(probs, logits, validate_args)
def process_env(args, bot, env_name):
    succs = []
    rets = []
    lines = []
    parsing_metric = DictList()
    model_name = os.path.dirname(os.path.abspath(
        args.model_ckpt)).split('/')[-1] + '_{}'.format(env_name)
    if args.use_demo:
        model_name = model_name + '_demo'
    render_mode = 'rgb' if args.mp4 else 'ansi'
    for episode_id in tqdm.tqdm(range(args.episodes)):
        env = TimeLimit(
            gym.make(env_name, height=args.height, width=args.width),
            args.max_steps)
        if args.seed is not None:
            env.seed(args.seed + episode_id)
        if args.use_demo:
            demo_bot = demo.DemoBot(env=env)
            while True:
                try:
                    ret, _demo_traj, viz = demo.generate_one_traj(
                        demo_bot, env, render_mode=render_mode)
                    if ret < len(env.sketchs):
                        continue
                    demo_traj = DictList(_demo_traj)
                    demo_traj.done = [False] * (len(demo_traj) - 1) + [True]
                    demo_traj.action = [a.value for a in _demo_traj['action']]
                    demo_traj.env_id = [env.env_id] * len(demo_traj)
                    demo_traj.apply(lambda _t: torch.tensor(_t).unsqueeze(0)
                                    if not isinstance(_t[0], str) else _t)
                    break
                except demo.PlanningError:
                    pass
            with torch.no_grad():
                traj = teacher_force(bot, demo_traj)
            traj.viz = viz
            success = True
        else:
            with torch.no_grad():
                traj, success, ret = generate(bot, env)
            ret = ret.item()
        succs.append(float(success))
        rets.append(ret)
        lines.append('########## [{}]EPISODE {} ##############'.format(
            env_name, episode_id))
        lines.append('return: {} success: {}'.format(ret, success))
        has_p = 'p' in traj
        states = traj.viz
        rewards = traj.reward
        actions = traj.action
        if has_p:
            ps = traj.p
            ps[0, :-1] = 0
            ps[0, -1] = 1
            lines.append('P: ')
            lines.append(idxpos2tree(actions, ps))
            if 'p_out' in traj:
                lines.append('P_out:')
                lines.append(idxpos2tree(actions, traj.p_out))
            slots = torch.arange(len(ps[0]))
            p_avg = len(slots) - (slots[None, :] * ps).sum(-1)
            p_avg_str = plot(p_avg, actions=actions)
            lines.append('p_avg:')
            lines.append(p_avg_str)
            lines.append('Tree')
            depths = p_avg[:-1]
            depths = (depths - depths.min()) / (depths.max() - depths.min())
            depths = np.digitize(depths.numpy(), bins=np.linspace(0, 1, 5))
            parse_tree = distance2ctree(
                depths, [ACTION_VOCAB[a.item()] for a in actions], False)
            tree_line = tree_to_str(parse_tree)
            lines.append(tree_line[1:-1])

            # Compute F1
            if args.use_demo:
                use_ids = (traj.action.reshape(-1)[1:-1] == Actions.USE.value
                           ).nonzero().view(-1).cpu().numpy()
                target = use_ids.tolist()
                p_vals = torch.arange(bot.nb_slots + 1)
                avg_p = (p_vals * ps[1:-1]).sum(-1)
                for k in [3, 4, 5, 6]:
                    _, inds = (-avg_p).topk(k)
                    preds = inds.tolist()
                    for tol in [0]:
                        result = f1(target, preds, tol, with_detail=True)
                        for name in result:
                            parsing_metric.append({
                                '{}_tol{}_k{}'.format(name, tol, k):
                                result[name]
                            })

            # Generate Plots
            if args.plot_p:
                p_avg_fig = get_p_plot(actions, ps)
                p_avg_fig.savefig(os.path.join(
                    args.outdir, model_name + '_{}.png'.format(episode_id)),
                    bbox_inches='tight')
                plt.close(p_avg_fig)

        # Save episode details
        if not args.mp4:
            episode_lines = []
            for t in range(len(rewards)):
                episode_lines.append('################################')
                episode_lines.append('Sketch: {}'.format(env.sketchs))
                info_line = "steps: {}\taction: {}\treward: {}".format(
                    t, ACTION_VOCAB[actions[t].item()], rewards[t])
                if has_p:
                    info_line += '\tp: {}'.format(
                        np.array2string(
                            ps[t].clamp(min=1e-8).numpy(),
                            formatter={'float_kind': lambda x: visual(x, 1)}))
                episode_lines.append(info_line)
                episode_lines.append(states[t])
            episode_res_name = os.path.join(
                args.outdir, model_name + '_{}.txt'.format(episode_id))
            with open(episode_res_name, 'w') as f:
                f.write('\n'.join(episode_lines))

        # Save MP4
        else:
            returns = rewards.cumsum(0)
            if not has_p:
                frames = states
            else:
                frames = []
                sketch_id = 0
                prev_ret = 0
                for time_step, state_frame in enumerate(states):
                    curr_ret = returns[time_step - 1].item() if time_step - 1 > 0 else 0
                    curr_rwd = rewards[time_step - 1].item() if time_step - 1 > 0 else 0
                    info_line = "steps: {}, reward: {}, ret: {} \n subtask: {}".format(
                        time_step, curr_rwd, curr_ret, env.sketchs[sketch_id])
                    special = curr_ret > prev_ret
                    if special:
                        sketch_id += 1
                        prev_ret = returns[time_step - 1]
                        info_line += "(done)"
                    p_fig = get_p_plot(actions, ps, time_step, info_line)
                    w, h = p_fig.canvas.get_width_height()
                    p_fig.tight_layout(pad=0)
                    p_fig.canvas.draw()
                    p_img = np.fromstring(p_fig.canvas.tostring_rgb(),
                                          dtype=np.uint8).reshape(h, w, 3)
                    plt.close(p_fig)

                    # Concat
                    final_frame = np.concatenate([p_img, state_frame], axis=1)
                    frames.append(final_frame)
                    if special:
                        for _ in range(2):
                            frames.append(final_frame)

            # Write to mp4
            print('Producing videos...')
            frames = [Image.fromarray(frame) for frame in frames]
            # Repeat ending frame for additional time
            frames.append(frames[-1])
            videodims = (frames[0].width, frames[0].height)
            video = cv2.VideoWriter(
                os.path.join(args.outdir,
                             model_name + "_{}.mp4".format(episode_id)),
                0x7634706d,  # FourCC code for 'mp4v'
                1, videodims)
            for frame in frames:
                video.write(cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR))
            video.release()

    lines.append(
        '######################{}#########################'.format(env_name))
    lines.append('Avg return {}'.format(sum(rets) / args.episodes))
    lines.append('Avg success rate {}'.format(sum(succs) / args.episodes))
    print('{} return: {} success {}'.format(env_name, np.mean(rets),
                                            np.mean(succs)))
    parsing_metric.apply(lambda _t: np.mean(_t))
    for key, val in parsing_metric.items():
        print(key, val)
    with open(os.path.join(args.outdir, model_name + '.out'), 'w') as f:
        f.write('\n'.join(lines))
def _init():
    env = gym.make(env_name)
    env = TimeLimit(env, timestep_limit)
    env = Monitor(env, log_folder + 'seed_' + str(seed + rank))
    env.seed(seed + rank)
    return env
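# The _thunk/_init factories above are returned uncalled so a vectorized env
# can construct one differently-seeded copy per worker process. A minimal
# sketch of that pattern, assuming stable_baselines3 is available (its
# SubprocVecEnv takes a list of callables); make_env and the ids here are
# illustrative, not taken from the snippets above:
import gym
from gym.wrappers import TimeLimit
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_env(rank, seed=0):
    def _init():
        env = TimeLimit(gym.make("CartPole-v0").unwrapped,
                        max_episode_steps=500)
        env.seed(seed + rank)  # distinct seed per worker, as in the snippets
        return env
    return _init

if __name__ == "__main__":
    vec_env = SubprocVecEnv([make_env(rank) for rank in range(4)])
    obs = vec_env.reset()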