def make_env(i, this_seed):
    # Previously, we directly called `gym.make(env_name)`, but running
    # `imitation.scripts.train_adversarial` within `imitation.scripts.parallel`
    # created a weird interaction between Gym and Ray -- `gym.make` would fail
    # inside this function for any of our custom environments unless those
    # environments were also `gym.register()`ed inside `make_env`. Even
    # registering the custom environment in the scope of `make_vec_env` didn't
    # work. For more discussion and hypotheses on this issue see PR #160:
    # https://github.com/HumanCompatibleAI/imitation/pull/160.
    env = spec.make()

    # Seed each environment with a different, non-sequential seed for diversity
    # (even if the caller is passing us sequentially-assigned base seeds). int() is
    # necessary to work around a gym bug where it chokes on numpy int64s.
    env.seed(int(this_seed))

    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps)
    elif spec.max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=spec.max_episode_steps)

    # Use Monitor to record the statistics needed for Baselines algorithms' logging.
    # Optionally, save them to disk.
    log_path = None
    if log_dir is not None:
        log_subdir = os.path.join(log_dir, 'monitor')
        os.makedirs(log_subdir, exist_ok=True)
        log_path = os.path.join(log_subdir, f'mon{i:03d}')
    return MonitorPlus(env, log_path, allow_early_resets=True)
def test_random_task_on_each_episode_and_only_one_task_in_schedule():
    """
    BUG: When the goal is to have only one task, it instead keeps sampling a new
    task from the 'distribution', in the case of CartPole!
    """
    env: gym.Env = gym.make("CartPole-v1")
    from gym.wrappers import TimeLimit

    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"length": 0.1},
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    task_labels = []
    lengths = []
    for i in range(10):
        obs = env.reset()
        task_labels.append(obs[1])
        lengths.append(env.length)
        done = False
        while not done:
            obs, reward, done, info = env.step(env.action_space.sample())
            task_labels.append(obs[1])
            lengths.append(env.length)

    assert set(task_labels) == {0}
    assert set(lengths) == {0.1}
def play_one_session(
    env: TimeLimit,
    max_size: int,
    action_chooser: Callable[[TimeLimit, Any], Any],
    render: bool = False,
    custom_actions: Callable[[int, TimeLimit, Any, Any, Any, bool, Any], None] = None,
    stop_when_done: bool = True,
) -> Tuple[float, List[Dict[str, Any]]]:
    observation = env.reset()
    score = 0
    history = []

    for i in range(max_size):
        if render:
            env.render()

        action = action_chooser(env, observation)
        current_iteration_history = {"observation": observation, "action": action}
        observation, reward, done, info = env.step(action.reshape((-1,)))
        score += reward
        history.append(current_iteration_history)

        if custom_actions is not None:
            custom_actions(i, env, action, observation, reward, done, info)

        if stop_when_done and done:
            break

    return score / max_size, history
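# A hypothetical usage sketch for `play_one_session` above (not from the original
# source). It assumes a continuous-action environment, since the chosen action must
# be a numpy array supporting `.reshape((-1,))`; the env id may need to be
# "Pendulum-v0" on older gym versions.
import gym


def _random_chooser(env, observation):
    # Ignore the observation and sample uniformly from the action space.
    return env.action_space.sample()


env = gym.make("Pendulum-v1")  # gym.make already wraps classic envs in TimeLimit
mean_reward, history = play_one_session(env, max_size=100, action_chooser=_random_chooser)
print(f"mean reward over {len(history)} steps: {mean_reward:.3f}")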
def __init__(self, winning_probs, max_steps):
    """Initialize test class."""
    self.winning_probs = winning_probs
    self.max_steps = max_steps
    self.env = TimeLimit(
        NonMarkovianRotatingMAB(winning_probs=self.winning_probs),
        max_episode_steps=self.max_steps,
    )
def func():
    env = gym.make(gym_id)
    env = TimeLimit(env, max_episode_steps=args.max_episode_len)
    env = EnvironmentWrapper(env.env, normOb=normOb,
                             rewardNormalization=rewardNormalization,
                             clipOb=clipOb, clipRew=clipRew, **kwargs)
    env.seed(args.seed)
    env.action_space.seed(args.seed)
    env.observation_space.seed(args.seed)
    return env
def _thunk():
    env = make_benchmarking_env(env_id)
    env = TimeLimit(env, max_episode_steps)
    env.seed(seed + rank)
    log_dir_ = os.path.join(log_dir, str(rank)) if log_dir is not None else log_dir
    env = Monitor(env, log_dir_, allow_early_resets=allow_early_resets)
    return env
def play_with_car():
    maximum_steps_allowed = 250
    env = TimeLimit(MountainCarEnv(), max_episode_steps=maximum_steps_allowed + 1)
    actions = {'left': 0, 'stop': 1, 'right': 2}

    initial_state = env.reset()
    print('Initial state: ', initial_state)

    for t in range(maximum_steps_allowed):
        # need to modify policy
        if t < 50:
            s, r, done, _ = env.step(actions['left'])
        elif t < 70:
            s, r, done, _ = env.step(actions['right'])
        elif t < 120:
            s, r, done, _ = env.step(actions['left'])
        else:
            s, r, done, _ = env.step(actions['right'])

        print('State {}, Reward {}, Step {}'.format(s, r, t))
        env.render()

        if done:
            if s[0] > 0.47:
                print('Well done!')
            else:
                print('Please, try again.')
            break
    else:
        print('Time is up. Please, try again.')
def test_noop_reset_env(self):
    # runnable test
    noop_max = 20
    env = gym.make(TEST_ENV_ID)
    env = TimeLimit(env, 3)
    env = atari.NoopResetEnv(env, noop_max=noop_max)
    env.reset()
    for i in range(20):
        obs, rew, done, info = env.step(env.action_space.sample())
        if done:
            break
def test_time_limit_reset_info():
    env = gym.make("CartPole-v1")
    env = TimeLimit(env)
    ob_space = env.observation_space

    obs = env.reset()
    assert ob_space.contains(obs)
    del obs

    obs = env.reset(return_info=False)
    assert ob_space.contains(obs)
    del obs

    obs, info = env.reset(return_info=True)
    assert ob_space.contains(obs)
    assert isinstance(info, dict)
def test_random_task_on_each_episode():
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit

    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"level": 0},
            5: {"level": 1},
            200: {"level": 2},
            300: {"level": 3},
            400: {"level": 4},
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    task_labels = []
    for i in range(10):
        obs = env.reset()
        task_labels.append(obs["task_labels"])
    assert len(set(task_labels)) > 1

    # Episodes only last 10 steps, so the boundaries in the task schedule are never
    # reached: the task is re-sampled at each reset and stays fixed within an episode.
    obs = env.reset()
    start_task_label = obs["task_labels"]

    for i in range(10):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs["task_labels"] == start_task_label
        if i == 9:
            assert done
        else:
            assert not done
    env.close()
def test_adding_envs(self):
    from sequoia.common.gym_wrappers.env_dataset import EnvDataset

    env_1 = EnvDataset(
        EpisodeLimit(TimeLimit(gym.make("CartPole-v1"), max_episode_steps=10),
                     max_episodes=5))
    env_2 = EnvDataset(
        EpisodeLimit(TimeLimit(gym.make("CartPole-v1"), max_episode_steps=10),
                     max_episodes=5))
    chained_env = env_1 + env_2
    assert chained_env._envs[0] is env_1
    assert chained_env._envs[1] is env_2
def test_change_gravity_each_step(self):
    env: ModifiedMassEnv = self.Environment()
    max_episode_steps = 500
    n_episodes = 5

    # NOTE: Interestingly, the renderer will show
    # `env.frame_skip * max_episode_steps` frames per episode, even when
    # "Ren[d]er every frame" is set to False.
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    env: ModifiedMassEnv

    total_steps = 0
    for episode in range(n_episodes):
        initial_state = env.reset()
        done = False
        episode_steps = 0

        start_y = initial_state[1]
        moved_up = 0
        previous_state = initial_state
        state = initial_state

        body_part = self.body_names[0]
        start_mass = env.get_mass(body_part)

        while not done:
            previous_state = state
            state, reward, done, info = env.step(env.action_space.sample())
            env.render("human")
            episode_steps += 1
            total_steps += 1

            env.set_mass(body_part=body_part,
                         mass=start_mass + 5 * total_steps / max_episode_steps)

            moved_up += (state[1] > previous_state[1])
            # print(f"Moving upward? {obs[1] > state[1]}")

        print(f"Gravity at end of episode: {env.gravity}")
        # TODO: Check that the position (in the observation) is obeying gravity?
        # if env.gravity <= 0:
        #     # Downward force, so should not have any significant preference for
        #     # moving up vs moving down.
        #     assert 0.4 <= (moved_up / max_episode_steps) <= 0.6, env.gravity
        # # if env.gravity == 0:
        # #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0
        # if env.gravity > 0:
        #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0, env.gravity

    assert total_steps == n_episodes * max_episode_steps

    initial_z = env.init_qpos[1]
    final_z = env.sim.data.qpos[1]
    assert initial_z == 0
    # Check that the robot is high up in the sky! :D
    assert final_z > 20
def process_env(args, bot):
    parsing_metric = DictList()
    for episode_id in tqdm.tqdm(range(args.episodes)):
        env = TimeLimit(gym.make(random.choice(args.envs)), 100)
        if args.seed is not None:
            env.seed(args.seed + episode_id)
        demo_bot = demo.DemoBot(env=env)

        while True:
            try:
                ret, _demo_traj, viz = demo.generate_one_traj(
                    demo_bot, env, render_mode='ansi')
                if ret < len(env.sketchs):
                    continue
                demo_traj = DictList(_demo_traj)
                demo_traj.done = [False] * (len(demo_traj) - 1) + [True]
                demo_traj.action = [a.value for a in _demo_traj['action']]
                demo_traj.env_id = [env.env_id] * len(demo_traj)
                demo_traj.apply(lambda _t: torch.tensor(_t).unsqueeze(0)
                                if not isinstance(_t[0], str) else _t)
                break
            except demo.PlanningError:
                pass

        with torch.no_grad():
            traj = teacher_force(bot, demo_traj)
        traj.viz = viz
        ps = traj.p
        ps[0, :-1] = 0
        ps[0, -1] = 1

        # Compute F1
        use_ids = (traj.action.reshape(-1)[1:-1] == Actions.USE.value
                   ).nonzero().view(-1).cpu().numpy()
        target = use_ids.tolist()
        p_vals = torch.arange(bot.nb_slots + 1)
        avg_p = (p_vals * ps[1:-1]).sum(-1)
        for k in [2, 3, 4, 5, 6]:
            _, inds = (-avg_p).topk(k)
            preds = inds.tolist()
            for tol in [1]:
                result = f1(target, preds, tol, with_detail=True)
                for name in result:
                    parsing_metric.append(
                        {'{}_tol{}_k{}'.format(name, tol, k): result[name]})

    parsing_metric.apply(lambda _t: np.mean(_t))
    return parsing_metric
def __init__(self, include_goal: bool = False):
    self.n_tasks = 50
    self.tasks = list(HARD_MODE_ARGS_KWARGS['train'].keys()) + list(
        HARD_MODE_ARGS_KWARGS['test'].keys())
    self._max_episode_steps = 150
    self.include_goal = include_goal
    self._task_idx = None
    self._env = None
    self._envs = []

    _cls_dict = {
        **HARD_MODE_CLS_DICT['train'],
        **HARD_MODE_CLS_DICT['test'],
    }
    _args_kwargs = {
        **HARD_MODE_ARGS_KWARGS['train'],
        **HARD_MODE_ARGS_KWARGS['test'],
    }
    for idx in range(self.n_tasks):
        task = self.tasks[idx]
        args_kwargs = _args_kwargs[task]
        if idx == 28 or idx == 29:
            args_kwargs['kwargs']['obs_type'] = 'plain'
            args_kwargs['kwargs']['random_init'] = False
        else:
            args_kwargs['kwargs']['obs_type'] = 'with_goal'
        args_kwargs['task'] = task
        env = _cls_dict[task](*args_kwargs['args'], **args_kwargs['kwargs'])
        self._envs.append(
            TimeLimit(env, max_episode_steps=self._max_episode_steps))
    self.set_task_idx(0)
def wrap_atari(env, max_episode_steps=None):
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
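# A minimal usage sketch for `wrap_atari` above (an assumption, not from the
# original source). It requires a `NoFrameskip` Atari ROM to be installed (e.g.
# via `pip install "gym[atari]"`); NoopResetEnv / MaxAndSkipEnv are the
# Baselines-style wrappers imported wherever `wrap_atari` is defined.
import gym

env = wrap_atari(gym.make("PongNoFrameskip-v4"), max_episode_steps=4000)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())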
def setup_class(cls):
    """Set up the test."""
    config = copy(cls.CONFIG)
    config.update(cls.OVERWRITE_CONFIG)
    nb_samples = config["nb_samples"]
    stop_probability = config["stop_probability"]
    nb_processes = config["nb_processes"]
    delta = config["delta"]
    epsilon = config["epsilon"]
    n_upperbound = config["n_upperbound"]

    env = cls.make_env()
    env = TimeLimit(env, max_episode_steps=cls.MAX_EPISODE_STEPS)
    cls.agent = PacRdpAgent(env.observation_space, env.action_space, env)
    dataset = cls.sample(
        env,
        nb_samples=nb_samples,
        stop_probability=stop_probability,
        nb_processes=nb_processes,
    )
    pdfa = learn_pdfa(
        dataset=dataset,
        n=n_upperbound,
        alphabet_size=cls.agent._rdp_generator.alphabet_size(),
        delta=delta,
        epsilon=epsilon,
        with_infty_norm=False,
        with_smoothing=True,
    )
    cls.pdfa = pdfa
    decoder = cls.agent._rdp_generator.decoder
    to_graphviz(
        cls.pdfa,
        char2str=lambda c: decoder(c) if c != FINAL_SYMBOL else "-1",
    ).render(cls.__name__)
def create_train_env(n_envs, env_name=None, max_episode_steps=None, seed=None):
    envs = []
    for k in range(n_envs):
        e = create_gym_env(env_name)
        e = TimeLimit(e, max_episode_steps=max_episode_steps)
        envs.append(e)
    return GymEnvInf(envs, seed)
def wrap_time_limit(env, time_aware, max_episode_steps):
    """Add or update an environment's time limit.

    Arguments:
        env (gym.Env): a gym environment instance
        time_aware (bool): whether to append the relative timestep to the
            observation
        max_episode_steps (int): the maximum number of timesteps in a single
            episode

    Returns:
        A wrapped environment with the desired time limit
    """
    assert not time_aware or max_episode_steps, "Time-aware envs must specify a horizon"

    if max_episode_steps:
        env_, has_timelimit = env, False
        while hasattr(env_, "env"):
            if isinstance(env_, TimeLimit):
                has_timelimit = True
                break
            env_ = env_.env

        if has_timelimit:
            # pylint: disable=protected-access
            env_._max_episode_steps = max_episode_steps
            # pylint: enable=protected-access
        else:
            env = TimeLimit(env, max_episode_steps=max_episode_steps)

    if time_aware:
        env = AddRelativeTimestep(env)
    return env
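# Hypothetical usage sketch (not from the original source). `gym.make` already
# returns a TimeLimit-wrapped CartPole, so the unwrap loop in `wrap_time_limit`
# finds that wrapper and only patches its `_max_episode_steps` instead of stacking
# a second TimeLimit on top.
import gym

env = wrap_time_limit(gym.make("CartPole-v1"), time_aware=False, max_episode_steps=200)
obs = env.reset()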
def env_fn_cartpole() -> gym.Env:
    env = gym.make("CartPole-v0")
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"length": 0.1},
            100: {"length": 0.2},
            200: {"length": 0.3},
            300: {"length": 0.4},
            400: {"length": 0.5},
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    return env
def make_cliff(max_episode_steps: int = 50):
    """Make the Cliff environment for testing."""
    env = CliffWalkingEnv()
    cliffs = [np.ravel_multi_index((3, y), env.shape) for y in range(1, 11)]
    cliff_reward = -100
    cliff_done = True

    # make transitions to cliff as final...
    # ...from initial state
    env.P[env.start_state_index][RIGHT] = [(1.0, cliffs[0], cliff_reward, cliff_done)]

    # ...from states above the cliff
    for cliff_x in range(1, 11):
        current_state = np.ravel_multi_index((2, cliff_x), env.shape)
        env.P[current_state][DOWN] = [(1.0, cliffs[cliff_x - 1], cliff_reward, cliff_done)]

    # ...from the final state
    terminal_state = (env.shape[0] - 1, env.shape[1] - 1)
    terminal_state_index = np.ravel_multi_index(terminal_state, env.shape)
    env.P[terminal_state_index][UP] = []
    env.P[terminal_state_index][RIGHT] = []
    env.P[terminal_state_index][DOWN] = []
    env.P[terminal_state_index][LEFT] = [(1.0, cliffs[-1], cliff_reward, cliff_done)]

    # make no transitions from cliff
    for cliff_state in cliffs:
        for action in [UP, RIGHT, DOWN, LEFT]:
            env.P[cliff_state][action] = []

    env = CliffWalkingEnvWrapper(env)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
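# Hypothetical sanity-check sketch (not from the original source): from the start
# state, the RIGHT action now transitions into the cliff, which `make_cliff` marks
# as terminal with reward -100. `RIGHT` is the same action constant used above, and
# the printed values assume `CliffWalkingEnvWrapper` leaves rewards unchanged.
env = make_cliff(max_episode_steps=50)
state = env.reset()
state, reward, done, info = env.step(RIGHT)
print(state, reward, done)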
def load_env(env_name, time_limit=None, obs_scaler=None, wrappers=None,
             wrappers_prescale=None, **kwargs):
    """Load an environment, configurable via gin."""
    print(f"Make environment {env_name} {wrappers} {kwargs}")
    env = gym.make(env_name, **kwargs)

    if time_limit:
        env = TimeLimit(env, time_limit)

    if wrappers_prescale is None:
        wrappers_prescale = []
    for wrapper in wrappers_prescale[::-1]:
        env = wrapper(env)

    if obs_scaler:
        from encoder.observation_encoder import ObservationScaleWrapper
        env = ObservationScaleWrapper(env, obs_scaler)

    if wrappers is None:
        wrappers = []
    for wrapper in wrappers[::-1]:
        env = wrapper(env)
    return env
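# Hypothetical call (gin configuration omitted, not from the original source):
# build CartPole with a 200-step TimeLimit and no extra wrappers or observation
# scaling.
env = load_env("CartPole-v1", time_limit=200)
obs = env.reset()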
def create_env(max_episode_steps=100):
    envs = []
    for k in range(4):
        e = gym.make("CartPole-v0")
        e = TimeLimit(e, max_episode_steps=max_episode_steps)
        envs.append(e)
    return GymEnv(envs, seed=10)
def make_env(env_name, policy_type, max_episode_steps, env_obs_space_name=None):
    """
    Wrap the environment into a set of wrappers depending on some hyper-parameters.
    Used so that most environments can be used with the same policies and algorithms.

    :param env_name: the name of the environment, as a string. For instance,
        "MountainCarContinuous-v0"
    :param policy_type: a string specifying the type of policy. So far, "bernoulli"
        or "normal"
    :param max_episode_steps: the max duration of an episode. If None, uses the
        default gym max duration
    :param env_obs_space_name: a vector of names of the environment features.
        E.g. ["position", "velocity"] for MountainCar
    :return: the wrapped environment
    """
    env = gym.make(env_name)

    # tests whether the environment is discrete or continuous
    if not env.action_space.contains(np.array([0.5])):
        assert policy_type == "bernoulli" or policy_type == "discrete", (
            'cannot run a continuous action policy in a discrete action environment')

    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps)
    env.observation_space.names = env_obs_space_name
    env = FeatureInverter(env, 1, 2)  # MODIFIED: Invert sin(theta) and theta dot
    env = PerfWriter(env)
    print(env)
    return env
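# Hypothetical call (not from the original source). `make_env` always applies
# `FeatureInverter(env, 1, 2)`, which suggests an observation layout like
# Pendulum's [cos(theta), sin(theta), theta_dot], so a Pendulum-style env and a
# continuous ("normal") policy are assumed here; the feature names are illustrative.
env = make_env("Pendulum-v0", "normal", max_episode_steps=200,
               env_obs_space_name=["cos(theta)", "sin(theta)", "theta_dot"])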
def create_env(seed=0, max_episode_steps=100):
    envs = []
    for k in range(4):
        e = MyEnv()
        e = TimeLimit(e, max_episode_steps=max_episode_steps)
        envs.append(e)
    return GymEnv(envs, seed=seed)
def test_cliff():
    """Test that Sarsa > QLearning in the Cliff environment."""
    env = CliffWalkingEnv()
    env = CliffWalkingEnvWrapper(env)
    env = TimeLimit(env, max_episode_steps=50)

    def make_sarsa():
        return TabularSarsa(env.observation_space, env.action_space).agent()

    def make_qlearning():
        return TabularQLearning(env.observation_space, env.action_space).agent()

    nb_episodes = 500
    nb_runs = 5
    policy = EpsGreedyPolicy(0.1)
    _, sarsa_histories = run_experiments(
        make_sarsa, env, policy, nb_runs=nb_runs, nb_episodes=nb_episodes
    )
    _, qlearning_histories = run_experiments(
        make_qlearning, env, policy, nb_runs=nb_runs, nb_episodes=nb_episodes
    )

    assert len(sarsa_histories) == 5
    assert len(qlearning_histories) == 5

    sarsa_total_rewards = np.asarray([h.total_rewards for h in sarsa_histories])
    qlearning_total_rewards = np.asarray([h.total_rewards for h in qlearning_histories])
    sarsa_last_reward_avg = sarsa_total_rewards.mean(axis=0)[-100:].mean()
    qlearning_last_reward_avg = qlearning_total_rewards.mean(axis=0)[-100:].mean()

    # compare Sarsa and Q-learning on the total reward averaged over the last
    # 100 episodes: Sarsa consistently outperforms Q-learning here
    assert sarsa_last_reward_avg > -30 > qlearning_last_reward_avg > -40
def prepare_env():
    # randomizer = PioneerSceneRandomizer(
    #     source='/Users/xdralex/Work/curiosity/pioneer2/pioneer/envs/assets/pioneer4.xml',
    #     target_space=spaces.Box(low=np.array([5.0, -3, 1], dtype=np.float32),
    #                             high=np.array([6.0, 3, 3], dtype=np.float32)),
    #     obstacle_pos_space=spaces.Box(low=np.array([3, -2], dtype=np.float32),
    #                                   high=np.array([5, 2], dtype=np.float32)),
    #     obstacle_size_space=spaces.Box(low=np.array([0.1, 0.1, 3], dtype=np.float32),
    #                                    high=np.array([0.1, 0.1, 5], dtype=np.float32)))

    randomizer = PioneerSceneRandomizer(
        source='/Users/xdralex/Work/curiosity/pioneer/pioneer/envs/assets/pioneer6.xml',
        target_space=spaces.Box(low=np.array([5.0, -3, 1], dtype=np.float32),
                                high=np.array([6.0, 3, 3], dtype=np.float32)),
        obstacle_pos_space=spaces.Box(low=np.array([3, -2], dtype=np.float32),
                                      high=np.array([5, 2], dtype=np.float32)),
        obstacle_size_space=spaces.Box(low=np.array([0.001, 0.001, 0.001], dtype=np.float32),
                                       high=np.array([0.001, 0.001, 0.001], dtype=np.float32)))

    pioneer_config = {
        'potential_scale': 5,
        'step_penalty': 1 / 125,
        'stop_distance': 0.05,
    }

    pioneer_env = RandomizedPioneerEnv(pioneer_config, randomizer,
                                       temp_dir='/Users/xdralex/pioneer/environments',
                                       retain_samples=True)
    return TimeLimit(pioneer_env, max_episode_steps=250)
def env_fn_monsterkong() -> gym.Env:
    env = gym.make("MetaMonsterKong-v0")
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"level": 1},
            100: {"level": 2},
            200: {"level": 3},
            300: {"level": 4},
            400: {"level": 5},
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    return env
def rollout(
    env: gym.Env,
    nb_episodes: int = 1,
    max_steps: Optional[int] = None,
    policy=lambda env, state: _random_action(env, state),
    callback=lambda env, step: None,
):
    """
    Do a rollout.

    :param env: the OpenAI Gym environment.
    :param nb_episodes: the number of rollout episodes.
    :param max_steps: maximum number of steps per episode.
    :param policy: a callable that takes the environment and the state and
        returns the action.
    :param callback: a callable that takes the environment and the latest
        transition; it is called at each step.
    :return: None
    """
    if max_steps:
        env = TimeLimit(env, max_episode_steps=max_steps)

    for _ in range(nb_episodes):
        state = env.reset()
        done = False
        callback(env, (state, 0.0, done, {}))
        while not done:
            action = policy(env, state)
            state, reward, done, info = env.step(action)
            callback(env, (state, reward, done, info))
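# Hypothetical usage sketch (not from the original source): run two 50-step
# CartPole episodes with the default random policy and print the reward of every
# transition. `_random_action` is assumed to be defined in the same module as
# `rollout`, since it is the default policy above.
import gym

rollout(
    gym.make("CartPole-v1"),
    nb_episodes=2,
    max_steps=50,
    callback=lambda env, step: print(step[1]),
)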
def __init__(self,
             env: Union[Env, Wrapper, str],
             noop_max: int = 30,
             frameskip: int = 4,
             screen_size: int = 84,
             terminal_on_life_loss: bool = True,
             grayscale_obs: bool = True,
             scale_obs: bool = False,
             stacked_frames: Optional[int] = 4,
             fire_to_reset: bool = True,
             max_steps_per_episode: Optional[int] = None,
             partial_observation_wrapper: Optional[PartialObservationWrapper] = None,
             partial_percentage: Optional[float] = 0.82,
             seed: Optional[int] = None):
    """Initializes a new AtariWrapper.

    Args:
        env: an Atari gym environment
        noop_max: maximum number of no-op actions at reset
        frameskip: number of times a given action is repeated
        screen_size: size of the resized Atari frame
        terminal_on_life_loss: if True, the episode terminates when a life is lost
        grayscale_obs: if True, converts RGB observations to gray-scale
        scale_obs: if True, scales observations to [0, 1]
        stacked_frames: length of the frame history. If None, the length is 1
        fire_to_reset: if True, performs action 1 on reset
        max_steps_per_episode: maximum number of frames before truncating an episode
        partial_observation_wrapper: a wrapper to provide partial observability.
            If None, no transformation is done
        partial_percentage: window ratio passed to the partial-observation wrapper
        seed: if given, seeds the underlying environment
    """
    if seed is not None:
        env.seed(seed)

    env = AtariPreprocessing(env,
                             noop_max=noop_max,
                             frame_skip=frameskip,
                             screen_size=screen_size,
                             terminal_on_life_loss=terminal_on_life_loss,
                             grayscale_obs=grayscale_obs,
                             scale_obs=scale_obs)

    # time step limit wrapper
    if max_steps_per_episode:
        env = TimeLimit(env, max_episode_steps=max_steps_per_episode)

    # fire when reset
    if fire_to_reset:
        env = FireResetEnv(env)

    # partial observation wrapper
    if partial_observation_wrapper:
        env = partial_observation_wrapper(env, window_ratio=partial_percentage)

    # stacked frames for history length
    if stacked_frames:
        env = PartialBufferWrapper(env, stacked_frames)

    super().__init__(env)
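# Hypothetical usage sketch (not from the original source), assuming the enclosing
# class is called `AtariWrapper` as its docstring suggests, and that an Atari ROM
# such as Pong is installed. A `NoFrameskip` env id is used because
# AtariPreprocessing applies its own frame skipping.
import gym

env = AtariWrapper(gym.make("PongNoFrameskip-v4"),
                   max_steps_per_episode=10_000,
                   seed=0)
obs = env.reset()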
def create_env(n_envs, max_episode_steps=None, seed=None, **args):
    envs = []
    for k in range(n_envs):
        e = create_gym_env(args)
        e = TimeLimit(e, max_episode_steps=max_episode_steps)
        envs.append(e)
    return GymEnv(envs, seed)