Code example #1
File: util.py Project: pedrofreire/imitation
    def make_env(i, this_seed):
        # Previously, we directly called `gym.make(env_name)`, but running
        # `imitation.scripts.train_adversarial` within `imitation.scripts.parallel`
        # created a weird interaction between Gym and Ray -- `gym.make` would fail
        # inside this function for any of our custom environments unless those
        # environments were also `gym.register()`ed inside `make_env`. Even
        # registering the custom environment in the scope of `make_vec_env` didn't
        # work. For more discussion and hypotheses on this issue see PR #160:
        # https://github.com/HumanCompatibleAI/imitation/pull/160.
        env = spec.make()

        # Seed each environment with a different, non-sequential seed for diversity
        # (even if caller is passing us sequentially-assigned base seeds). int() is
        # necessary to work around gym bug where it chokes on numpy int64s.
        env.seed(int(this_seed))

        if max_episode_steps is not None:
            env = TimeLimit(env, max_episode_steps)
        elif spec.max_episode_steps is not None:
            env = TimeLimit(env, max_episode_steps=spec.max_episode_steps)

        # Use Monitor to record statistics needed for Baselines algorithms logging
        # Optionally, save to disk
        log_path = None
        if log_dir is not None:
            log_subdir = os.path.join(log_dir, 'monitor')
            os.makedirs(log_subdir, exist_ok=True)
            log_path = os.path.join(log_subdir, f'mon{i:03d}')
        return MonitorPlus(env, log_path, allow_early_resets=True)
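The factory above closes over `spec`, `max_episode_steps`, and `log_dir` from the surrounding `make_vec_env`. As a rough, self-contained sketch of the same closure pattern (illustrative names only; it assumes the pre-0.26 `gym` API with `env.seed` and stable-baselines3's `DummyVecEnv`, not the imitation project's own helpers):

import functools

import gym
from gym.wrappers import TimeLimit
from stable_baselines3.common.vec_env import DummyVecEnv


def make_env(i, this_seed, env_name="CartPole-v1", max_episode_steps=200):
    # Build and seed one environment; int() guards against numpy integer seeds.
    env = gym.make(env_name)
    env.seed(int(this_seed))
    return TimeLimit(env, max_episode_steps=max_episode_steps)


# DummyVecEnv expects zero-argument callables, hence functools.partial.
venv = DummyVecEnv([functools.partial(make_env, i, 1000 + i) for i in range(4)])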
Code example #2
def test_random_task_on_each_episode_and_only_one_task_in_schedule():
    """ BUG: When the goal is to have only one task, it instead keeps sampling a new
    task from the 'distribution', in the case of cartpole!
    """
    env: MetaMonsterKongEnv = gym.make("CartPole-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {
                "length": 0.1
            },
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    task_labels = []
    lengths = []
    for i in range(10):
        obs = env.reset()
        task_labels.append(obs[1])
        lengths.append(env.length)
        done = False
        while not done:
            obs, reward, done, info = env.step(env.action_space.sample())
            task_labels.append(obs[1])
            lengths.append(env.length)

    assert set(task_labels) == {0}
    assert set(lengths) == {0.1}
Code example #3
def play_one_session(
    env: TimeLimit,
    max_size: int,
    action_chooser: Callable[[TimeLimit, Any], Any],
    render: bool = False,
    custom_actions: Callable[[int, TimeLimit, Any, Any, Any, bool, Any], None] = None,
    stop_when_done: bool = True,
) -> Tuple[float, List[Dict[str, Any]]]:
    observation = env.reset()

    score = 0
    history = []

    for i in range(max_size):

        if render:
            env.render()

        action = action_chooser(env, observation)
        current_iteration_history = {"observation": observation, "action": action}
        observation, reward, done, info = env.step(action.reshape((-1,)))

        score += reward
        history.append(current_iteration_history)

        if custom_actions is not None:
            custom_actions(i, env, action, observation, reward, done, info)

        if stop_when_done and done:
            break

    return score / max_size, history
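Because the chosen action is flattened with `reshape((-1,))`, `play_one_session` fits continuous (Box) action spaces most naturally. A hedged usage sketch with a random policy (the environment id and step counts are illustrative; classic gym API assumed):

import gym
from gym.wrappers import TimeLimit

# Re-wrap the raw environment so the `env: TimeLimit` annotation holds.
env = TimeLimit(gym.make("Pendulum-v1").unwrapped, max_episode_steps=200)

mean_reward, history = play_one_session(
    env,
    max_size=200,
    action_chooser=lambda env, obs: env.action_space.sample(),  # random Box action
)
print(mean_reward, len(history))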
Code example #4
 def __init__(self, winning_probs, max_steps):
     """Initialize test class."""
     self.winning_probs = winning_probs
     self.max_steps = max_steps
     self.env = TimeLimit(
         NonMarkovianRotatingMAB(winning_probs=self.winning_probs),
         max_episode_steps=self.max_steps,
     )
Code example #5
 def func():
     env = gym.make(gym_id)
     env = TimeLimit(env, max_episode_steps=args.max_episode_len)
     env = EnvironmentWrapper(env.env, normOb=normOb, rewardNormalization=rewardNormalization, clipOb=clipOb, clipRew=clipRew, **kwargs)
     env.seed(args.seed)
     env.action_space.seed(args.seed)
     env.observation_space.seed(args.seed)  
     return env
Code example #6
    def _thunk():
        env = make_benchmarking_env(env_id)
        env = TimeLimit(env, max_episode_steps)

        env.seed(seed + rank)
        log_dir_ = os.path.join(log_dir,
                                str(rank)) if log_dir is not None else log_dir
        env = Monitor(env, log_dir_, allow_early_resets=allow_early_resets)

        return env
Code example #7
File: gym_overview.py Project: shreyassd4/hse-rl
def play_with_car():
    maximum_steps_allowed = 250
    env = TimeLimit(MountainCarEnv(),
                    max_episode_steps=maximum_steps_allowed + 1)
    actions = {'left': 0, 'stop': 1, 'right': 2}

    initial_state = env.reset()
    print('Initial state: ', initial_state)

    for t in range(maximum_steps_allowed):
        # need to modify policy
        if t < 50:
            s, r, done, _ = env.step(actions['left'])
        elif t < 70:
            s, r, done, _ = env.step(actions['right'])
        elif t < 120:
            s, r, done, _ = env.step(actions['left'])
        else:
            s, r, done, _ = env.step(actions['right'])

        print('State {}, Reward {}, Step {}'.format(s, r, t))
        env.render()

        if done:
            if s[0] > 0.47:
                print('Well done!')
            else:
                print('Please, try again.')
            break
    else:
        print('Time is up. Please, try again.')
Code example #8
 def test_noop_reset_env(self):
     # runnable test
     noop_max = 20
     env = gym.make(TEST_ENV_ID)
     env = TimeLimit(env, 3)
     env = atari.NoopResetEnv(env, noop_max=noop_max)
     env.reset()
     for i in range(20):
         obs, rew, done, info = env.step(env.action_space.sample())
         if done:
             break
Code example #9
File: test_time_limit.py Project: thepinkturtle/gym
def test_time_limit_reset_info():
    env = gym.make("CartPole-v1")
    env = TimeLimit(env)
    ob_space = env.observation_space
    obs = env.reset()
    assert ob_space.contains(obs)
    del obs
    obs = env.reset(return_info=False)
    assert ob_space.contains(obs)
    del obs
    obs, info = env.reset(return_info=True)
    assert ob_space.contains(obs)
    assert isinstance(info, dict)
Code example #10
def test_random_task_on_each_episode():
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"level": 0},
            5: {"level": 1},
            200: {"level": 2},
            300: {"level": 3},
            400: {"level": 4},
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    task_labels = []
    for i in range(10):
        obs = env.reset()
        task_labels.append(obs["task_labels"])
    assert len(set(task_labels)) > 1

    # Episodes only last 10 steps. With `new_random_task_on_reset=True`, the sampled
    # tasks have nothing to do with the task schedule.
    obs = env.reset()
    start_task_label = obs["task_labels"]
    for i in range(10):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs["task_labels"] == start_task_label
        if i == 9:
            assert done
        else:
            assert not done

    env.close()
Code example #11
    def test_adding_envs(self):
        from sequoia.common.gym_wrappers.env_dataset import EnvDataset

        env_1 = EnvDataset(
            EpisodeLimit(TimeLimit(gym.make("CartPole-v1"),
                                   max_episode_steps=10),
                         max_episodes=5))
        env_2 = EnvDataset(
            EpisodeLimit(TimeLimit(gym.make("CartPole-v1"),
                                   max_episode_steps=10),
                         max_episodes=5))
        chained_env = env_1 + env_2
        assert chained_env._envs[0] is env_1
        assert chained_env._envs[1] is env_2
Code example #12
    def test_change_gravity_each_step(self):
        env: ModifiedMassEnv = self.Environment()
        max_episode_steps = 500
        n_episodes = 5

        # NOTE: Interestingly, the renderer will show
        # `env.frame_skip * max_episode_steps` frames per episode, even when
        # "Ren[d]er every frame" is set to False.
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
        env: ModifiedMassEnv
        total_steps = 0

        for episode in range(n_episodes):
            initial_state = env.reset()
            done = False
            episode_steps = 0

            start_y = initial_state[1]
            moved_up = 0
            previous_state = initial_state
            state = initial_state

            body_part = self.body_names[0]
            start_mass = env.get_mass(body_part)

            while not done:
                previous_state = state
                state, reward, done, info = env.step(env.action_space.sample())
                env.render("human")
                episode_steps += 1
                total_steps += 1
                
                env.set_mass(body_part=body_part, mass=start_mass + 5 * total_steps / max_episode_steps)
                
                moved_up += (state[1] > previous_state[1])
                
                # print(f"Moving upward? {obs[1] > state[1]}")
            
            print(f"Gravity at end of episode: {env.gravity}")
            # TODO: Check that the position (in the observation) is obeying gravity?
            # if env.gravity <= 0:
            #     # Downward force, so should not have any significant preference for
            #     # moving up vs moving down.
            #     assert 0.4 <= (moved_up / max_episode_steps) <= 0.6, env.gravity
            # # if env.gravity == 0:
            # #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0
            # if env.gravity > 0:
            #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0, env.gravity
                
        assert total_steps == n_episodes * max_episode_steps
        initial_z = env.init_qpos[1]
        final_z = env.sim.data.qpos[1]
        assert initial_z == 0
        # Check that the robot is high up in the sky! :D
        assert final_z > 20
Code example #13
def process_env(args, bot):
    parsing_metric = DictList()
    for episode_id in tqdm.tqdm(range(args.episodes)):
        env = TimeLimit(gym.make(random.choice(args.envs)), 100)
        if args.seed is not None:
            env.seed(args.seed + episode_id)

        demo_bot = demo.DemoBot(env=env)
        while True:
            try:
                ret, _demo_traj, viz = demo.generate_one_traj(
                    demo_bot, env, render_mode='ansi')
                if ret < len(env.sketchs):
                    continue
                demo_traj = DictList(_demo_traj)
                demo_traj.done = [False] * (len(demo_traj) - 1) + [True]
                demo_traj.action = [a.value for a in _demo_traj['action']]
                demo_traj.env_id = [env.env_id] * len(demo_traj)
                demo_traj.apply(lambda _t: torch.tensor(_t).unsqueeze(0)
                                if not isinstance(_t[0], str) else _t)
                break
            except demo.PlanningError:
                pass
        with torch.no_grad():
            traj = teacher_force(bot, demo_traj)
            traj.viz = viz

        ps = traj.p
        ps[0, :-1] = 0
        ps[0, -1] = 1

        # Compute F1
        use_ids = (traj.action.reshape(-1)[1:-1] == Actions.USE.value
                   ).nonzero().view(-1).cpu().numpy()
        target = use_ids.tolist()
        p_vals = torch.arange(bot.nb_slots + 1)
        avg_p = (p_vals * ps[1:-1]).sum(-1)
        for k in [2, 3, 4, 5, 6]:
            _, inds = (-avg_p).topk(k)
            preds = inds.tolist()
            for tol in [1]:
                result = f1(target, preds, tol, with_detail=True)
                for name in result:
                    parsing_metric.append(
                        {'{}_tol{}_k{}'.format(name, tol, k): result[name]})

    parsing_metric.apply(lambda _t: np.mean(_t))
    return parsing_metric
Code example #14
File: envs.py Project: vitchyr/maml-awr
    def __init__(self, include_goal: bool = False):
        self.n_tasks = 50
        self.tasks = list(HARD_MODE_ARGS_KWARGS['train'].keys()) + list(
            HARD_MODE_ARGS_KWARGS['test'].keys())

        self._max_episode_steps = 150

        self.include_goal = include_goal
        self._task_idx = None
        self._env = None
        self._envs = []

        _cls_dict = {
            **HARD_MODE_CLS_DICT['train'],
            **HARD_MODE_CLS_DICT['test']
        }
        _args_kwargs = {
            **HARD_MODE_ARGS_KWARGS['train'],
            **HARD_MODE_ARGS_KWARGS['test']
        }
        for idx in range(self.n_tasks):
            task = self.tasks[idx]
            args_kwargs = _args_kwargs[task]
            if idx == 28 or idx == 29:
                args_kwargs['kwargs']['obs_type'] = 'plain'
                args_kwargs['kwargs']['random_init'] = False
            else:
                args_kwargs['kwargs']['obs_type'] = 'with_goal'
            args_kwargs['task'] = task
            env = _cls_dict[task](*args_kwargs['args'],
                                  **args_kwargs['kwargs'])
            self._envs.append(
                TimeLimit(env, max_episode_steps=self._max_episode_steps))

        self.set_task_idx(0)
Code example #15
File: atari.py Project: tornadoyi/rl-lab
def wrap_atari(env, max_episode_steps=None):
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
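A possible call site (sketch only; assumes `gym[atari]` plus the NoopResetEnv/MaxAndSkipEnv wrappers imported by this module): the assert requires a raw `NoFrameskip` ROM id because frame skipping is handled by `MaxAndSkipEnv` rather than by the ALE, and the optional `TimeLimit` then counts post-skip steps.

import gym

env = wrap_atari(gym.make("PongNoFrameskip-v4"), max_episode_steps=10000)
obs = env.reset()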
Code example #16
    def setup_class(cls):
        """Set up the test."""
        config = copy(cls.CONFIG)
        config.update(cls.OVERWRITE_CONFIG)
        nb_samples = config["nb_samples"]
        stop_probability = config["stop_probability"]
        nb_processes = config["nb_processes"]
        delta = config["delta"]
        epsilon = config["epsilon"]
        n_upperbound = config["n_upperbound"]

        env = cls.make_env()
        env = TimeLimit(env, max_episode_steps=cls.MAX_EPISODE_STEPS)
        cls.agent = PacRdpAgent(env.observation_space, env.action_space, env)
        dataset = cls.sample(
            env,
            nb_samples=nb_samples,
            stop_probability=stop_probability,
            nb_processes=nb_processes,
        )
        pdfa = learn_pdfa(
            dataset=dataset,
            n=n_upperbound,
            alphabet_size=cls.agent._rdp_generator.alphabet_size(),
            delta=delta,
            epsilon=epsilon,
            with_infty_norm=False,
            with_smoothing=True,
        )
        cls.pdfa = pdfa
        decoder = cls.agent._rdp_generator.decoder
        to_graphviz(cls.pdfa,
                    char2str=lambda c: decoder(c)
                    if c != FINAL_SYMBOL else "-1").render(cls.__name__)
Code example #17
File: main_a2c.py Project: XrosLiang/rlstructures
def create_train_env(n_envs, env_name=None, max_episode_steps=None, seed=None):
    envs = []
    for k in range(n_envs):
        e = create_gym_env(env_name)
        e = TimeLimit(e, max_episode_steps=max_episode_steps)
        envs.append(e)
    return GymEnvInf(envs, seed)
Code example #18
def wrap_time_limit(env, time_aware, max_episode_steps):
    """Add or update an environment's time limit

    Arguments:
        env (gym.Env): a gym environment instance
        time_aware (bool): whether to append the relative timestep to the observation.
        max_episode_steps (int): the maximum number of timesteps in a single episode

    Returns:
        A wrapped environment with the desired time limit
    """
    assert not time_aware or max_episode_steps, "Time-aware envs must specify a horizon"

    if max_episode_steps:
        env_, has_timelimit = env, False
        while hasattr(env_, "env"):
            if isinstance(env_, TimeLimit):
                has_timelimit = True
                break
            env_ = env_.env

        if has_timelimit:
            # pylint: disable=protected-access
            env_._max_episode_steps = max_episode_steps
            # pylint: enable=protected-access
        else:
            env = TimeLimit(env, max_episode_steps=max_episode_steps)

    if time_aware:
        env = AddRelativeTimestep(env)

    return env
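A hedged example (classic gym API assumed): `gym.make("CartPole-v1")` already returns a `TimeLimit` instance, so the loop above finds it and overrides `_max_episode_steps` in place instead of wrapping a second time; `AddRelativeTimestep` is taken to come from the same module as this helper.

import gym

env = wrap_time_limit(gym.make("CartPole-v1"), time_aware=True, max_episode_steps=1000)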
Code example #19
def env_fn_cartpole() -> gym.Env:
    env = gym.make("CartPole-v0")
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {
                "length": 0.1
            },
            100: {
                "length": 0.2
            },
            200: {
                "length": 0.3
            },
            300: {
                "length": 0.4
            },
            400: {
                "length": 0.5
            },
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    return env
Code example #20
def make_cliff(max_episode_steps: int = 50):
    """Make the Cliff environment for testing."""
    env = CliffWalkingEnv()

    cliffs = [np.ravel_multi_index((3, y), env.shape) for y in range(1, 11)]
    cliff_reward = -100
    cliff_done = True
    # make transitions to cliff as final...
    # ...from initial state
    env.P[env.start_state_index][RIGHT] = [(1.0, cliffs[0], cliff_reward,
                                            cliff_done)]
    # ...from states above the cliff
    for cliff_x in range(1, 11):
        current_state = np.ravel_multi_index((2, cliff_x), env.shape)
        env.P[current_state][DOWN] = [(1.0, cliffs[cliff_x - 1], cliff_reward,
                                       cliff_done)]
    # ...from the final state
    terminal_state = (env.shape[0] - 1, env.shape[1] - 1)
    terminal_state_index = np.ravel_multi_index(terminal_state, env.shape)
    env.P[terminal_state_index][UP] = []
    env.P[terminal_state_index][RIGHT] = []
    env.P[terminal_state_index][DOWN] = []
    env.P[terminal_state_index][LEFT] = [(1.0, cliffs[-1], cliff_reward,
                                          cliff_done)]

    # make no transitions from cliff
    for cliff_state in cliffs:
        for action in [UP, RIGHT, DOWN, LEFT]:
            env.P[cliff_state][action] = []

    env = CliffWalkingEnvWrapper(env)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
Code example #21
def load_env(env_name,
             time_limit=None,
             obs_scaler=None,
             wrappers=None,
             wrappers_prescale=None,
             **kwargs):
    """Load an environment, configurable via gin."""
    print(f"Make environment {env_name} {wrappers} {kwargs}")
    env = gym.make(env_name, **kwargs)
    if time_limit:
        env = TimeLimit(env, time_limit)

    if wrappers_prescale is None:
        wrappers_prescale = []
    for wrapper in wrappers_prescale[::-1]:
        env = wrapper(env)

    if obs_scaler:
        from encoder.observation_encoder import ObservationScaleWrapper
        env = ObservationScaleWrapper(env, obs_scaler)

    if wrappers is None:
        wrappers = []
    for wrapper in wrappers[::-1]:
        env = wrapper(env)
    return env
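Hedged usage sketch: because both wrapper lists are applied in reverse, `wrappers[0]` ends up as the outermost wrapper. The extra wrapper here is only an example (`RecordEpisodeStatistics` ships with gym):

from gym.wrappers import RecordEpisodeStatistics

env = load_env("CartPole-v1", time_limit=200, wrappers=[RecordEpisodeStatistics])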
Code example #22
def create_env(max_episode_steps=100):
    envs = []
    for k in range(4):
        e = gym.make("CartPole-v0")
        e = TimeLimit(e, max_episode_steps=max_episode_steps)
        envs.append(e)
    return GymEnv(envs, seed=10)
Code example #23
def make_env(env_name,
             policy_type,
             max_episode_steps,
             env_obs_space_name=None):
    """
    Wrap the environment into a set of wrappers depending on some hyper-parameters
    Used so that most environments can be used with the same policies and algorithms
    :param env_name: the name of the environment, as a string. For instance, "MountainCarContinuous-v0"
    :param policy_type: a string specifying the type of policy. So far, "bernoulli" or "normal"
    :param max_episode_steps: the max duration of an episode. If None, uses the default gym max duration
    :param env_obs_space_name: a vector of names of the environment features. E.g. ["position","velocity"] for
    MountainCar
    :return: the wrapped environment
    """
    env = gym.make(env_name)
    # tests whether the environment is discrete or continuous
    if not env.action_space.contains(np.array([0.5])):
        assert policy_type == "bernoulli" or policy_type == "discrete", 'cannot run a continuous action policy in a ' \
                                                                        'discrete action environment'

    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps)

    env.observation_space.names = env_obs_space_name

    env = FeatureInverter(env, 1,
                          2)  # MODIFIED: Invert sin(theta) and theta dot

    env = PerfWriter(env)
    print(env)
    return env
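A hedged call sketch matching the FeatureInverter comment above (Pendulum observations are [cos(theta), sin(theta), theta_dot]; the environment id, step limit, and feature names are illustrative):

env = make_env("Pendulum-v0",
               policy_type="normal",
               max_episode_steps=200,
               env_obs_space_name=["cos(theta)", "sin(theta)", "theta_dot"])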
Code example #24
def create_env(seed=0, max_episode_steps=100):
    envs = []
    for k in range(4):
        e = MyEnv()
        e = TimeLimit(e, max_episode_steps=max_episode_steps)
        envs.append(e)
    return GymEnv(envs, seed=seed)
Code example #25
File: test_cliff.py Project: fossabot/yarllib
def test_cliff():
    """Test that Sarsa > QLearning in the Cliff Environment."""
    env = CliffWalkingEnv()
    env = CliffWalkingEnvWrapper(env)
    env = TimeLimit(env, max_episode_steps=50)

    def make_sarsa():
        return TabularSarsa(env.observation_space, env.action_space).agent()

    def make_qlearning():
        return TabularQLearning(env.observation_space, env.action_space).agent()

    nb_episodes = 500
    nb_runs = 5
    policy = EpsGreedyPolicy(0.1)

    _, sarsa_histories = run_experiments(
        make_sarsa, env, policy, nb_runs=nb_runs, nb_episodes=nb_episodes
    )
    _, qlearning_histories = run_experiments(
        make_qlearning, env, policy, nb_runs=nb_runs, nb_episodes=nb_episodes
    )

    assert len(sarsa_histories) == 5
    assert len(qlearning_histories) == 5

    sarsa_total_rewards = np.asarray([h.total_rewards for h in sarsa_histories])
    qlearning_total_rewards = np.asarray([h.total_rewards for h in qlearning_histories])

    sarsa_last_reward_avg = sarsa_total_rewards.mean(axis=0)[-100:].mean()
    qlearning_last_reward_avg = qlearning_total_rewards.mean(axis=0)[-100:].mean()

    # Compare Sarsa and Q-Learning on the total reward averaged over the last 100 episodes;
    # on the Cliff environment, Sarsa should consistently come out ahead of Q-Learning.
    assert sarsa_last_reward_avg > -30 > qlearning_last_reward_avg > -40
Code example #26
File: pioneer_train.py Project: xdralex/pioneer
def prepare_env():
    # randomizer = PioneerSceneRandomizer(source='/Users/xdralex/Work/curiosity/pioneer2/pioneer/envs/assets/pioneer4.xml',
    #                                     target_space=spaces.Box(low=np.array([5.0, -3, 1], dtype=np.float32),
    #                                                             high=np.array([6.0, 3, 3], dtype=np.float32)),
    #                                     obstacle_pos_space=spaces.Box(low=np.array([3, -2], dtype=np.float32),
    #                                                                   high=np.array([5, 2], dtype=np.float32)),
    #                                     obstacle_size_space=spaces.Box(low=np.array([0.1, 0.1, 3], dtype=np.float32),
    #                                                                    high=np.array([0.1, 0.1, 5], dtype=np.float32)))

    randomizer = PioneerSceneRandomizer(source='/Users/xdralex/Work/curiosity/pioneer/pioneer/envs/assets/pioneer6.xml',
                                        target_space=spaces.Box(low=np.array([5.0, -3, 1], dtype=np.float32),
                                                                high=np.array([6.0, 3, 3], dtype=np.float32)),
                                        obstacle_pos_space=spaces.Box(low=np.array([3, -2], dtype=np.float32),
                                                                      high=np.array([5, 2], dtype=np.float32)),
                                        obstacle_size_space=spaces.Box(low=np.array([0.001, 0.001, 0.001], dtype=np.float32),
                                                                       high=np.array([0.001, 0.001, 0.001], dtype=np.float32)))

    pioneer_config = {
        'potential_scale': 5,
        'step_penalty': 1 / 125,
        'stop_distance': 0.05
    }

    pioneer_env = RandomizedPioneerEnv(pioneer_config, randomizer, temp_dir='/Users/xdralex/pioneer/environments', retain_samples=True)

    return TimeLimit(pioneer_env, max_episode_steps=250)
Code example #27
def env_fn_monsterkong() -> gym.Env:
    env = gym.make("MetaMonsterKong-v0")
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {
                "level": 1
            },
            100: {
                "level": 2
            },
            200: {
                "level": 3
            },
            300: {
                "level": 4
            },
            400: {
                "level": 5
            },
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    return env
Code example #28
def rollout(
    env: gym.Env,
    nb_episodes: int = 1,
    max_steps: Optional[int] = None,
    policy=lambda env, state: _random_action(env, state),
    callback=lambda env, step: None,
):
    """
    Do a rollout.

    :param env: the OpenAI Gym environment.
    :param nb_episodes: the number of rollout episodes.
    :param max_steps: maximum number of steps per episode.
    :param policy: a callable that takes the environment and the state and returns the action.
    :param callback: a callback that takes the environment and it is called at each step.
    :return: None
    """
    if max_steps:
        env = TimeLimit(env, max_episode_steps=max_steps)
    for _ in range(nb_episodes):
        state = env.reset()
        done = False
        callback(env, (state, 0.0, done, {}))
        while not done:
            action = policy(env, state)
            state, reward, done, info = env.step(action)
            callback(env, (state, reward, done, info))
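A minimal sketch of calling `rollout` on CartPole with the default random policy and a callback that prints each step's reward (classic gym API assumed; `_random_action` comes from the same module):

import gym

rollout(
    gym.make("CartPole-v1"),
    nb_episodes=2,
    max_steps=50,
    callback=lambda env, step: print("reward:", step[1]),
)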
Code example #29
    def __init__(self,
                 env: Union[Env, Wrapper, str],
                 noop_max: int = 30,
                 frameskip: int = 4,
                 screen_size: int = 84,
                 terminal_on_life_loss: bool = True,
                 grayscale_obs: bool = True,
                 scale_obs: bool = False,
                 stacked_frames: Optional[int] = 4,
                 fire_to_reset: bool = True,
                 max_steps_per_episode: Optional[int] = None,
                 partial_observation_wrapper: Optional[
                     PartialObservationWrapper] = None,
                 partial_percentage: Optional[float] = 0.82,
                 seed: Optional[int] = None):
        """Initializes a new AtariWrapper

        Args:
            env: an Atari gym environment
            noop_max: maximum number of Noop actions at reset
            frameskip: number of times a given action is repeated
            screen_size: resized Atari frame
            terminal_on_life_loss: if True, episode terminates when life is lost
            grayscale_obs: if True, converts RGB to gray-scale
            scale_obs: if True, scales observation to [0, 1]
            stacked_frames: length of history. If None, length is 1
            fire_to_reset: if True, performs action 1 on reset
            max_steps_per_episode: maximum number of frames before truncating an episode
            partial_observation_wrapper: a wrapper to provide partial observability.
                If None, no transformation is done
            partial_percentage: window ratio passed to partial_observation_wrapper
            seed: if not None, used to seed the environment before wrapping
        """
        if seed is not None:
            env.seed(seed)

        env = AtariPreprocessing(env,
                                 noop_max=noop_max,
                                 frame_skip=frameskip,
                                 screen_size=screen_size,
                                 terminal_on_life_loss=terminal_on_life_loss,
                                 grayscale_obs=grayscale_obs,
                                 scale_obs=scale_obs)

        # time step limit wrapper
        if max_steps_per_episode:
            env = TimeLimit(env, max_episode_steps=max_steps_per_episode)

        # fire when reset
        if fire_to_reset:
            env = FireResetEnv(env)

        # partial observation wrapper
        if partial_observation_wrapper:
            env = partial_observation_wrapper(env,
                                              window_ratio=partial_percentage)

        # stacked frames for history length
        if stacked_frames:
            env = PartialBufferWrapper(env, stacked_frames)

        super().__init__(env)
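A construction sketch (the class name `AtariWrapper` is taken from the docstring, and the remaining wrappers defined in this module are assumed importable; the ROM id is illustrative and requires `gym[atari]`):

import gym

env = AtariWrapper(gym.make("BreakoutNoFrameskip-v4"),
                   max_steps_per_episode=27000,
                   seed=0)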
Code example #30
def create_env(n_envs, max_episode_steps=None, seed=None, **args):
    envs = []
    for k in range(n_envs):
        e = create_gym_env(args)
        e = TimeLimit(e, max_episode_steps=max_episode_steps)
        envs.append(e)
    return GymEnv(envs, seed)