Example 1
    def create_for_env(
        cls,
        env: EnvWrapper,
        policy: Optional[Policy],
        *,
        device: Union[str, torch.device] = "cpu",
        obs_preprocessor=None,
        action_extractor=None,
        **kwargs,
    ):
        """
        If `policy` is not given, we will try to create a random policy
        """
        if isinstance(device, str):
            device = torch.device(device)

        if obs_preprocessor is None:
            obs_preprocessor = env.get_obs_preprocessor(device=device)

        if action_extractor is None:
            action_extractor = env.get_action_extractor()

        if policy is None:
            policy = make_random_policy_for_env(env)

        return cls(
            policy,
            obs_preprocessor=obs_preprocessor,
            action_extractor=action_extractor,
            **kwargs,
        )
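
A minimal usage sketch, assuming this classmethod lives on an `Agent`-like class and that a Gym-backed `EnvWrapper` named `Gym` is available; `Agent`, `Gym`, and `my_policy` are illustrative names, not taken from the snippet above:

# Hypothetical usage of create_for_env; Agent, Gym, and my_policy are assumed names.
env = Gym("CartPole-v1")
# With policy=None, a random policy is created via make_random_policy_for_env.
random_agent = Agent.create_for_env(env, policy=None, device="cpu")
# A device string such as "cuda" is converted to torch.device before use.
trained_agent = Agent.create_for_env(env, policy=my_policy, device="cuda")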
Example 2
    def create_for_env(
        cls,
        env: EnvWrapper,
        policy: Policy,
        *,
        device: Union[str, torch.device] = "cpu",
        obs_preprocessor=None,
        action_extractor=None,
        **kwargs,
    ):
        if isinstance(device, str):
            device = torch.device(device)

        if obs_preprocessor is None:
            obs_preprocessor = env.get_obs_preprocessor(device=device)

        if action_extractor is None:
            action_extractor = env.get_action_extractor()

        return cls(
            policy,
            obs_preprocessor=obs_preprocessor,
            action_extractor=action_extractor,
            **kwargs,
        )
Example 3
def _create_replay_buffer_and_insert(env: EnvWrapper):
    env.seed(1)
    replay_buffer = ReplayBuffer(replay_capacity=6, batch_size=1)
    replay_buffer_inserter = make_replay_buffer_inserter(env)
    obs = env.reset()
    inserted = []
    terminal = False
    i = 0
    while not terminal and i < 5:
        logger.info(f"Iteration: {i}")
        action = env.action_space.sample()
        next_obs, reward, terminal, _ = env.step(action)
        inserted.append({
            "observation": obs,
            "action": action,
            "reward": reward,
            "terminal": terminal,
        })
        transition = Transition(
            mdp_id=0,
            sequence_number=i,
            observation=obs,
            action=action,
            reward=reward,
            terminal=terminal,
            log_prob=0.0,
        )
        replay_buffer_inserter(replay_buffer, transition)
        obs = next_obs
        i += 1

    return replay_buffer, inserted
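
A short sketch of how this helper could be exercised; the `Gym` wrapper and the `size`/`sample_transition_batch` accessors on `ReplayBuffer` are assumptions, not shown above:

# Hypothetical usage; the names below are illustrative.
env = Gym("CartPole-v0")
replay_buffer, inserted = _create_replay_buffer_and_insert(env)
# One buffer entry is expected per logged step.
assert replay_buffer.size == len(inserted)
batch = replay_buffer.sample_transition_batch(batch_size=1)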
Example 4
def run_test_episode_buffer(
    env: EnvWrapper,
    policy: Policy,
    trainer: Trainer,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool = False,
):
    training_policy = policy

    post_episode_callback = train_post_episode(env, trainer, use_gpu)

    # pyre-fixme[16]: `EnvWrapper` has no attribute `seed`.
    env.seed(SEED)
    # pyre-fixme[16]: `EnvWrapper` has no attribute `action_space`.
    env.action_space.seed(SEED)

    train_rewards = train_policy(
        env,
        training_policy,
        num_train_episodes,
        post_step=None,
        post_episode=post_episode_callback,
        use_gpu=use_gpu,
    )

    # Check whether the max score passed the score bar; since we explore during
    # training, the return can be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
    )

    serving_policy = policy
    eval_rewards = eval_policy(
        env, serving_policy, num_eval_episodes, serving=False
    )
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than {passing_score_bar}.\n"
Example 5
def run_episode(env: EnvWrapper,
                agent: Agent,
                mdp_id: int = 0,
                max_steps: Optional[int] = None) -> Trajectory:
    """
    Return sum of rewards from episode.
    After max_steps (if specified), the environment is assumed to be terminal.
    Can also specify the mdp_id and gamma of episode.
    """
    trajectory = Trajectory()
    obs = env.reset()
    possible_actions_mask = env.possible_actions_mask
    terminal = False
    num_steps = 0
    while not terminal:
        action, log_prob = agent.act(obs, possible_actions_mask)
        next_obs, reward, terminal, _ = env.step(action)
        next_possible_actions_mask = env.possible_actions_mask
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=float(reward),
            terminal=bool(terminal),
            log_prob=log_prob,
            possible_actions_mask=possible_actions_mask,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        possible_actions_mask = next_possible_actions_mask
        num_steps += 1
    agent.post_episode(trajectory)
    return trajectory
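
For example, the undiscounted return of an episode can be recovered from the returned trajectory; the `transitions` attribute below is an assumption about how `Trajectory` stores its steps, not something shown in the snippet:

# Hypothetical: run one bounded episode and sum its rewards.
trajectory = run_episode(env, agent, mdp_id=0, max_steps=200)
episode_return = sum(t.reward for t in trajectory.transitions)  # assumed attribute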
Example 6
    def create_for_env_with_serving_policy(
        cls,
        env: EnvWrapper,
        serving_policy: Policy,
        *,
        obs_preprocessor=None,
        action_extractor=None,
        **kwargs,
    ):
        # device shouldn't be provided as serving is CPU only
        if obs_preprocessor is None:
            obs_preprocessor = env.get_serving_obs_preprocessor()

        if action_extractor is None:
            action_extractor = env.get_serving_action_extractor()

        return cls(
            serving_policy,
            obs_preprocessor=obs_preprocessor,
            action_extractor=action_extractor,
            **kwargs,
        )
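
A brief sketch contrasting the two factories, assuming both classmethods live on the same `Agent`-like class; the serving variant takes no device argument because serving-side preprocessing runs on CPU only:

# Hypothetical comparison; Agent, env, policy, and serving_policy are assumed names.
train_agent = Agent.create_for_env(env, policy=policy, device="cuda")
serving_agent = Agent.create_for_env_with_serving_policy(env, serving_policy)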