Example 1
def evaluate_gym(
    env_name: str,
    model: ModelManager__Union,
    publisher: ModelPublisher__Union,
    num_eval_episodes: int,
    passing_score_bar: float,
    max_steps: Optional[int] = None,
):
    publisher_manager = publisher.value
    assert isinstance(
        publisher_manager, FileSystemPublisher
    ), f"publishing manager is type {type(publisher_manager)}, not FileSystemPublisher"
    env = Gym(env_name=env_name)
    torchscript_path = publisher_manager.get_latest_published_model(
        model.value)
    jit_model = torch.jit.load(torchscript_path)
    policy = create_predictor_policy_from_model(jit_model)
    agent = Agent.create_for_env_with_serving_policy(env, policy)
    rewards = evaluate_for_n_episodes(n=num_eval_episodes,
                                      env=env,
                                      agent=agent,
                                      max_steps=max_steps)
    avg_reward = np.mean(rewards)
    logger.info(f"Average reward over {num_eval_episodes} is {avg_reward}.\n"
                f"List of rewards: {rewards}")
    assert (avg_reward >= passing_score_bar
            ), f"{avg_reward} fails to pass the bar of {passing_score_bar}!"
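A minimal invocation sketch; model and publisher are assumed to be pre-built ModelManager__Union and ModelPublisher__Union instances (their construction is not shown), and the environment name and thresholds below are illustrative:

# Hypothetical usage; model/publisher must already point at a published model.
evaluate_gym(
    env_name="CartPole-v0",
    model=model,
    publisher=publisher,
    num_eval_episodes=20,
    passing_score_bar=100.0,
    max_steps=200,
)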
Example 2
def eval_policy(
    env: EnvWrapper,
    serving_policy: Policy,
    num_eval_episodes: int,
    serving: bool = True,
) -> np.ndarray:
    agent = (
        Agent.create_for_env_with_serving_policy(env, serving_policy)
        if serving
        else Agent.create_for_env(env, serving_policy)
    )

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    mean_eval = np.mean(eval_rewards)
    logger.info(f"average: {mean_eval};\tmax: {np.max(eval_rewards)}")
    return np.array(eval_rewards)
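A usage sketch, assuming env is an EnvWrapper and policy is a Policy built elsewhere (for example via a manager's create_policy(serving=True)); both names are placeholders:

# Hypothetical usage; env and policy are assumed to exist already.
rewards = eval_policy(
    env=env,
    serving_policy=policy,
    num_eval_episodes=10,
    serving=True,
)
logger.info(f"mean eval reward: {rewards.mean()}")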
Example 3
def make_agent_from_model(
    env: Gym,
    model: ModelManager__Union,
    publisher: ModelPublisher__Union,
    module_name: str,
):
    publisher_manager = publisher.value
    assert isinstance(
        publisher_manager, FileSystemPublisher
    ), f"publishing manager is type {type(publisher_manager)}, not FileSystemPublisher"
    module_names = model.value.serving_module_names()
    assert module_name in module_names, f"{module_name} not in {module_names}"
    torchscript_path = publisher_manager.get_latest_published_model(
        model.value, module_name)
    jit_model = torch.jit.load(torchscript_path)
    policy = create_predictor_policy_from_model(jit_model)
    agent = Agent.create_for_env_with_serving_policy(env, policy)
    return agent
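A usage sketch; model and publisher are assumed pre-built unions, and "default_model" is a placeholder that must match one of the names returned by serving_module_names():

# Hypothetical usage; the module name must be advertised by the model manager.
env = Gym(env_name="CartPole-v0")
agent = make_agent_from_model(
    env=env,
    model=model,
    publisher=publisher,
    module_name="default_model",
)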
Example 4
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    try:
        # pyre-fixme[16]: `Env` has no attribute `state_feature_config_provider`.
        manager.state_feature_config_provider = env.state_feature_config_provider
        logger.info(
            f"Using environment's state_feature_config_provider.\n"
            f"{manager.state_feature_config_provider}"
        )
    except AttributeError:
        logger.info("state_feature_config_provider override not applicable")

    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # First fill the replay buffer up to the burn-in size
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(
                f"Finished training episode {i} (len {len(trajectory)})"
                f" with reward {ep_reward}."
            )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")

    # Check whether the max score passed the score bar; since we explore during
    # training, the return could be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)})after training for "
        f"{len(train_rewards)} episodes is less than < {passing_score_bar}.\n"
    )

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    logger.info(f"average: {np.mean(eval_rewards)};\tmax: {np.max(eval_rewards)}")
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)},"
        f"less than < {passing_score_bar}.\n"
    )
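An invocation sketch with illustrative hyperparameters; every value below is an assumption rather than a tuned setting from the source, and model must wrap an already-configured model manager:

# Hypothetical usage; numbers are placeholders for a CartPole-scale task.
run_test(
    env_name="CartPole-v0",
    model=model,
    replay_memory_size=100000,
    train_every_ts=1,
    train_after_ts=20000,
    num_train_episodes=60,
    max_steps=200,
    passing_score_bar=100.0,
    num_eval_episodes=20,
    use_gpu=False,
)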
Example 5
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    # First fill the replay buffer up to the burn-in size
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(env=env,
                       replay_buffer=replay_buffer,
                       desired_size=train_after_ts)

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        device=device,
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(env=env,
                                     agent=agent,
                                     mdp_id=i,
                                     max_steps=max_steps)
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(
                f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"reward after {len(train_rewards)} episodes is {train_rewards[-1]},"
        f"less than < {passing_score_bar}...\n"
        f"Full reward history: {train_rewards}")

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(n=num_eval_episodes,
                                           env=env,
                                           agent=agent,
                                           max_steps=max_steps).squeeze(1)
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)},"
        f"less than < {passing_score_bar}...\n"
        f"Full eval rewards: {eval_rewards}.")

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
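This variant differs from Example 4 mainly in its pass/fail logic: it asserts on the reward of the final training episode rather than the maximum across all episodes, which is a stricter check under an exploring policy, and it performs the assertions before logging the reward summaries.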