Example #1
def evaluate_cem(env, manager, num_eval_episodes: int):
    # NOTE: for CEM, serving isn't implemented
    policy = manager.create_policy(serving=False)
    agent = Agent.create_for_env(env, policy)
    return evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=env.max_steps
    )
Example #2
def eval_policy(
    env: EnvWrapper,
    serving_policy: Policy,
    num_eval_episodes: int,
    serving: bool = True,
) -> np.ndarray:
    agent = (
        Agent.create_for_env_with_serving_policy(env, serving_policy)
        if serving
        else Agent.create_for_env(env, serving_policy)
    )

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    mean_eval = np.mean(eval_rewards)
    logger.info(f"average: {mean_eval};\tmax: {np.max(eval_rewards)}")
    return np.array(eval_rewards)
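A minimal usage sketch for eval_policy, assuming an environment wrapper and a model manager like those in the later examples; the environment name, episode count, and the `manager` variable are assumptions, not from the source.
# Hedged usage sketch (env name, episode count, and `manager` are assumptions):
env = Gym(env_name="CartPole-v0")
serving_policy = manager.create_policy(serving=True)
eval_rewards = eval_policy(env, serving_policy, num_eval_episodes=20, serving=True)
print(f"mean eval reward: {eval_rewards.mean()}")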
Example #3
def offline_gym(
    env: str,
    pkl_path: str,
    num_episodes_for_data_batch: int,
    max_steps: Optional[int],
    seed: Optional[int] = None,
):
    """
    Generate samples from a DiscreteRandomPolicy on the Gym environment and
    save the results as a pickled pandas DataFrame.
    """
    initialize_seed(seed)
    env = EnvFactory.make(env)

    policy = DiscreteRandomPolicy.create_for_env(env)
    dataset = RLDataset()
    for i in range(num_episodes_for_data_batch):
        logger.info(f"Starting episode {i}")
        post_step = log_data_post_step(dataset=dataset, mdp_id=str(i), env=env)
        agent = Agent.create_for_env(env,
                                     policy,
                                     post_transition_callback=post_step)
        run_episode(env=env, agent=agent, max_steps=max_steps)

    logger.info(f"Saving dataset with {len(dataset)} samples to {pkl_path}")
    df = dataset.to_pandas_df()
    df.to_pickle(pkl_path)
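For illustration, a hedged sketch of how offline_gym might be invoked to dump a small random-policy dataset; the path and counts are placeholders.
# Illustrative call (all argument values are placeholders):
offline_gym(
    env="CartPole-v0",
    pkl_path="/tmp/cartpole_random.pkl",
    num_episodes_for_data_batch=100,
    max_steps=200,
    seed=42,
)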
Example #4
def train_policy(
    env: EnvWrapper,
    training_policy: Policy,
    num_train_episodes: int,
    post_step: Optional[PostStep] = None,
    post_episode: Optional[PostEpisode] = None,
    use_gpu: bool = False,
) -> np.ndarray:
    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        post_episode_callback=post_episode,
        device=device,
    )
    running_reward = 0
    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        with trange(num_train_episodes, unit=" epoch") as t:
            for i in t:
                # Note: run_episode also performs a training step for the agent, if specified in post_step
                trajectory = run_episode(env=env, agent=agent, mdp_id=i, max_steps=200)
                ep_reward = trajectory.calculate_cumulative_reward()
                train_rewards.append(ep_reward)
                running_reward *= REWARD_DECAY
                running_reward += (1 - REWARD_DECAY) * ep_reward
                t.set_postfix(reward=running_reward)

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")
    return np.array(train_rewards)
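A sketch of wiring train_policy to a replay-buffer post-step, using the train_with_replay_buffer_post_step signature seen in Example #18; `env`, `trainer`, and `training_policy` are assumed to exist as in the surrounding examples, and all sizes and frequencies are illustrative.
# Hedged setup sketch (buffer size, batch size, and frequency are assumptions):
replay_buffer = ReplayBuffer(replay_capacity=100000, batch_size=32)
post_step = train_with_replay_buffer_post_step(
    replay_buffer=replay_buffer,
    env=env,
    trainer=trainer,
    training_freq=1,
    batch_size=32,
    device=torch.device("cpu"),
)
train_rewards = train_policy(
    env, training_policy, num_train_episodes=150, post_step=post_step, use_gpu=False
)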
Example #5
def evaluate_gym(
    env_name: str,
    model: torch.nn.Module,
    eval_temperature: float,
    num_eval_episodes: int,
    passing_score_bar: float,
    max_steps: Optional[int] = None,
):
    env: gym.Env = EnvFactory.make(env_name)
    policy = create_predictor_policy_from_model(
        env, model, eval_temperature=eval_temperature)

    # since we already return softmax action, override action_extractor
    agent = Agent.create_for_env(
        env, policy=policy, action_extractor=policy.get_action_extractor())

    rewards = []
    for _ in range(num_eval_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        rewards.append(ep_reward)

    avg_reward = np.mean(rewards)
    logger.info(f"Average reward over {num_eval_episodes} is {avg_reward}.\n"
                f"List of rewards: {rewards}")
    assert (avg_reward >= passing_score_bar
            ), f"{avg_reward} fails to pass the bar of {passing_score_bar}!"
Example #6
def run_test_offline(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    num_batches_per_epoch: int,
    num_train_epochs: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    minibatch_size: int,
    use_gpu: bool,
):
    env = Gym(env_name=env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    # first fill the replay buffer to burn_in
    replay_buffer = ReplayBuffer(replay_capacity=replay_memory_size,
                                 batch_size=minibatch_size)
    # always fill full RB
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(
        env=env,
        replay_buffer=replay_buffer,
        desired_size=replay_memory_size,
        agent=agent,
    )

    device = torch.device("cuda") if use_gpu else None
    # pyre-fixme[6]: Expected `device` for 2nd param but got `Optional[torch.device]`.
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        trainer, device, env)

    writer = SummaryWriter()
    with summary_writer_context(writer):
        for epoch in range(num_train_epochs):
            logger.info(f"Evaluating before epoch {epoch}: ")
            eval_rewards = evaluate_cem(env, manager, 1)
            for _ in tqdm(range(num_batches_per_epoch)):
                train_batch = replay_buffer.sample_transition_batch()
                preprocessed_batch = trainer_preprocessor(train_batch)
                trainer.train(preprocessed_batch)

    logger.info(f"Evaluating after training for {num_train_epochs} epochs: ")
    eval_rewards = evaluate_cem(env, manager, num_eval_episodes)
    mean_rewards = np.mean(eval_rewards)
    assert (mean_rewards >= passing_score_bar
            ), f"{mean_rewards} doesn't pass the bar {passing_score_bar}."
Example #7
def run_test_online_episode(
    env: Env__Union,
    model: ModelManager__Union,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """
    Run an online learning test. At the end of each episode, training is run on the trajectory.
    """
    env = env.value
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.build_trainer(
        use_gpu=use_gpu,
        normalization_data_map=normalization,
    )
    policy = manager.create_policy(trainer, serving=False)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")

    agent = Agent.create_for_env(env, policy, device=device)

    pl_trainer = pl.Trainer(
        max_epochs=1,
        gpus=int(use_gpu),
        deterministic=True,
        default_root_dir=f"lightning_log_{str(uuid.uuid4())}",
    )
    dataset = EpisodicDataset(env=env,
                              agent=agent,
                              num_episodes=num_train_episodes,
                              seed=SEED)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              collate_fn=identity_collate)
    pl_trainer.fit(trainer, data_loader)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"
Example #8
def fill_replay_buffer(env, replay_buffer: ReplayBuffer, desired_size: int):
    """ Fill replay buffer with random transitions until size reaches desired_size. """
    assert (
        0 < desired_size and desired_size <= replay_buffer._replay_capacity
    ), f"It's not true that 0 < {desired_size} <= {replay_buffer._replay_capacity}."
    assert replay_buffer.size < desired_size, (
        f"Replay buffer already has {replay_buffer.size} elements. "
        f"(more than desired_size = {desired_size})")
    logger.info(
        f" Starting to fill replay buffer using random policy to size: {desired_size}."
    )
    random_policy = make_random_policy_for_env(env)
    post_step = add_replay_buffer_post_step(replay_buffer, env=env)

    agent = Agent.create_for_env(env,
                                 policy=random_policy,
                                 post_transition_callback=post_step)
    max_episode_steps = env.max_steps
    with tqdm(
            total=desired_size - replay_buffer.size,
            desc=f"Filling replay buffer from {replay_buffer.size} to size {desired_size} using random policy",
    ) as pbar:
        mdp_id = 0
        while replay_buffer.size < desired_size:
            last_size = replay_buffer.size
            max_steps = desired_size - replay_buffer.size - 1
            if max_episode_steps is not None:
                max_steps = min(max_episode_steps, max_steps)
            run_episode(env=env,
                        agent=agent,
                        mdp_id=mdp_id,
                        max_steps=max_steps)
            size_delta = replay_buffer.size - last_size
            # The assertion below is commented out because it doesn't
            # support input samples with seq_len > 1. This should be
            # treated as a bug and needs to be fixed in the future.
            # assert (
            #     size_delta >= 0
            # ), f"size delta is {size_delta} which should be non-negative."
            pbar.update(n=size_delta)
            mdp_id += 1
            if size_delta <= 0:
                # replay buffer size isn't increasing... so stop early
                break

    if replay_buffer.size >= desired_size:
        logger.info(
            f"Successfully filled replay buffer to size: {replay_buffer.size}!"
        )
    else:
        logger.info(
            f"Stopped early and filled replay buffer to size: {replay_buffer.size}."
        )
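A minimal sketch of pre-filling a buffer with this helper before training, assuming the Gym wrapper and ReplayBuffer constructor used elsewhere in these examples; sizes are placeholders.
# Hedged burn-in sketch (env name and sizes are placeholders):
env = Gym(env_name="CartPole-v0")
replay_buffer = ReplayBuffer(replay_capacity=10000, batch_size=32)
fill_replay_buffer(env=env, replay_buffer=replay_buffer, desired_size=1000)
print(f"replay buffer now holds {replay_buffer.size} transitions")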
Example #9
def run_test(
    env: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_episodes: int,
    max_steps: Optional[int],
    last_score_bar: float,
):
    env = EnvFactory.make(env)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is {normalization}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=False,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    policy = manager.create_policy()
    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        replay_burnin=train_after_ts,
    )

    agent = Agent.create_for_env(env,
                                 policy=policy,
                                 post_transition_callback=post_step)

    reward_history = []
    for i in range(num_episodes):
        logger.info(f"running episode {i}")
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        reward_history.append(ep_reward)

    assert reward_history[-1] >= last_score_bar, (
        f"reward after {len(reward_history)} episodes is {reward_history[-1]},"
        f"less than < {last_score_bar}...\n"
        f"Full reward history: {reward_history}")

    return reward_history
Example #10
def fill_replay_buffer(env: Env, replay_buffer: ReplayBuffer,
                       desired_size: int):
    """ Fill replay buffer with random transitions until size reaches desired_size. """
    assert (
        0 < desired_size and desired_size <= replay_buffer._replay_capacity
    ), f"It's not true that 0 < {desired_size} <= {replay_buffer._replay_capacity}."
    assert replay_buffer.size < desired_size, (
        f"Replay buffer already has {replay_buffer.size} elements. "
        f"(more than desired_size = {desired_size})")
    logger.info(f"Starting to fill replay buffer to size: {desired_size}.")
    random_policy = make_random_policy_for_env(env)
    post_step = add_replay_buffer_post_step(replay_buffer, env=env)
    agent = Agent.create_for_env(env,
                                 policy=random_policy,
                                 post_transition_callback=post_step)
    max_episode_steps = get_max_steps(env)
    with tqdm(
            total=desired_size - replay_buffer.size,
            desc=f"Filling replay buffer from {replay_buffer.size} to size {desired_size}",
    ) as pbar:
        mdp_id = 0
        while replay_buffer.size < desired_size:
            last_size = replay_buffer.size
            max_steps = desired_size - replay_buffer.size - 1
            if max_episode_steps is not None:
                max_steps = min(max_episode_steps, max_steps)
            run_episode(env=env,
                        agent=agent,
                        mdp_id=mdp_id,
                        max_steps=max_steps)
            size_delta = replay_buffer.size - last_size
            assert (
                size_delta >= 0
            ), f"size delta is {size_delta} which should be non-negative."
            pbar.update(n=size_delta)
            mdp_id += 1
            if size_delta == 0:
                # replay buffer size isn't increasing... so stop early
                break

    if replay_buffer.size >= desired_size:
        logger.info(
            f"Successfully filled replay buffer to size: {replay_buffer.size}!"
        )
    else:
        logger.info(
            f"Stopped early and filled replay buffer to size: {replay_buffer.size}."
        )
Example #11
def offline_gym_random(
    env_name: str,
    pkl_path: str,
    num_train_transitions: int,
    max_steps: Optional[int],
    seed: int = 1,
):
    """
    Generate samples from a random policy on the Gym environment and
    save the results to the given pickle path.
    """
    env = Gym(env_name=env_name)
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    return _offline_gym(env, agent, pkl_path, num_train_transitions, max_steps,
                        seed)
Example #12
def train_mdnrnn(
    env: EnvWrapper,
    trainer: MDNRNNTrainer,
    trainer_preprocessor,
    num_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    # for optional validation
    test_replay_buffer=None,
):
    train_replay_buffer = ReplayBuffer(
        replay_capacity=num_train_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(env, train_replay_buffer, num_train_transitions, agent)
    num_batch_per_epoch = train_replay_buffer.size // batch_size

    logger.info("Made RBs, starting to train now!")
    optimizer = trainer.configure_optimizers()[0]
    for _ in range(num_train_epochs):
        for i in range(num_batch_per_epoch):
            batch = train_replay_buffer.sample_transition_batch(batch_size=batch_size)
            preprocessed_batch = trainer_preprocessor(batch)
            loss = next(trainer.train_step_gen(preprocessed_batch, i))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation
        if test_replay_buffer is not None:
            with torch.no_grad():
                trainer.memory_network.mdnrnn.eval()
                test_batch = test_replay_buffer.sample_transition_batch(
                    batch_size=batch_size
                )
                preprocessed_test_batch = trainer_preprocessor(test_batch)
                valid_losses = trainer.get_loss(preprocessed_test_batch)
                trainer.memory_network.mdnrnn.train()
    return trainer
Example #13
def create_embed_rl_dataset(
    env: EnvWrapper,
    memory_network: MemoryNetwork,
    num_state_embed_transitions: int,
    batch_size: int,
    seq_len: int,
    hidden_dim: int,
    use_gpu: bool,
):
    assert isinstance(env.action_space, gym.spaces.Discrete)
    assert isinstance(env.observation_space, gym.spaces.Box)
    assert len(env.observation_space.shape) == 1
    logger.info("Starting to create embedded RL Dataset!")

    # seq_len + 1 because the MDNRNN embeds the first seq_len steps and then
    # the embedded state is concatenated with the last step.
    # I.e. (o1, o2, ..., on) -> RNN -> h1, h2, ..., hn
    # and we set s_{n+1} = [o_{n+1}, h_n]
    embed_env = StateEmbedEnvironment(
        gym_env=env, mdnrnn=memory_network, max_embed_seq_len=seq_len + 1
    )
    # now create a filled replay buffer of embeddings
    # new obs shape dim = state_dim + hidden_dim
    embed_rb = ReplayBuffer(
        replay_capacity=num_state_embed_transitions, batch_size=batch_size, stack_size=1
    )
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(
        env=embed_env,
        replay_buffer=embed_rb,
        desired_size=num_state_embed_transitions,
        agent=agent,
    )
    batch = embed_rb.sample_transition_batch(batch_size=num_state_embed_transitions)
    state_min = min(batch.state.min(), batch.next_state.min()).item()
    state_max = max(batch.state.max(), batch.next_state.max()).item()
    logger.info(
        f"Finished making embed dataset with size {embed_rb.size}, "
        f"min {state_min}, max {state_max}"
    )
    return embed_rb, state_min, state_max
Example #14
def setUp(self):
    logging.getLogger().setLevel(logging.DEBUG)
    env = Gym("CartPole-v0")
    norm = build_normalizer(env)
    net_builder = FullyConnected(sizes=[8], activations=["linear"])
    cartpole_scorer = net_builder.build_q_network(
        state_feature_config=None,
        state_normalization_data=norm["state"],
        output_dim=len(norm["action"].dense_normalization_parameters),
    )
    policy = Policy(scorer=cartpole_scorer, sampler=SoftmaxActionSampler())
    agent = Agent.create_for_env(env, policy)
    self.max_steps = 3
    self.num_episodes = 6
    self.dataset = EpisodicDataset(
        env=env,
        agent=agent,
        num_episodes=self.num_episodes,
        seed=0,
        max_steps=self.max_steps,
    )
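A hedged sketch of how the dataset built in setUp might be consumed in a test, mirroring the DataLoader plus identity_collate pattern from Example #7; the test method name and assertion are hypothetical.
def test_iterates_episodes(self):
    # Hypothetical test method, not from the source.
    data_loader = torch.utils.data.DataLoader(self.dataset,
                                              collate_fn=identity_collate)
    episodes = list(data_loader)
    # one item per episode, each truncated to at most self.max_steps transitions
    assert len(episodes) == self.num_episodes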
Example #15
def train_mdnrnn_and_train_on_embedded_env(
    env_name: str,
    embedding_model: ModelManager__Union,
    num_embedding_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_embedding_train_epochs: int,
    train_model: ModelManager__Union,
    num_state_embed_transitions: int,
    num_agent_train_epochs: int,
    num_agent_eval_epochs: int,
    use_gpu: bool,
    passing_score_bar: float,
    saved_mdnrnn_path: Optional[str] = None,
):
    """Train an agent on states embedded by the MDNRNN."""
    env = EnvFactory.make(env_name)
    env.seed(SEED)

    embedding_manager = embedding_model.value
    embedding_trainer = embedding_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    embedding_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        embedding_trainer, device, env
    )
    if saved_mdnrnn_path is None:
        # train from scratch
        embedding_trainer = train_mdnrnn(
            env=env,
            trainer=embedding_trainer,
            trainer_preprocessor=embedding_trainer_preprocessor,
            num_train_transitions=num_embedding_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_embedding_train_epochs,
        )
    else:
        # load a pretrained model, and just evaluate it
        embedding_trainer.memory_network.mdnrnn.load_state_dict(
            torch.load(saved_mdnrnn_path)
        )

    # create embedding dataset
    embed_rb, state_min, state_max = create_embed_rl_dataset(
        env=env,
        memory_network=embedding_trainer.memory_network,
        num_state_embed_transitions=num_state_embed_transitions,
        batch_size=batch_size,
        seq_len=seq_len,
        hidden_dim=embedding_trainer.params.hidden_size,
        use_gpu=use_gpu,
    )
    embed_env = StateEmbedEnvironment(
        gym_env=env,
        mdnrnn=embedding_trainer.memory_network,
        max_embed_seq_len=seq_len,
        state_min_value=state_min,
        state_max_value=state_max,
    )
    agent_manager = train_model.value
    agent_trainer = agent_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(embed_env),
    )
    device = "cuda" if use_gpu else "cpu"
    agent_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        agent_trainer, device, env
    )
    num_batch_per_epoch = embed_rb.size // batch_size
    for epoch in range(num_agent_train_epochs):
        for _ in tqdm(range(num_batch_per_epoch), desc=f"epoch {epoch}"):
            batch = embed_rb.sample_transition_batch_tensor(batch_size=batch_size)
            preprocessed_batch = agent_trainer_preprocessor(batch)
            agent_trainer.train(preprocessed_batch)

    # evaluate model
    rewards = []
    policy = agent_manager.create_policy(serving=False)
    agent = Agent.create_for_env(embed_env, policy=policy, device=device)
    for i in range(num_agent_eval_epochs):
        ep_reward = run_episode(env=embed_env, agent=agent)
        rewards.append(ep_reward)
        logger.info(f"Finished eval episode {i} with reward {ep_reward}.")
    logger.info(f"Average eval reward is {np.mean(rewards)}.")
    assert (
        np.mean(rewards) >= passing_score_bar
    ), f"average reward doesn't pass our bar {passing_score_bar}"
    return rewards
Example #16
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(env=env,
                       replay_buffer=replay_buffer,
                       desired_size=train_after_ts)

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        # pyre-fixme[6]: Expected `Union[str, torch.device]` for 4th param but got
        #  `Optional[torch.device]`.
        device=device,
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(env=env,
                                     agent=agent,
                                     mdp_id=i,
                                     max_steps=max_steps)
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(
                f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"reward after {len(train_rewards)} episodes is {train_rewards[-1]},"
        f"less than < {passing_score_bar}...\n"
        f"Full reward history: {train_rewards}")

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(n=num_eval_episodes,
                                           env=env,
                                           agent=agent,
                                           max_steps=max_steps).squeeze(1)
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)},"
        f"less than < {passing_score_bar}...\n"
        f"Full eval rewards: {eval_rewards}.")

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
Example #17
def train_mdnrnn_and_train_on_embedded_env(
    env_name: str,
    embedding_model: ModelManager__Union,
    num_embedding_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_embedding_train_epochs: int,
    train_model: ModelManager__Union,
    num_state_embed_transitions: int,
    num_agent_train_epochs: int,
    num_agent_eval_epochs: int,
    use_gpu: bool,
    passing_score_bar: float,
    saved_mdnrnn_path: Optional[str] = None,
):
    """Train an agent on states embedded by the MDNRNN."""
    env = Gym(env_name=env_name)
    env.seed(SEED)

    embedding_manager = embedding_model.value
    embedding_trainer = embedding_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    embedding_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        embedding_trainer,
        # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
        device,
        env,
    )
    if saved_mdnrnn_path is None:
        # train from scratch
        embedding_trainer = train_mdnrnn(
            env=env,
            trainer=embedding_trainer,
            trainer_preprocessor=embedding_trainer_preprocessor,
            num_train_transitions=num_embedding_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_embedding_train_epochs,
        )
    else:
        # load a pretrained model, and just evaluate it
        embedding_trainer.memory_network.mdnrnn.load_state_dict(
            torch.load(saved_mdnrnn_path))

    # create embedding dataset
    embed_rb, state_min, state_max = create_embed_rl_dataset(
        env=env,
        memory_network=embedding_trainer.memory_network,
        num_state_embed_transitions=num_state_embed_transitions,
        batch_size=batch_size,
        seq_len=seq_len,
        hidden_dim=embedding_trainer.params.hidden_size,
        use_gpu=use_gpu,
    )
    embed_env = StateEmbedEnvironment(
        gym_env=env,
        mdnrnn=embedding_trainer.memory_network,
        max_embed_seq_len=seq_len,
        state_min_value=state_min,
        state_max_value=state_max,
    )
    agent_manager = train_model.value
    agent_trainer = agent_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        # pyre-fixme[6]: Expected `EnvWrapper` for 1st param but got
        #  `StateEmbedEnvironment`.
        normalization_data_map=build_normalizer(embed_env),
    )
    device = "cuda" if use_gpu else "cpu"
    agent_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        agent_trainer,
        # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
        device,
        env,
    )
    num_batch_per_epoch = embed_rb.size // batch_size
    # FIXME: This has to be wrapped in dataloader
    for epoch in range(num_agent_train_epochs):
        for _ in tqdm(range(num_batch_per_epoch), desc=f"epoch {epoch}"):
            batch = embed_rb.sample_transition_batch(batch_size=batch_size)
            preprocessed_batch = agent_trainer_preprocessor(batch)
            # FIXME: This should be fitted with Lightning's trainer
            agent_trainer.train(preprocessed_batch)

    # evaluate model
    rewards = []
    policy = agent_manager.create_policy(serving=False)
    # pyre-fixme[6]: Expected `EnvWrapper` for 1st param but got
    #  `StateEmbedEnvironment`.
    agent = Agent.create_for_env(embed_env, policy=policy, device=device)
    # num_processes=1 needed to avoid workers from dying on CircleCI tests
    rewards = evaluate_for_n_episodes(
        n=num_agent_eval_epochs,
        # pyre-fixme[6]: Expected `EnvWrapper` for 2nd param but got
        #  `StateEmbedEnvironment`.
        env=embed_env,
        agent=agent,
        num_processes=1,
    )
    assert (np.mean(rewards) >= passing_score_bar
            ), f"average reward doesn't pass our bar {passing_score_bar}"
    return rewards
Example #18
def run_test(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
    minibatch_size: Optional[int] = None,
):
    env = env.value

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if not isinstance(trainer, pl.LightningModule):
        if minibatch_size is None:
            minibatch_size = trainer.minibatch_size
        assert minibatch_size == trainer.minibatch_size

    assert minibatch_size is not None

    replay_buffer = ReplayBuffer(replay_capacity=replay_memory_size,
                                 batch_size=minibatch_size)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, minibatch_size)
    fill_replay_buffer(env=env,
                       replay_buffer=replay_buffer,
                       desired_size=train_after_ts)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if isinstance(trainer, pl.LightningModule):
        agent = Agent.create_for_env(env, policy=training_policy)
        # TODO: Simplify this setup by creating LightningDataModule
        dataset = ReplayBufferDataset.create_for_trainer(
            trainer,
            env,
            agent,
            replay_buffer,
            batch_size=minibatch_size,
            training_frequency=train_every_ts,
            num_episodes=num_train_episodes,
            max_steps=200,
        )
        data_loader = torch.utils.data.DataLoader(dataset,
                                                  collate_fn=identity_collate)
        # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
        pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu))
        pl_trainer.fit(trainer, data_loader)

        # TODO: Also check train_reward
    else:
        post_step = train_with_replay_buffer_post_step(
            replay_buffer=replay_buffer,
            env=env,
            trainer=trainer,
            training_freq=train_every_ts,
            batch_size=trainer.minibatch_size,
            device=device,
        )

        env.seed(SEED)
        env.action_space.seed(SEED)

        train_rewards = train_policy(
            env,
            training_policy,
            num_train_episodes,
            post_step=post_step,
            post_episode=None,
            use_gpu=use_gpu,
        )

        # Check whether the max score passed the score bar; since we explore during
        # training, the return could be bad (leading to flakiness in C51 and QRDQN).
        assert np.max(train_rewards) >= passing_score_bar, (
            f"max reward ({np.max(train_rewards)}) after training for "
            f"{len(train_rewards)} episodes is less than < {passing_score_bar}.\n"
        )

    serving_policy = manager.create_policy(serving=True)

    eval_rewards = eval_policy(env,
                               serving_policy,
                               num_eval_episodes,
                               serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"
Example #19
def create_df_from_replay_buffer(
    env,
    problem_domain: ProblemDomain,
    desired_size: int,
    multi_steps: Optional[int],
    ds: str,
) -> pd.DataFrame:
    # fill the replay buffer
    set_seed(env, SEED)
    if multi_steps is None:
        update_horizon = 1
        return_as_timeline_format = False
    else:
        update_horizon = multi_steps
        return_as_timeline_format = True
    is_multi_steps = multi_steps is not None

    replay_buffer = ReplayBuffer(
        replay_capacity=desired_size,
        batch_size=1,
        update_horizon=update_horizon,
        return_as_timeline_format=return_as_timeline_format,
    )
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(env, replay_buffer, desired_size, agent)

    batch = replay_buffer.sample_all_valid_transitions()
    n = batch.state.shape[0]
    logger.info(f"Creating df of size {n}.")

    def discrete_feat_transform(elem) -> str:
        """query data expects str format"""
        return str(elem.item())

    def continuous_feat_transform(elem: List[float]) -> Dict[int, float]:
        """query data expects sparse format"""
        assert isinstance(elem, torch.Tensor), f"{type(elem)} isn't tensor"
        assert len(elem.shape) == 1, f"{elem.shape} isn't 1-dimensional"
        return {i: s.item() for i, s in enumerate(elem)}

    def make_parametric_feat_transform(one_hot_dim: int):
        """one-hot and then continuous_feat_transform"""

        def transform(elem) -> Dict[int, float]:
            elem_tensor = torch.tensor(elem.item())
            one_hot_feat = F.one_hot(elem_tensor, one_hot_dim).float()
            return continuous_feat_transform(one_hot_feat)

        return transform

    state_features = feature_transform(batch.state, continuous_feat_transform)
    next_state_features = feature_transform(
        batch.next_state,
        continuous_feat_transform,
        is_next_with_multi_steps=is_multi_steps,
    )

    if problem_domain == ProblemDomain.DISCRETE_ACTION:
        # discrete action is str
        action = feature_transform(batch.action, discrete_feat_transform)
        next_action = feature_transform(
            batch.next_action,
            discrete_feat_transform,
            is_next_with_multi_steps=is_multi_steps,
            replace_when_terminal="",
            terminal=batch.terminal,
        )
    elif problem_domain == ProblemDomain.PARAMETRIC_ACTION:
        # continuous action is Dict[int, double]
        assert isinstance(env.action_space, gym.spaces.Discrete)
        parametric_feat_transform = make_parametric_feat_transform(env.action_space.n)
        action = feature_transform(batch.action, parametric_feat_transform)
        next_action = feature_transform(
            batch.next_action,
            parametric_feat_transform,
            is_next_with_multi_steps=is_multi_steps,
            replace_when_terminal={},
            terminal=batch.terminal,
        )
    elif problem_domain == ProblemDomain.CONTINUOUS_ACTION:
        action = feature_transform(batch.action, continuous_feat_transform)
        next_action = feature_transform(
            batch.next_action,
            continuous_feat_transform,
            is_next_with_multi_steps=is_multi_steps,
            replace_when_terminal={},
            terminal=batch.terminal,
        )
    elif problem_domain == ProblemDomain.MDN_RNN:
        action = feature_transform(batch.action, discrete_feat_transform)
        assert multi_steps is not None
        next_action = feature_transform(
            batch.next_action,
            discrete_feat_transform,
            is_next_with_multi_steps=True,
            replace_when_terminal="",
            terminal=batch.terminal,
        )
    else:
        raise NotImplementedError(f"model type: {problem_domain}.")

    if multi_steps is None:
        time_diff = [1] * n
        reward = batch.reward.squeeze(1).tolist()
        metrics = [{"reward": r} for r in reward]
    else:
        time_diff = [[1] * len(ns) for ns in next_state_features]
        reward = [reward_list.tolist() for reward_list in batch.reward]
        metrics = [
            [{"reward": r.item()} for r in reward_list] for reward_list in batch.reward
        ]

    # TODO(T67265031): change this to int
    mdp_id = [str(i.item()) for i in batch.mdp_id]
    sequence_number = batch.sequence_number.squeeze(1).tolist()
    # In the production data, all sequence_number_ordinal values start from 1,
    # so add 1 here to stay consistent with that convention.
    sequence_number_ordinal = (batch.sequence_number.squeeze(1) + 1).tolist()
    action_probability = batch.log_prob.exp().squeeze(1).tolist()
    df_dict = {
        "state_features": state_features,
        "next_state_features": next_state_features,
        "action": action,
        "next_action": next_action,
        "reward": reward,
        "action_probability": action_probability,
        "metrics": metrics,
        "time_diff": time_diff,
        "mdp_id": mdp_id,
        "sequence_number": sequence_number,
        "sequence_number_ordinal": sequence_number_ordinal,
        "ds": [ds] * n,
    }

    if problem_domain == ProblemDomain.PARAMETRIC_ACTION:
        # Possible actions are List[Dict[int, float]]
        assert isinstance(env.action_space, gym.spaces.Discrete)
        possible_actions = [{i: 1.0} for i in range(env.action_space.n)]

    elif problem_domain == ProblemDomain.DISCRETE_ACTION:
        # Possible actions are List[str]
        assert isinstance(env.action_space, gym.spaces.Discrete)
        possible_actions = [str(i) for i in range(env.action_space.n)]

    elif problem_domain == ProblemDomain.MDN_RNN:
        # Possible actions are List[str]
        assert isinstance(env.action_space, gym.spaces.Discrete)
        possible_actions = [str(i) for i in range(env.action_space.n)]

    # these are fillers, which should have correct shape
    pa_features = range(n)
    pna_features = time_diff
    if problem_domain in (
        ProblemDomain.DISCRETE_ACTION,
        ProblemDomain.PARAMETRIC_ACTION,
        ProblemDomain.MDN_RNN,
    ):

        def pa_transform(x):
            return possible_actions

        df_dict["possible_actions"] = feature_transform(pa_features, pa_transform)
        df_dict["possible_next_actions"] = feature_transform(
            pna_features,
            pa_transform,
            is_next_with_multi_steps=is_multi_steps,
            replace_when_terminal=[],
            terminal=batch.terminal,
        )

    df = pd.DataFrame(df_dict)
    # validate df
    validate_mdp_ids_seq_nums(df)
    # shuffling (sample the whole batch)
    df = df.reindex(np.random.permutation(df.index))
    return df
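A hedged sketch of producing and persisting such a DataFrame for the discrete-action case; the env construction, sizes, ds value, and output path are placeholders.
# Illustrative usage (all values are placeholders):
env = Gym(env_name="CartPole-v0")
df = create_df_from_replay_buffer(
    env=env,
    problem_domain=ProblemDomain.DISCRETE_ACTION,
    desired_size=1000,
    multi_steps=None,
    ds="2021-06-27",
)
df.to_pickle("/tmp/cartpole_offline_df.pkl")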
Example #20
def run_test_online_episode(
    env: Env__Union,
    model: ModelManager__Union,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """
    Run an online learning test. At the end of each episode, training is run on the trajectory.
    """
    env = env.value
    # pyre-fixme[16]: Module `pl` has no attribute `seed_everything`.
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    policy = manager.create_policy(serving=False)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")

    agent = Agent.create_for_env(env, policy, device=device)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if isinstance(trainer, pl.LightningModule):
        # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
        pl_trainer = pl.Trainer(max_epochs=1,
                                gpus=int(use_gpu),
                                deterministic=True)
        dataset = EpisodicDataset(env=env,
                                  agent=agent,
                                  num_episodes=num_train_episodes,
                                  seed=SEED)
        pl_trainer.fit(trainer, dataset)
    else:
        post_episode_callback = train_post_episode(env, trainer, use_gpu)
        _ = train_policy(
            env,
            policy,
            num_train_episodes,
            post_step=None,
            post_episode=post_episode_callback,
            use_gpu=use_gpu,
        )

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"
Example #21
def run_test_replay_buffer(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
    minibatch_size: Optional[int] = None,
):
    """
    Run an online learning test with a replay buffer. The replay buffer is pre-filled before training starts.
    Each transition is added to the replay buffer immediately after it takes place.
    """
    env = env.value
    # pyre-fixme[16]: Module `pl` has no attribute `seed_everything`.
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if not isinstance(trainer, pl.LightningModule):
        if minibatch_size is None:
            minibatch_size = trainer.minibatch_size
        assert minibatch_size == trainer.minibatch_size

    assert minibatch_size is not None

    replay_buffer = ReplayBuffer(replay_capacity=replay_memory_size,
                                 batch_size=minibatch_size)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer using random policy
    train_after_ts = max(train_after_ts, minibatch_size)
    fill_replay_buffer(env=env,
                       replay_buffer=replay_buffer,
                       desired_size=train_after_ts)

    agent = Agent.create_for_env(env, policy=training_policy, device=device)
    # TODO: Simplify this setup by creating LightningDataModule
    dataset = ReplayBufferDataset.create_for_trainer(
        trainer,
        env,
        agent,
        replay_buffer,
        batch_size=minibatch_size,
        training_frequency=train_every_ts,
        num_episodes=num_train_episodes,
        max_steps=200,
        device=device,
    )
    data_loader = torch.utils.data.DataLoader(dataset,
                                              collate_fn=identity_collate)
    # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
    pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu))
    # Note: the fit() function below also evaluates the agent along the way
    # and adds the new transitions to the replay buffer, so it is training
    # on incrementally larger and larger buffers.
    pl_trainer.fit(trainer, data_loader)

    # TODO: Also check train_reward

    serving_policy = manager.create_policy(serving=True)

    eval_rewards = eval_policy(env,
                               serving_policy,
                               num_eval_episodes,
                               serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"
Example #22
def run_test(
    env: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is {normalization}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        replay_burnin=train_after_ts,
        device=device,
    )

    training_policy = manager.create_policy(serving=False)
    agent = Agent.create_for_env(env,
                                 policy=training_policy,
                                 post_transition_callback=post_step,
                                 device=device)

    train_rewards = []
    for i in range(num_train_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        train_rewards.append(ep_reward)
        logger.info(f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"reward after {len(train_rewards)} episodes is {train_rewards[-1]},"
        f"less than < {passing_score_bar}...\n"
        f"Full reward history: {train_rewards}")

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    def gym_to_reagent_serving(
            obs: np.ndarray) -> Tuple[torch.Tensor, torch.Tensor]:
        obs_tensor = torch.tensor(obs).float().unsqueeze(0)
        presence_tensor = torch.ones_like(obs_tensor)
        return (obs_tensor, presence_tensor)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env(env,
                                 policy=serving_policy,
                                 obs_preprocessor=gym_to_reagent_serving)

    eval_rewards = []
    for i in range(num_eval_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        eval_rewards.append(ep_reward)
        logger.info(f"Finished eval episode {i} with reward {ep_reward}.")

    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)},"
        f"less than < {passing_score_bar}...\n"
        f"Full eval rewards: {eval_rewards}.")

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
Example #23
def train_mdnrnn_and_compute_feature_stats(
    env_name: str,
    model: ModelManager__Union,
    num_train_transitions: int,
    num_test_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    use_gpu: bool,
    saved_mdnrnn_path: Optional[str] = None,
):
    """Train MDNRNN Memory Network and compute feature importance/sensitivity."""
    env: gym.Env = Gym(env_name=env_name)
    env.seed(SEED)

    manager = model.value
    trainer = manager.build_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)
    test_replay_buffer = ReplayBuffer(
        replay_capacity=num_test_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(env, test_replay_buffer, num_test_transitions, agent)

    if saved_mdnrnn_path is None:
        # train from scratch
        trainer = train_mdnrnn(
            env=env,
            trainer=trainer,
            trainer_preprocessor=trainer_preprocessor,
            num_train_transitions=num_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            test_replay_buffer=test_replay_buffer,
        )
    else:
        # load a pretrained model, and just evaluate it
        trainer.memory_network.mdnrnn.load_state_dict(torch.load(saved_mdnrnn_path))

    with torch.no_grad():
        trainer.memory_network.mdnrnn.eval()
        test_batch = test_replay_buffer.sample_transition_batch(
            batch_size=test_replay_buffer.size
        )
        preprocessed_test_batch = trainer_preprocessor(test_batch)
        feature_importance = calculate_feature_importance(
            env=env,
            trainer=trainer,
            use_gpu=use_gpu,
            test_batch=preprocessed_test_batch,
        )

        feature_sensitivity = calculate_feature_sensitivity(
            env=env,
            trainer=trainer,
            use_gpu=use_gpu,
            test_batch=preprocessed_test_batch,
        )

        trainer.memory_network.mdnrnn.train()
    return feature_importance, feature_sensitivity
Example #24
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    try:
        # pyre-fixme[16]: `Env` has no attribute `state_feature_config_provider`.
        manager.state_feature_config_provider = env.state_feature_config_provider
        logger.info(
            f"Using environment's state_feature_config_provider.\n"
            f"{manager.state_feature_config_provider}"
        )
    except AttributeError:
        logger.info("state_feature_config_provider override not applicable")

    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(
                f"Finished training episode {i} (len {len(trajectory)})"
                f" with reward {ep_reward}."
            )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")

    # Check whether the max score passed the score bar; since we explore during
    # training, the return could be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)})after training for "
        f"{len(train_rewards)} episodes is less than < {passing_score_bar}.\n"
    )

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    logger.info(f"average: {np.mean(eval_rewards)};\tmax: {np.max(eval_rewards)}")
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}.\n"
    )
Example #25
def run_test_offline(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    num_batches_per_epoch: int,
    num_train_epochs: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    minibatch_size: int,
    use_gpu: bool,
):
    env = Gym(env_name=env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.build_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    # first fill the replay buffer to burn_in
    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )
    # always fill full RB
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(
        env=env,
        replay_buffer=replay_buffer,
        desired_size=replay_memory_size,
        agent=agent,
    )

    device = torch.device("cuda") if use_gpu else None
    dataset = OfflineReplayBufferDataset.create_for_trainer(
        trainer,
        env,
        replay_buffer,
        batch_size=minibatch_size,
        num_batches=num_batches_per_epoch,
        device=device,
    )
    data_loader = torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)
    pl_trainer = pl.Trainer(
        max_epochs=num_train_epochs,
        gpus=int(use_gpu),
        deterministic=True,
        default_root_dir=f"lightning_log_{str(uuid.uuid4())}",
    )
    pl_trainer.fit(trainer, data_loader)

    logger.info(f"Evaluating after training for {num_train_epochs} epochs: ")
    eval_rewards = evaluate_cem(env, manager, trainer, num_eval_episodes)
    mean_rewards = np.mean(eval_rewards)
    assert (
        mean_rewards >= passing_score_bar
    ), f"{mean_rewards} doesn't pass the bar {passing_score_bar}."