Example #1
    def test_action_space_conversion(self):
        # With the conversion flag set, the wrapped env should expose a
        # Discrete action space whose sampled actions pass through step().
        env = make_recsim_env(
            config={"convert_to_discrete_action_space": True})
        self.assertIsInstance(env.action_space, gym.spaces.Discrete)
        env.reset()
        action = env.action_space.sample()
        env.step(action)
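
The `convert_to_discrete_action_space` option used above flattens the env's MultiDiscrete slate action into a single integer index. Below is a minimal sketch of that flattening idea, using a hypothetical `FlattenMultiDiscrete` wrapper; it is an illustration only, not RLlib's actual `MultiDiscreteToDiscreteActionWrapper`.

import numpy as np
import gym


class FlattenMultiDiscrete(gym.ActionWrapper):
    """Exposes a MultiDiscrete action space as a single Discrete index."""

    def __init__(self, env):
        super().__init__(env)
        assert isinstance(env.action_space, gym.spaces.MultiDiscrete)
        self.nvec = env.action_space.nvec
        # One flat index per combination of per-slot choices.
        self.action_space = gym.spaces.Discrete(int(np.prod(self.nvec)))

    def action(self, act):
        # Unravel the flat index back into one choice per slate slot.
        return list(np.unravel_index(act, self.nvec))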
Example #2
    def test_observation_space(self):
        # Both the initial and the post-step observations must lie inside
        # the env's declared observation space.
        env = make_recsim_env(config={})
        obs = env.reset()
        self.assertTrue(env.observation_space.contains(obs),
                        f"{env.observation_space} doesn't contain {obs}")
        new_obs, _, _, _ = env.step(env.action_space.sample())
        self.assertTrue(env.observation_space.contains(new_obs))
Example #3
import numpy as np
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter

# make_recsim_env, ReplayBuffer, UserChoiceModel, QModel, pack_state_user,
# pack_state_doc, the compute_action* helpers, parse_response,
# train_user_choice_model and train_q_model are assumed to be defined
# elsewhere in this example module.


def main(choice_model, experiment_name):
    assert choice_model in ["slateq", "greedy", "random"]
    writer = SummaryWriter(
        log_dir=f"/tmp/logs/slateq_from_scratch/"
                f"{choice_model}_{experiment_name}")

    slate_size = 3
    buf = ReplayBuffer()
    env = make_recsim_env({"slate_size": slate_size})

    user_choice_model = UserChoiceModel()
    optimizer = torch.optim.Adam(user_choice_model.parameters(), lr=0.01)
    loss_fn = nn.CrossEntropyLoss(reduction="sum")

    q_model = QModel()
    q_optimizer = torch.optim.Adam(q_model.parameters())
    q_loss_fn = nn.MSELoss(reduction="sum")

    for idx_episode in range(10000):
        episode_reward = 0.0
        obs = env.reset()
        done = False
        entry = {
            "state_user": None,
            "state_doc": None,
            "action": None,
            "done": None,
            "click": None,
            "myopic_reward": None,
            "next_state_user": None,
            "next_state_doc": None,
            "next_action": None,
        }
        last_entry = {}
        episode_step = 0
        while not done:
            entry = {}
            entry["state_user"] = pack_state_user(obs)
            entry["state_doc"] = pack_state_doc(obs)

            if choice_model == "random":
                action = compute_action_random(obs)
            elif choice_model == "greedy":
                action = compute_action_choice_model(obs, user_choice_model)
            else:
                action = compute_action(obs, user_choice_model, q_model)

            entry["action"] = np.array(action, dtype=np.int32)

            episode_step += 1
            obs, reward, done, info = env.step(action)
            if last_entry:
                last_entry["next_action"] = np.array(action, dtype=np.int32)
                last_entry["next_state_user"] = pack_state_user(obs)
                last_entry["next_state_doc"] = pack_state_doc(obs)
                buf.add(last_entry)

            entry["done"] = np.array(done, dtype=np.bool)

            click_idx, myopic_reward = parse_response(obs["response"])
            episode_reward += myopic_reward

            entry["click"] = np.array(click_idx, dtype=np.float32)
            entry["myopic_reward"] = np.array(myopic_reward, dtype=np.float32)

            last_entry = entry

        if last_entry:
            # env is "done", just put some random values here
            action = [0] * slate_size
            last_entry["next_action"] = np.array(action, dtype=np.int32)
            last_entry["next_state_user"] = pack_state_user(obs)
            last_entry["next_state_doc"] = pack_state_doc(obs)
            buf.add(last_entry)

        writer.add_scalar(
            "episode_step",
            episode_step,
            idx_episode + 1,
        )
        writer.add_scalar(
            "episode_reward",
            episode_reward,
            idx_episode + 1,
        )
        writer.add_scalar(
            "choice_model_a",
            user_choice_model.a.item(),
            idx_episode + 1,
        )
        writer.add_scalar(
            "choice_model_b",
            user_choice_model.b.item(),
            idx_episode + 1,
        )
        if (idx_episode + 1) % 10 == 0:
            train_user_choice_model(user_choice_model,
                                    loss_fn,
                                    optimizer,
                                    buf,
                                    batch_size=4,
                                    num_iters=100)
            train_q_model(
                q_model,
                user_choice_model,
                q_loss_fn,
                q_optimizer,
                buf,
                batch_size=4,
                num_iters=30,
            )
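
For reference, one possible way to invoke the training loop above; the experiment name is an arbitrary placeholder, and metrics are written under the SummaryWriter log directory hard-coded in main():

if __name__ == "__main__":
    # choice_model must be one of "slateq", "greedy" or "random".
    main(choice_model="slateq", experiment_name="demo")
    # Inspect the curves with: tensorboard --logdir /tmp/logs/slateq_from_scratch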
Example #4
    def test_double_action_space_conversion_raises_exception(self):
        # The action space is already Discrete after the first conversion, so
        # wrapping it a second time must raise UnsupportedSpaceException.
        env = make_recsim_env(
            config={"convert_to_discrete_action_space": True})
        with self.assertRaises(UnsupportedSpaceException):
            env = MultiDiscreteToDiscreteActionWrapper(env)
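
The test methods in Examples #1, #2 and #4 take `self` and use `unittest` assertions, so they are meant to live inside a `unittest.TestCase`. A minimal, hypothetical harness is sketched below, assuming the methods are pasted into the class body and that `gym`, `make_recsim_env`, `MultiDiscreteToDiscreteActionWrapper` and `UnsupportedSpaceException` are importable:

import unittest


class RecSimEnvTest(unittest.TestCase):
    # Paste the test methods from Examples #1, #2 and #4 into this class body;
    # they additionally require gym, make_recsim_env and the wrapper/exception
    # classes to be importable in this module.
    pass


if __name__ == "__main__":
    unittest.main()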