Example 1
def run_episode(env: Env,
                agent: Agent,
                mdp_id: int = 0,
                max_steps: Optional[int] = None) -> Trajectory:
    """
    Return sum of rewards from episode.
    After max_steps (if specified), the environment is assumed to be terminal.
    Can also specify the mdp_id and gamma of episode.
    """
    trajectory = Trajectory()
    obs = env.reset()
    terminal = False
    num_steps = 0
    while not terminal:
        action = agent.act(obs)
        next_obs, reward, terminal, _ = env.step(action)
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=reward,
            terminal=terminal,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        num_steps += 1
    return trajectory
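
A minimal sketch of how this helper might be driven, assuming a pre-0.26 Gym environment (matching the 4-tuple env.step() unpacking above) and a toy agent; RandomAgent and the CartPole choice are illustrative, not part of the library:

import gym


class RandomAgent:
    """Toy agent implementing the interface run_episode uses above."""

    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, obs):
        # run_episode only needs an action back from act().
        return self.action_space.sample()

    def post_step(self, transition):
        # A real agent might cache the transition or update internal state here.
        pass


env = gym.make("CartPole-v0")  # classic Gym API: step() returns a 4-tuple
trajectory = run_episode(env, RandomAgent(env.action_space), max_steps=200)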
Example 2
def _create_replay_buffer_and_insert(env: EnvWrapper):
    env.seed(1)
    replay_buffer = ReplayBuffer(replay_capacity=6, batch_size=1)
    replay_buffer_inserter = make_replay_buffer_inserter(env)
    obs = env.reset()
    inserted = []
    terminal = False
    i = 0
    while not terminal and i < 5:
        logger.info(f"Iteration: {i}")
        action = env.action_space.sample()
        next_obs, reward, terminal, _ = env.step(action)
        inserted.append(
            {
                "observation": obs,
                "action": action,
                "reward": reward,
                "terminal": terminal,
            }
        )
        transition = Transition(
            mdp_id=0,
            sequence_number=i,
            observation=obs,
            action=action,
            reward=reward,
            terminal=terminal,
            log_prob=0.0,
        )
        replay_buffer_inserter(replay_buffer, transition)
        obs = next_obs
        i += 1

    return replay_buffer, inserted
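
A sketch of how the helper above might be exercised in a test, assuming an already-constructed EnvWrapper `env`; the sampling call mirrors the `sample_transition_batch` API used in the later examples:

# Illustrative usage; `env` is any EnvWrapper supported by make_replay_buffer_inserter.
replay_buffer, inserted = _create_replay_buffer_and_insert(env)

# The buffer should now contain the logged transitions and be sampleable,
# using the same API the training loops below rely on.
assert replay_buffer.size > 0
batch = replay_buffer.sample_transition_batch(batch_size=1)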
Example 3
    def __iter__(self):
        mdp_id = 0
        global_num_steps = 0
        rewards = []

        # TODO: We probably should put member vars into local vars to
        # reduce indirection, improving perf

        while self._num_episodes is None or mdp_id < self._num_episodes:
            obs = self._env.reset()
            possible_actions_mask = self._env.possible_actions_mask
            terminal = False
            num_steps = 0
            episode_reward_sum = 0
            trajectory = Trajectory()
            while not terminal:
                action, log_prob = self._agent.act(obs, possible_actions_mask)
                next_obs, reward, terminal, info = self._env.step(action)
                next_possible_actions_mask = self._env.possible_actions_mask
                if self._max_steps is not None and num_steps >= self._max_steps:
                    terminal = True

                # Only partially filled. Agent can fill in more fields.
                transition = Transition(
                    mdp_id=mdp_id,
                    sequence_number=num_steps,
                    observation=obs,
                    action=action,
                    reward=float(reward),
                    terminal=bool(terminal),
                    log_prob=log_prob,
                    possible_actions_mask=possible_actions_mask,
                )
                trajectory.add_transition(transition)
                self._replay_buffer_inserter(self._replay_buffer, transition)
                episode_reward_sum += reward
                if (global_num_steps % self._training_frequency == 0
                        and self._replay_buffer.size >= self._batch_size):
                    train_batch = self._replay_buffer.sample_transition_batch(
                        batch_size=self._batch_size)
                    if self._trainer_preprocessor:
                        train_batch = self._trainer_preprocessor(train_batch)
                    yield train_batch

                obs = next_obs
                possible_actions_mask = next_possible_actions_mask
                num_steps += 1
                global_num_steps += 1
            if self._post_episode_callback:
                self._post_episode_callback(trajectory, info)

            rewards.append(episode_reward_sum)
            mdp_id += 1
            print()
            print(f"Training episode: {mdp_id}, "
                  f"total episode reward = {episode_reward_sum}")

        print("Episode rewards during training:")
        print(rewards)
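
Because `__iter__` yields ready-to-train batches, an instance of this class can be consumed like any iterable; a minimal sketch, where `datagen` is such an instance and `trainer.train` is a hypothetical training step, neither taken from the examples:

# Illustrative consumption of the generator defined by __iter__ above.
for train_batch in datagen:            # `datagen` is an instance of the class above
    loss = trainer.train(train_batch)  # hypothetical trainer; not from the examples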
Example 4
    def post_step(self, transition: Transition):
        """To be called after step(action)."""
        if self.post_transition_callback is not None:
            transition.log_prob = self._log_prob
            # pyre-fixme[29]: `Optional[typing.Callable[[Transition], None]]` is not
            #  a function.
            self.post_transition_callback(transition)
        self._reset_internal_states()
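
The callback is typed `Optional[Callable[[Transition], None]]`, so any single-argument function can be plugged in; a hypothetical example (in practice the callback would normally be supplied when the agent is constructed, so setting the attribute directly here is purely illustrative):

def print_reward(transition: Transition) -> None:
    # Hypothetical callback: just log the per-step reward.
    print(f"step {transition.sequence_number}: reward={transition.reward}")


agent.post_transition_callback = print_reward  # illustrative; assumes the attribute is settable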
Example 5
async def async_run_episode(
    env: EnvWrapper,
    agent: Agent,
    mdp_id: int = 0,
    max_steps: Optional[int] = None,
    fill_info: bool = False,
) -> Trajectory:
    """
    NOTE: this funciton is an async coroutine in order to support async env.step(). If you are using
        it with regular env.step() method, use non-async run_episode(), which wraps this function.
    Return sum of rewards from episode.
    After max_steps (if specified), the environment is assumed to be terminal.
    Can also specify the mdp_id and gamma of episode.
    """
    trajectory = Trajectory()
    obs = env.reset()
    possible_actions_mask = env.possible_actions_mask
    terminal = False
    num_steps = 0
    step_is_coroutine = asyncio.iscoroutinefunction(env.step)
    while not terminal:
        action, log_prob = agent.act(obs, possible_actions_mask)
        if step_is_coroutine:
            next_obs, reward, terminal, info = await env.step(action)
        else:
            next_obs, reward, terminal, info = env.step(action)
        if not fill_info:
            info = None
        next_possible_actions_mask = env.possible_actions_mask
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=float(reward),
            terminal=bool(terminal),
            log_prob=log_prob,
            possible_actions_mask=possible_actions_mask,
            info=info,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        possible_actions_mask = next_possible_actions_mask
        num_steps += 1
    agent.post_episode(trajectory)
    return trajectory
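
The docstring states that the non-async `run_episode` wraps this coroutine; a minimal sketch of how such a wrapper could look (the name `run_episode_sync` is mine, and the library's actual wrapper may manage the event loop differently):

import asyncio


def run_episode_sync(env, agent, mdp_id=0, max_steps=None, fill_info=False):
    # Drive the coroutine to completion from synchronous code.
    return asyncio.run(
        async_run_episode(
            env, agent, mdp_id=mdp_id, max_steps=max_steps, fill_info=fill_info
        )
    )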
Example 6
    def __iter__(self):
        mdp_id = 0
        global_num_steps = 0

        # TODO: We probably should put member vars into local vars to
        # reduce indirection, improving perf

        while self._num_episodes is None or mdp_id < self._num_episodes:
            obs = self._env.reset()
            possible_actions_mask = self._env.possible_actions_mask
            terminal = False
            num_steps = 0
            while not terminal:
                action, log_prob = self._agent.act(obs, possible_actions_mask)
                next_obs, reward, terminal, _ = self._env.step(action)
                next_possible_actions_mask = self._env.possible_actions_mask
                if self._max_steps is not None and num_steps >= self._max_steps:
                    terminal = True

                # Only partially filled. Agent can fill in more fields.
                transition = Transition(
                    mdp_id=mdp_id,
                    sequence_number=num_steps,
                    observation=obs,
                    action=action,
                    reward=float(reward),
                    terminal=bool(terminal),
                    log_prob=log_prob,
                    possible_actions_mask=possible_actions_mask,
                )
                self._replay_buffer_inserter(self._replay_buffer, transition)
                if (
                    global_num_steps % self._training_frequency == 0
                    and self._replay_buffer.size >= self._batch_size
                ):
                    train_batch = self._replay_buffer.sample_transition_batch(
                        batch_size=self._batch_size
                    )
                    if self._trainer_preprocessor:
                        train_batch = self._trainer_preprocessor(train_batch)
                    yield train_batch

                obs = next_obs
                possible_actions_mask = next_possible_actions_mask
                num_steps += 1
                global_num_steps += 1

            mdp_id += 1
Example 7
    def __call__(self, replay_buffer: ReplayBuffer, transition: Transition):
        transition_dict = transition.asdict()
        obs = transition_dict.pop("observation")
        user = obs["user"]

        kwargs = {}

        if self.box_keys or self.discrete_keys:
            doc_obs = obs["doc"]
            for k in self.box_keys:
                kwargs[f"doc_{k}"] = np.stack([v[k] for v in doc_obs.values()])
            for k in self.discrete_keys:
                kwargs[f"doc_{k}"] = np.array([v[k] for v in doc_obs.values()])
        else:
            kwargs["doc"] = np.stack(list(obs["doc"].values()))

        # Augmentation

        if self.augmentation_box_keys or self.augmentation_discrete_keys:
            aug_obs = obs["augmentation"]
            for k in self.augmentation_box_keys:
                kwargs[f"augmentation_{k}"] = np.stack(
                    [v[k] for v in aug_obs.values()])
            for k in self.augmentation_discrete_keys:
                kwargs[f"augmentation_{k}"] = np.array(
                    [v[k] for v in aug_obs.values()])

        # Responses

        response = obs["response"]
        # We need to handle None below because the first state won't have response
        for k, d in self.response_box_keys:
            if response is not None:
                kwargs[f"response_{k}"] = np.stack([v[k] for v in response])
            else:
                kwargs[f"response_{k}"] = np.zeros((self.num_responses, *d),
                                                   dtype=np.float32)
        for k, _n in self.response_discrete_keys:
            if response is not None:
                kwargs[f"response_{k}"] = np.array([v[k] for v in response])
            else:
                kwargs[f"response_{k}"] = np.zeros((self.num_responses, ),
                                                   dtype=np.int64)

        transition_dict.update(kwargs)
        replay_buffer.add(observation=user, **transition_dict)
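
This inserter expects a RecSim-style dict observation; the sketch below shows the layout implied by the keys read above, with every concrete feature name and value made up for illustration:

import numpy as np

# Illustrative observation layout, inferred from the keys accessed in __call__ above.
obs = {
    "user": np.zeros(4, dtype=np.float32),  # stored as the buffer's `observation`
    "doc": {                                # one entry per candidate document
        "0": {"quality": np.array([0.3], dtype=np.float32), "category": 2},
        "1": {"quality": np.array([0.7], dtype=np.float32), "category": 5},
    },
    "augmentation": {},                     # per-doc augmentation features, if configured
    "response": None,                       # None on the first step, a list of dicts afterwards
}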
Example 8
def run_episode(env: EnvWrapper,
                agent: Agent,
                mdp_id: int = 0,
                max_steps: Optional[int] = None) -> Trajectory:
    """
    Return sum of rewards from episode.
    After max_steps (if specified), the environment is assumed to be terminal.
    Can also specify the mdp_id and gamma of episode.
    """
    trajectory = Trajectory()
    # pyre-fixme[16]: `EnvWrapper` has no attribute `reset`.
    obs = env.reset()
    possible_actions_mask = env.possible_actions_mask
    terminal = False
    num_steps = 0
    while not terminal:
        action, log_prob = agent.act(obs, possible_actions_mask)
        # pyre-fixme[16]: `EnvWrapper` has no attribute `step`.
        next_obs, reward, terminal, _ = env.step(action)
        next_possible_actions_mask = env.possible_actions_mask
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=float(reward),
            terminal=bool(terminal),
            log_prob=log_prob,
            possible_actions_mask=possible_actions_mask,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        possible_actions_mask = next_possible_actions_mask
        num_steps += 1
    agent.post_episode(trajectory)
    return trajectory
Example 9
    def __call__(self, replay_buffer: ReplayBuffer, transition: Transition):
        replay_buffer.add(**transition.asdict())
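
For this simple inserter, adding a transition is just keyword expansion of its fields; for a Transition like the one built in Example 2 (and assuming asdict() omits unset/None fields), the call is equivalent to:

# Equivalent to replay_buffer_inserter(replay_buffer, transition) for the
# inserter above, given the fields set on the Transition in Example 2.
replay_buffer.add(
    mdp_id=0,
    sequence_number=i,
    observation=obs,
    action=action,
    reward=reward,
    terminal=terminal,
    log_prob=0.0,
)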