  def reset(self):
    """Start a new episode."""
    # Build a new game and retrieve its first set of state/reward/discount.
    self._current_game = self._game_factory()
    self._state = environment.StepType.FIRST
    # Collect environment returns from starting the game and update state.
    observations, reward, discount = self._current_game.its_showtime()
    self._update_for_game_step(observations, reward, discount)
    return environment.TimeStep(step_type=self._state,
                                reward=None,
                                discount=None,
                                observation=self.last_observations)
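
  # Note: by the usual dm_env-style convention, the FIRST TimeStep returned by
  # reset() carries observations but a None reward and None discount; rewards
  # and discounts only begin to flow from step().
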
  def step(self, action):
    """Apply action, step the world forward, and return observations."""

    if self._action_size == 1:
      # Handle a float or a single-element array of any dimensionality.
      # Strictly speaking, a single-element list will also work, but it's best
      # not to confuse matters in the docstring with this option.
      all_actions = [np.asarray(action).item()]
    else:
      all_actions = [np.asarray(a).item() for a in action]
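    # For illustration (action values assumed, not taken from any real game):
    # with self._action_size == 1, step(3) and step(np.array([3])) both
    # normalise to [3] here; with self._action_size == 2, step([1, 2]) and
    # step(np.array([1, 2])) both normalise to [1, 2].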

    if len(all_actions) != self._action_size:
      raise RuntimeError("A pycolab Environment adapter's step method "
                         'was called with {} actions, but the pycolab game '
                         'expects {}.'.format(
                             len(all_actions), self._action_size))

    # Clear episode internals and start a new episode, if the episode ended or
    # if the game was not already underway. (In that case the action passed to
    # this call is discarded.)
    if self._state == environment.StepType.LAST:
      self._drop_last_episode()
    if self._current_game is None:
      return self.reset()

    # Execute the action in pycolab.
    action = all_actions[0] if self._action_size == 1 else all_actions
    observations, reward, discount = self._current_game.play(action)
    self._update_for_game_step(observations, reward, discount)

    # Check the current status of the game.
    if self._game_over:
      self._state = environment.StepType.LAST
    else:
      self._state = environment.StepType.MID

    return environment.TimeStep(
        step_type=self._state,
        reward=self._last_reward,
        discount=self._last_discount,
        observation=self.last_observations)
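
  # A minimal interaction sketch (hypothetical: `make_game`, `ACTIONS`, and the
  # constructor arguments shown are assumptions, not part of this module):
  #
  #   import random
  #
  #   env = Environment(game_factory=make_game)
  #   timestep = env.reset()
  #   total_reward = 0.0
  #   while timestep.step_type != environment.StepType.LAST:
  #     timestep = env.step(random.choice(ACTIONS))
  #     if timestep.reward is not None:
  #       total_reward += timestep.reward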