Example No. 1
def run_episode(env: gym.Env, agent: Agent, training: bool, render_mode: str) \
                -> Trajectory:
    '''
    Runs a single episode of a single-agent RL loop until termination.
    :param env: OpenAI gym environment
    :param agent: Agent policy used to take actions in the environment and to process simulated experiences
    :param training: (boolean) Whether the agent will learn from the experience it receives
    :param render_mode: Feature not implemented (yet!)
    :returns: Episode trajectory, a list of (o, a, r, o') tuples
    '''
    observation = env.reset()
    done = False
    trajectory = Trajectory(env_type=EnvType.SINGLE_AGENT)
    legal_actions: Optional[List] = None
    while not done:
        if agent.requires_environment_model:
            action = agent.model_based_take_action(deepcopy(env),
                                                   observation,
                                                   player_index=0)
        else:
            action = agent.model_free_take_action(observation, legal_actions)
        succ_observation, reward, done, info = env.step(action)
        trajectory.add_timestep(observation, action, reward, succ_observation,
                                done)
        if training:
            agent.handle_experience(observation, action, reward,
                                    succ_observation, done)
        observation = succ_observation

        if 'legal_actions' in info: legal_actions = info['legal_actions']

    return trajectory
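A minimal usage sketch for run_episode, assuming it and its Trajectory/EnvType helpers are importable from the surrounding codebase; the RandomAgent stub below is hypothetical and only satisfies the interface run_episode expects:

import gym

class RandomAgent:
    """Hypothetical stub exposing the attributes and methods run_episode relies on."""
    requires_environment_model = False

    def __init__(self, action_space):
        self.action_space = action_space

    def model_free_take_action(self, observation, legal_actions):
        # Ignore legal_actions and sample uniformly; a real agent would query its policy.
        return self.action_space.sample()

    def handle_experience(self, observation, action, reward, succ_observation, done):
        pass  # A learning agent would update its parameters here.

env = gym.make("CartPole-v0")
trajectory = run_episode(env, RandomAgent(env.action_space), training=False, render_mode='')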
Example No. 2
def _check_reset_seed(env: gym.Env, seed: Optional[int] = None) -> None:
    """
    Check that the environment can be reset with a random seed.
    """
    signature = inspect.signature(env.reset)
    assert (
        "seed" in signature.parameters or "kwargs" in signature.parameters
    ), "The environment cannot be reset with a random seed. This behavior will be deprecated in the future."

    try:
        env.reset(seed=seed)
    except TypeError as e:
        raise AssertionError(
            "The environment cannot be reset with a random seed, even though `seed` or `kwargs` "
            "appear in the signature. This should never happen, please report this issue. "
            "The error was: " + str(e))

    if env.unwrapped.np_random is None:
        logger.warn(
            "Resetting the environment did not result in seeding its random number generator. "
            "This is likely due to not calling `super().reset(seed=seed)` in the `reset` method. "
            "If you do not use the python-level random number generator, this is not a problem."
        )

    seed_param = signature.parameters.get("seed")
    # Check the default value is None
    if seed_param is not None and seed_param.default is not None:
        logger.warn(
            "The default seed argument in reset should be `None`, "
            "otherwise the environment will by default always be deterministic"
        )
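A hedged sketch of a reset() signature that would pass the check above, assuming a gym version whose gym.Env.reset accepts a seed keyword; the environment class and its observation helper are illustrative only:

from typing import Optional
import gym

class MyEnv(gym.Env):  # hypothetical environment
    def reset(self, *, seed: Optional[int] = None, **kwargs):
        # Calling super().reset(seed=seed) seeds self.np_random, which is what the
        # warning above refers to.
        super().reset(seed=seed)
        return self._initial_observation()  # hypothetical helper returning the first observation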
Example No. 3
def _check_render(env: gym.Env,
                  warn: bool = True,
                  headless: bool = False) -> None:
    """
    Check the declared render modes and the `render()`/`close()`
    method of the environment.

    :param env: The environment to check
    :param warn: Whether to output additional warnings
    :param headless: Whether to disable render modes
        that require a graphical interface. False by default.
    """
    render_modes = env.metadata.get("render.modes")
    if render_modes is None:
        if warn:
            warnings.warn(
                "No render modes was declared in the environment "
                " (env.metadata['render.modes'] is None or not defined), "
                "you may have trouble when calling `.render()`")

    else:
        # Don't check render modes that require a
        # graphical interface (useful for CI)
        if headless and "human" in render_modes:
            render_modes.remove("human")
        # Check all declared render modes
        for render_mode in render_modes:
            env.render(mode=render_mode)
        env.close()
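A short sketch of the metadata declaration this check reads; the class and render body are illustrative, not part of any particular environment:

import gym

class MyRenderableEnv(gym.Env):  # hypothetical environment
    # _check_render looks up env.metadata["render.modes"] to know which modes to exercise.
    metadata = {"render.modes": ["human", "rgb_array"]}

    def render(self, mode="human"):
        if mode == "rgb_array":
            return self._last_frame  # hypothetical attribute holding an RGB array
        # "human" mode would draw to a window here.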
Example No. 4
def iterate_batches(env: gym.Env, net: nn.Module, batch_size: int):
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)

    while True:
        obs_v = torch.FloatTensor([obs])
        act_probs_v = sm(net(obs_v))
        act_probs = act_probs_v.data.numpy()[0]

        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, done, _ = env.step(action)

        episode_reward += reward
        step = EpisodeStep(obs, action)
        episode_steps.append(step)

        if done:
            e = Episode(episode_reward, episode_steps)
            batch.append(e)
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()

            if len(batch) == batch_size:
                yield batch
                batch = []

        obs = next_obs
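A usage sketch for the generator above, assuming the Net, Episode, and EpisodeStep definitions from the same script; the Net constructor arguments and the Episode.reward attribute are inferred from how they are used here, not confirmed:

import gym
import numpy as np

env = gym.make("CartPole-v0")
# Assumed constructor: (observation size, hidden layer size, number of actions).
net = Net(env.observation_space.shape[0], 128, env.action_space.n)

for batch in iterate_batches(env, net, batch_size=16):
    # Each batch is a list of Episode records; inspect the mean return and stop.
    mean_reward = float(np.mean([episode.reward for episode in batch]))
    print(f"mean episode reward: {mean_reward:.2f}")
    break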
Example No. 5
def train(agent: Agent, env: gym.Env, episodes: int, render=True):
    """Train `agent` in `env` for `episodes`

    Args:
        agent (Agent): Agent to train
        env (gym.Env): Environment to train the agent
        episodes (int): Number of episodes to train
        render (bool): True=Enable/False=Disable rendering; Default=True
    """

    for episode in range(episodes):
        done = False
        state = env.reset()
        total_reward = 0
        rewards = []
        states = []
        actions = []
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            rewards.append(reward)
            states.append(state)
            actions.append(action)
            state = next_state
            total_reward += reward
            if render:
                env.render()
            if done:
                agent.learn(states, rewards, actions)
                print("\n")
            print(f"Episode#:{episode} ep_reward:{total_reward}", end="\r")
Example No. 6
def run_episode(env: Env,
                agent: Agent,
                mdp_id: int = 0,
                max_steps: Optional[int] = None) -> Trajectory:
    """
    Return sum of rewards from episode.
    After max_steps (if specified), the environment is assumed to be terminal.
    Can also specify the mdp_id and gamma of episode.
    """
    trajectory = Trajectory()
    obs = env.reset()
    terminal = False
    num_steps = 0
    while not terminal:
        action = agent.act(obs)
        next_obs, reward, terminal, _ = env.step(action)
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=reward,
            terminal=terminal,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        num_steps += 1
    return trajectory
Example No. 7
        def render_episode(env: gym.Env, model: tf.keras.Model,
                           max_steps: int):
            state = tf.constant(env.reset(), dtype=tf.float32)
            screen = env.render(mode='rgb_array')
            im = Image.fromarray(screen)

            images = [im]

            for i in range(1, max_steps + 1):
                state = tf.expand_dims(state, 0)
                qvalues = model(state)
                action = np.argmax(np.squeeze(qvalues))

                state, _, done, _ = env.step(action)
                state = tf.constant(state, dtype=tf.float32)

                # Render the screen every step; change `% 1` to `% 10` to render every 10 steps
                if i % 1 == 0:
                    screen = env.render(mode='rgb_array')
                    images.append(Image.fromarray(screen))

                if done:
                    break

            return images
Example No. 8
    def gather_experience(self, env: gym.Env, time_limit: int) -> float:
        state = env.reset()
        done = False
        total_reward, reward, timesteps = 0, 0, 0

        while not done:
            # print(action_probs)
            action_probs = self.action_probs(state)
            # print(action_probs)

            action_chosen = np.random.choice(self.action_space, p=action_probs)

            self.memory_buffer.update(state, action_chosen, action_probs,
                                      reward)

            state, reward, done, info = env.step(action_chosen)
            total_reward += reward
            timesteps += 1

            if timesteps >= time_limit:
                break
        if not done:
            self.memory_buffer.rewards[-1] = -(1 / (1 - self.GAMMA))
            # self.memory_buffer.rewards[-1] = -5000
        env.close()
        # print("Episode of experience over, total reward = ", total_reward)
        return total_reward
Example No. 9
def generate_gif(env: gym.Env,
                 n_steps: int = 20,
                 suffix: str = "smm_env_",
                 **kwargs) -> None:
    """Plot a few steps of an env and generate a .gif."""

    tmp_dir = tempfile.TemporaryDirectory()
    _ = env.reset()

    for s in tqdm(range(n_steps)):
        obs, _, _, _ = env.step(5)
        fig, ax = plot_smm_obs(obs, **kwargs)
        fig.suptitle(f"Step: {s}")
        fig.tight_layout()
        fig.savefig(f'{os.path.join(tmp_dir.name, str(s))}.png')
        plt.close('all')

    fns = glob.glob(f'{tmp_dir.name}/*.png')
    sorted_idx = np.argsort([
        int(f.split(tmp_dir.name)[1].split('.png')[0].split('/')[1])
        for f in fns
    ])
    fns = np.array(fns)[sorted_idx]
    output_path = f"{suffix}replay.gif"
    images = [imageio.imread(f) for f in fns]
    imageio.mimsave(output_path, images, duration=0.1, subrectangles=True)

    tmp_dir.cleanup()
Example No. 10
def train_setup(agent: Agent, env: gym.Env):
    num_games = 150
    n_step = 0
    best_score = 0
    scores = []

    for i in range(num_games):
        done = False
        state = env.reset()
        score = 0

        while not done:
            action = agent.choose_action(state)
            new_state, reward, done, _ = env.step(action)
            n_step += 1
            score += reward

            agent.remember(state, action, reward, new_state, int(done))
            agent.learn()
            state = new_state
            if score == 200:
                print(f"Score maxed")
                done = True

        scores.append(score)

        avg_score = np.mean(scores[-100:])
        print(
            f"Game: {i}\tScore: {score}\tEpsilon: {agent.epsilon}\tAverage score: {avg_score}"
        )
        if avg_score > best_score:
            print("New best average.")
            agent.save_models()
            best_score = avg_score
Example No. 11
def set_weights(model: tf.keras.Model,
                env: gym.Env,
                num_steps: int = 100,
                set_bias: bool = True,
                set_weight: bool = True,
                env_max_steps: Optional[int] = None):

    rewards = gather_data(env, num_steps, env_max_steps)

    output_layer: tf.keras.layers.Layer = model.layers[-1]
    while hasattr(output_layer, "layers"):
        output_layer = output_layer.layers[-1]
    W, b = output_layer.trainable_weights

    if set_bias:
        reward_mean = tf.reduce_mean(rewards)
        new_bias = tf.fill(b.shape, reward_mean)
        print(
            f" [Trickster] - Clever bias init: shape: {b.shape} value: {reward_mean}"
        )
        b.assign(new_bias)

    if set_weight:
        reward_std = tf.math.reduce_std(rewards)
        orthogonal_initializer = tf.keras.initializers.Orthogonal(
            gain=reward_std)
        print(
            f" [Trickster] - Clever weight init: shape: {W.shape} gain: {reward_std}"
        )
        new_weight = orthogonal_initializer(W.shape)
        W.assign(new_weight)

    env.reset()
Example No. 12
def _create_replay_buffer_and_insert(env: gym.Env):
    env.seed(1)
    replay_buffer = ReplayBuffer.create_from_env(
        env, replay_memory_size=10, batch_size=1
    )
    replay_buffer_inserter = make_replay_buffer_inserter(env)
    obs = env.reset()
    inserted = []
    terminal = False
    i = 0
    while not terminal and i < 5:
        logger.info(f"Iteration: {i}")
        action = env.action_space.sample()
        next_obs, reward, terminal, _ = env.step(action)
        inserted.append(
            {
                "observation": obs,
                "action": action,
                "reward": reward,
                "terminal": terminal,
            }
        )
        log_prob = 0.0
        replay_buffer_inserter(replay_buffer, obs, action, reward, terminal, log_prob)
        obs = next_obs
        i += 1

    return replay_buffer, inserted
Example No. 13
def play_episode(env: gym.Env, agent: Agent, replay_memory: ReplayMemory,
                 eps: float, batch_size: int) -> int:
    """Play an epsiode and train
    Args:
        env (gym.Env): gym environment (CartPole-v0)
        agent (Agent): agent will train and get action
        replay_memory (ReplayMemory): trajectory is saved here
        eps (float): 𝜺-greedy for exploration
        batch_size (int): batch size
    Returns:
        int: reward earned in this episode
    """
    s = env.reset()
    done = False
    total_reward = 0

    while not done:

        a = agent.get_action(s, eps)
        s2, r, done, info = env.step(a)

        total_reward += r

        if done:
            r = -1
        replay_memory.push(s, a, r, s2, done)

        if len(replay_memory) > batch_size:

            minibatch = replay_memory.pop(batch_size)
            train_helper(agent, minibatch, FLAGS.gamma)

        s = s2

    return total_reward
Example No. 14
    def test(self,
             env: gym.Env,
             policy: Optional[Policy] = None,
             nb_episodes: int = 10,
             seed: Optional[int] = None,
             experiment_name: str = "",
             **_kwargs) -> History:
        """Test the agent."""
        if policy is None:
            policy = GreedyPolicy()
        policy.action_space = env.action_space
        policy.model = self

        set_seed(seed)
        set_env_seed(seed, env)

        history: List[List[AgentObs]] = []
        current_episode: List[AgentObs] = []
        for _ in range(nb_episodes):
            done = False
            s = env.reset()
            while not done:
                a = policy.get_action(s)
                sp, r, done, info = env.step(a)
                current_episode.append((s, a, r, sp))
                s = sp
            history.append(current_episode)
            current_episode = []
        return History(history,
                       is_training=False,
                       seed=seed,
                       name=experiment_name)
Example No. 15
 def test(
     self,
     env: gym.Env,
     callbacks: Collection[Callback] = (),
     nb_episodes: int = 1000,
     experiment_name: str = "",
 ):
     """Test."""
     context = Context(experiment_name, self, env, callbacks)
     context.on_training_begin()
     for episode in range(nb_episodes):
         self.current_state = 0
         state = env.reset()
         done = False
         step = 0
         context.on_episode_begin(episode)
         while not done:
             action = self.choose_best_action(state)
             context.on_step_begin(step, action)
             state2, reward, done, info = env.step(action)
             observation = (state, action, reward, state2, done)
             self.do_pdfa_transition(observation)
             context.on_step_end(step, observation)
             state = state2
             step += 1
         context.on_episode_end(episode)
     context.on_training_end()
Example No. 16
 def __init__(self,
              env,
              max_length=np.inf,
              dense_reward=True,
              save_fr=10,
              save_dest="state_box",
              render=False):
     Env.__init__(self)
     DictSerializable.__init__(self, DictSerializable.get_numpy_save())
     self.eval_env = env
     # Define action and observation space
     # They must be gym.spaces objects
     # Example when using discrete actions:
     if env is not None:
         self.action_space = self.eval_env.action_space
         # Example for using image as input:
         self.observation_space = self.eval_env.observation_space
     self._dense_reward = dense_reward
     self.partial_reward = 0.
     self.partial_length = 0
     self.returns = []
     self.episode_lengths = []
     self.successes = []
     self._unused = True
     self._max_length = max_length
     self.max_episode_steps = max_length
     self._save_fr = save_fr
     self._save_dest = save_dest
     self._render = render
Example No. 17
 def train(
     self,
     env: gym.Env,
     callbacks: Collection[Callback] = (),
     nb_episodes: int = 1000,
     experiment_name: str = "",
 ):
     """Train."""
     context = Context(experiment_name, self, env, callbacks)
     context.on_training_begin()
     for episode in range(nb_episodes):
         state = env.reset()
         done = False
         step = 0
         self._episode_reset()
         self.stop = self._should_stop()
         context.on_episode_begin(episode)
         while not done and not self.done():
             action = random_policy(self, state)
             context.on_step_begin(step, action)
             state2, reward, done, info = env.step(action)
             observation = (state, action, reward, state2, done)
             self.observe(observation)
             context.on_step_end(step, observation)
             state = state2
             step += 1
         self._add_trace()
         if episode % self.update_frequency == 0:
             self._learn_pdfa()
         context.on_episode_end(episode)
     context.on_training_end()
Example No. 18
def play_episode(env: gym.Env, agent: Agent, replay_memory: ReplayMemory,
                 eps: float, batch_size: int, gamma: float) -> int:
    """Play an episode and train

    Args:
        env (gym.Env): gym environment (CartPole-v0)
        agent (Agent): agent will train and get action
        replay_memory (ReplayMemory): trajectory is saved here
        eps (float): 𝜺-greedy for exploration
        batch_size (int): batch size
        gamma (float): discount factor
    Returns:
        int: reward earned in this episode
    """
    state = env.reset()
    done, total_reward = False, 0
    while not done:
        a = agent.get_action(state, eps)
        state_2, reward, done, info = env.step(a)
        total_reward += reward
        if done:
            reward = -1  # Game lost, so terminal reward is -1

        replay_memory.push(state, a, reward, state_2, done)
        if len(replay_memory) > batch_size:
            minibatch = replay_memory.pop(batch_size)
            train_helper(agent=agent, minibatch=minibatch, gamma=gamma)

        state = state_2
    return total_reward
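A hedged outer-loop sketch for play_episode, assuming the Agent and ReplayMemory classes from the same project; their constructor signatures below are guesses, not the actual API:

import gym

env = gym.make("CartPole-v0")
agent = Agent(input_dim=env.observation_space.shape[0], output_dim=env.action_space.n)  # assumed signature
memory = ReplayMemory(capacity=50_000)  # assumed signature

eps = 1.0
for episode in range(500):
    reward = play_episode(env, agent, memory, eps, batch_size=64, gamma=0.99)
    eps = max(0.01, eps * 0.995)  # simple exponential epsilon decay
    print(f"episode {episode:4d}  reward {reward:6.1f}  eps {eps:.3f}")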
Example No. 19
def rollout(
    env: gym.Env,
    nb_episodes: int = 1,
    max_steps: Optional[int] = None,
    policy=lambda env, state: _random_action(env, state),
    callback=lambda env, step: None,
):
    """
    Do a rollout.

    :param env: the OpenAI Gym environment.
    :param nb_episodes: the number of rollout episodes.
    :param max_steps: maximum number of steps per episodes.
    :param policy: a callable that takes the environment and the state and returns the action.
    :param callback: a callback that takes the environment and it is called at each step.
    :return: None
    """
    if max_steps:
        env = TimeLimit(env, max_episode_steps=max_steps)
    for _ in range(nb_episodes):
        state = env.reset()
        done = False
        callback(env, (state, 0.0, done, {}))
        while not done:
            action = policy(env, state)
            state, reward, done, info = env.step(action)
            callback(env, (state, reward, done, info))
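A usage sketch for rollout, assuming it is importable together with gym; the callback simply prints the reward element of each step tuple:

import gym

env = gym.make("CartPole-v0")
rollout(
    env,
    nb_episodes=2,
    max_steps=50,
    policy=lambda env, state: env.action_space.sample(),
    callback=lambda env, step: print("reward:", step[1]),
)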
Example No. 20
def run_episode(environment: gym.Env, agent: DQNAgent, render: bool,
                max_length: int):
    """
    Run one episode in the given environment with the agent.

    Arguments:
        environment {`gym.Env`} -- Environment representing the Markov Decision Process
        agent {`DQNAgent`} -- Reinforcement learning agent that acts in the environment
        render {`bool`} -- Whether the frames of the episode should be rendered on the screen
        max_length {`int`} -- Maximum number of steps before the episode is terminated

    Returns:
        `float` -- Cumulative reward that the agent received during the episode
    """
    episode_reward = 0
    state = environment.reset()
    for _ in range(max_length):
        if render:
            environment.render()
        action = agent.act(state)
        next_state, reward, terminal, _ = environment.step(action)
        agent.observe(
            Transition(state, action, reward,
                       None if terminal else next_state))
        episode_reward += reward
        if terminal:
            break
        else:
            state = next_state
    return episode_reward
Example No. 21
def selection_phase(node: SequentialNode, state: gym.Env,
                    selection_strat: Callable[[SequentialNode, float], Dict[int, float]],
                    exploration_factor: float) \
                    -> Tuple[SequentialNode, Union[int, None]]:
    '''
    Traverses the tree starting at :param: node, following
    :param: selection_strat to decide which child (branch) to follow, updating
    the :param: state environment model during traversal. This phase terminates
    when an action (edge) is selected that leads to an un-expanded child node,
    or when a terminal node is found.

    :param node: Current node of the MCTS tree being traversed
    :param state: Environment model
    :param selection_strat: Selection policy used to select the most promising
                            child node to traverse to
    :param exploration_factor: Exploration factor for :param: selection_strat
    :returns: A node and an action leading to an un-expanded child
    '''
    if node.is_terminal:
        return node, None
    scores = selection_strat(node, exploration_factor)
    best_a_i = choice(extract_best_actions(scores))
    if best_a_i not in node.children:
        return node, best_a_i
    else:
        state.step(best_a_i)
        return selection_phase(node.children[best_a_i], state, selection_strat,
                               exploration_factor)
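The extract_best_actions helper is not shown; a plausible sketch, assuming it returns every action whose score ties for the maximum so that choice can break ties uniformly at random:

from typing import Dict, List

def extract_best_actions(scores: Dict[int, float]) -> List[int]:
    # Tie-aware argmax: return all actions matching the best score.
    best_score = max(scores.values())
    return [action for action, score in scores.items() if score == best_score]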
Example No. 22
def train_episode(env: gym.Env, agent: ActorCriticAgent, memory: ReplayMemory,
                  batch_size: int, gamma: float, tau: float) -> float:
    s = env.reset()
    done = False

    total_reward = 0

    while not done:
        a = agent.perform_policy(s)
        if agent.a_space_type == "discrete":
            discrete_a = np.argmax(a)
            s2, r, done, info = env.step(discrete_a)
        else:
            s2, r, done, info = env.step(a)
        env.render()
        memory.push(s, a, r, done, s2)

        total_reward += r

        if len(memory) > batch_size:

            transition_batch = memory.sample(batch_size)
            agent.learning(transition_batch, gamma, tau)

        s = s2

        if done:
            return total_reward
Example No. 23
 def _play(
     self,
     env: gym.Env,
     callbacks: Collection[Callback] = (),
     nb_episodes: int = 1000,
     experiment_name: str = "",
     is_training: bool = False,
 ):
     context = Context(experiment_name, self, env, callbacks)
     context.on_training_begin()
     for episode in range(nb_episodes):
         state = env.reset()
         done = False
         step = 0
         context.on_episode_begin(episode)
         while not done:
             action = (self.take_action(state)
                       if is_training else self.choose_best_action(state))
             context.on_step_begin(step, action)
             state2, reward, done, info = env.step(action)
             observation = (state, action, reward, state2, done)
             if is_training:
                 self.observe(observation)
             context.on_step_end(step, observation)
             state = state2
             step += 1
         context.on_episode_end(episode)
     context.on_training_end()
Example No. 24
def test_agent_performance(env: gym.Env,
                           sac_params: SacEntropyAdjustmentParams,
                           run_params: RunParams, writer,
                           test_episode_number: int,
                           scaler: sklearn.preprocessing.StandardScaler):
    """ Tests the agent's performance by running the policy during a certain amount of episodes. The
    average episode reward and episode length are logged on the console and optionally on Tensorboard"""
    with torch.no_grad():
        episode_rewards, episode_lengths = [], []
        for j in range(sac_params.num_test_episodes):
            state, done, episode_reward, episode_length = env.reset(), False, 0, 0
            while not done:
                state_scaled = scale_state(
                    scaler, state) if run_params.should_scale_states else state
                action = select_action_sac(
                    state_scaled, sac_params,
                    deterministic=True)  # No noise, pure exploitation
                state, reward, done, _ = env.step(action)
                episode_reward += reward
                episode_length += 1

            episode_rewards.append(episode_reward)
            episode_lengths.append(episode_length)

        print(f"\tAverage total episode reward: {np.mean(episode_rewards):.3f}"
              f"\tAverage episode length: {np.mean(episode_lengths):.3f}")

        if run_params.use_tensorboard:
            writer.add_scalar("Test Performance/Average Performance",
                              np.mean(episode_rewards), test_episode_number)
            writer.add_scalar("Test Performance/Average Episode Steps",
                              np.mean(episode_lengths), test_episode_number)
Example No. 25
def test(env: gym.Env, agent: AgentBase, settings: TestSettings):
    # Initialize variables for logging.
    # agent.load(settings.directory)
    scores = ContiguousRingBuffer(capacity=128)
    eps = ConstantEpsilon(0.01)
    for i_episode in tqdm(range(settings.num_episodes)):
        # Initialize episode
        state = env.reset()
        total_reward = 0

        # Interact with the environment until done.
        done = False
        step = 0
        while not done:
            action = agent.select_action(state, eps(i_episode))
            if settings.render:
                env.render()
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            state = next_state
            time.sleep(1.0 / settings.fps)
            logger.debug('{}:{}'.format(step, action))
            step += 1

        # Save the final score.
        scores.append(total_reward)
    return scores
Example No. 26
def _check_returned_values(env: gym.Env, observation_space: spaces.Space,
                           action_space: spaces.Space) -> None:
    """
    Check the returned values by the env when calling `.reset()` or `.step()` methods.
    """
    # because env inherits from gym.Env, we assume that `reset()` and `step()` methods exist
    obs = env.reset()

    if isinstance(observation_space, spaces.Dict):
        assert isinstance(
            obs,
            dict), "The observation returned by `reset()` must be a dictionary"
        for key in observation_space.spaces.keys():
            try:
                _check_obs(obs[key], observation_space.spaces[key], "reset")
            except AssertionError as e:
                raise AssertionError(f"Error while checking key={key}: " +
                                     str(e))
    else:
        _check_obs(obs, observation_space, "reset")

    # Sample a random action
    action = action_space.sample()
    data = env.step(action)

    assert (
        len(data) == 4
    ), "The `step()` method must return four values: obs, reward, done, info"

    # Unpack
    obs, reward, done, info = data

    if isinstance(observation_space, spaces.Dict):
        assert isinstance(
            obs,
            dict), "The observation returned by `step()` must be a dictionary"
        for key in observation_space.spaces.keys():
            try:
                _check_obs(obs[key], observation_space.spaces[key], "step")
            except AssertionError as e:
                raise AssertionError(f"Error while checking key={key}: " +
                                     str(e))

    else:
        _check_obs(obs, observation_space, "step")

    # We also allow int because the reward will be cast to float
    assert isinstance(
        reward,
        (float, int,
         np.float32)), "The reward returned by `step()` must be a float"
    assert isinstance(done, bool), "The `done` signal must be a boolean"
    assert isinstance(
        info,
        dict), "The `info` returned by `step()` must be a python dictionary"

    if isinstance(env, gym.GoalEnv):
        # For a GoalEnv, the keys are checked at reset
        assert reward == env.compute_reward(obs["achieved_goal"],
                                            obs["desired_goal"], info)
Example No. 27
def _check_returned_values(env: gym.Env, observation_space: spaces.Space, action_space: spaces.Space) -> None:
    """
    Check the returned values by the env when calling `.reset()` or `.step()` methods.
    """
    # because env inherits from gym.Env, we assume that `reset()` and `step()` methods exist
    obs = env.reset()

    _check_obs(obs, observation_space, 'reset')

    # Sample a random action
    action = action_space.sample()
    data = env.step(action)

    assert len(data) == 4, "The `step()` method must return four values: obs, reward, done, info"

    # Unpack
    obs, reward, done, info = data

    _check_obs(obs, observation_space, 'step')

    # We also allow int because the reward will be cast to float
    assert isinstance(reward, (float, int)), "The reward returned by `step()` must be a float"
    assert isinstance(done, bool), "The `done` signal must be a boolean"
    assert isinstance(info, dict), "The `info` returned by `step()` must be a python dictionary"

    if isinstance(env, gym.GoalEnv):
        # For a GoalEnv, the keys are checked at reset
        assert reward == env.compute_reward(obs['achieved_goal'], obs['desired_goal'], info)
Example No. 28
def iterate_batches(env: gym.Env, net: Net, batch_size: int):
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)

    while True:
        # noinspection PyArgumentList
        obs_v = torch.FloatTensor([obs])

        # Get the probability distribution, sample and execute an action.
        act_probs = sm(net(obs_v)).data.numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, done, _ = env.step(action)

        # Collect metrics for applying the cross-entropy method.
        episode_reward += reward
        step = EpisodeStep(observation=obs, action=action)
        episode_steps.append(step)

        if done:
            episode = Episode(reward=episode_reward, steps=episode_steps)
            batch.append(episode)
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []

        obs = next_obs
Example No. 29
    def get_trajectory(
        self,
        env: gym.Env,
        actor: nn.Module,
        device,
        sampler_index: Optional[int] = None,
        trajectory_index: Optional[int] = None,
        t_max: int = 1000,
    ) -> Trajectory:
        if sampler_index is not None:
            epsilon = float(pow(0.9996, trajectory_index + 1) / (sampler_index + 1))
        else:
            epsilon = None
        state = env.reset()
        observations, actions, rewards, dones = [], [], [], []

        for t in range(t_max):
            action = self.get_action(env, actor, state=state, epsilon=epsilon)
            next_state, reward, done, _ = env.step(action)

            observations.append(state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)

            state = next_state
            if done:
                break

        trajectory = Trajectory(observations, actions, rewards, dones)
        return trajectory
Example No. 30
    def fit(self, env: gym.Env, nb_steps: int) -> None:
        """
        Train the agent on the given proxy environment.

        :param env: the gym environment in which the agent is trained
        :param nb_steps: number of training steps to be performed
        :return: None
        """
        action_counter = 0
        # For the BanditEnv example, the number of episodes will always be 1.
        # In general that's not the case, but for completeness
        # we implemented a training loop that supports learning across many episodes.
        episode_counter = 0
        nb_steps_digits = len(str(nb_steps))

        while action_counter < nb_steps:
            env.reset()
            done = False
            episode_counter += 1
            while not done and action_counter < nb_steps:
                action = self._pick_an_action()
                action_counter += 1
                obs, reward, done, info = env.step(action)
                self._update_model(obs, reward, done, info, action)
                if action_counter % 10 == 0:
                    print(("Step {:" + str(nb_steps_digits) +
                           "}/{}, episode={}, action={:7}, reward={}").format(
                               action_counter,
                               nb_steps,
                               episode_counter,
                               str(action),
                               reward,
                           ))
        env.close()