Example #1
    def _restore_checkpoint(self):
        checkpointer = Checkpointer(
            ckpt_dir=os.path.join(self._train_dir, 'algorithm'),
            algorithm=self._algorithm,
            trainer_progress=self._trainer_progress)

        super()._restore_checkpoint(checkpointer)
Example #2
    def _restore_checkpoint(self):
        checkpointer = Checkpointer(
            ckpt_dir=os.path.join(self._train_dir, 'algorithm'),
            algorithm=self._algorithm,
            metrics=nn.ModuleList(self._algorithm.get_metrics()),
            trainer_progress=self._trainer_progress)

        super()._restore_checkpoint(checkpointer)
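
Both snippets above construct a ``Checkpointer`` over named pieces of trainer state and hand it to the base class for restoring. For reference, the save/load round trip that ``Checkpointer`` provides (exercised again in Example #3 below) looks roughly like the following minimal sketch. The import path ``alf.utils.checkpoint_utils`` and the use of a plain ``nn.Linear`` as the checkpointed object are assumptions for illustration; the snippets here only show that ``Checkpointer`` takes a checkpoint directory plus named stateful objects, and exposes ``save(step)`` and ``load()``.

import tempfile

import torch.nn as nn
from alf.utils.checkpoint_utils import Checkpointer  # assumed import path

with tempfile.TemporaryDirectory() as ckpt_dir:
    model = nn.Linear(4, 2)
    # Each checkpointed object is passed as a named keyword argument; the same
    # names must be used when the checkpoint is loaded back.
    ckpt = Checkpointer(ckpt_dir, model=model)
    ckpt.save(100)  # 100 is the global step recorded with the checkpoint

    # Restoring: build a new Checkpointer over the objects that should receive
    # the saved state, then call load(); it returns the saved global step.
    restored = nn.Linear(4, 2)
    ckpt = Checkpointer(ckpt_dir, model=restored)
    global_step = ckpt.load()
    assert global_step == 100
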
Example #3
    def test_data_buffer(self):
        dim = 20
        capacity = 256
        data_spec = (TensorSpec(shape=()), TensorSpec(shape=(dim // 3 - 1, )),
                     TensorSpec(shape=(dim - dim // 3, )))

        data_buffer = DataBuffer(data_spec=data_spec, capacity=capacity)

        def _get_batch(batch_size):
            x = torch.randn(batch_size, dim, requires_grad=True)
            x = (x[:, 0], x[:, 1:dim // 3], x[..., dim // 3:])
            return x

        data_buffer.add_batch(_get_batch(100))
        self.assertEqual(int(data_buffer.current_size), 100)
        batch = _get_batch(1000)
        # test that the created batch has gradients
        self.assertTrue(batch[0].requires_grad)
        data_buffer.add_batch(batch)
        ret = data_buffer.get_batch(2)
        # test that DataBuffer detaches gradients of inputs
        self.assertFalse(ret[0].requires_grad)
        self.assertEqual(int(data_buffer.current_size), capacity)
        ret = data_buffer.get_batch_by_indices(torch.arange(capacity))
        self.assertEqual(ret[0], batch[0][-capacity:])
        self.assertEqual(ret[1], batch[1][-capacity:])
        self.assertEqual(ret[2], batch[2][-capacity:])
        batch = _get_batch(100)
        data_buffer.add_batch(batch)
        ret = data_buffer.get_batch_by_indices(
            torch.arange(data_buffer.current_size - 100,
                         data_buffer.current_size))
        self.assertEqual(ret[0], batch[0])
        self.assertEqual(ret[1], batch[1])
        self.assertEqual(ret[2], batch[2][-capacity:])

        # Test checkpoint working
        with tempfile.TemporaryDirectory() as checkpoint_directory:
            checkpoint = Checkpointer(checkpoint_directory,
                                      data_buffer=data_buffer)
            checkpoint.save(10)
            data_buffer = DataBuffer(data_spec=data_spec, capacity=capacity)
            checkpoint = Checkpointer(checkpoint_directory,
                                      data_buffer=data_buffer)
            global_step = checkpoint.load()
            self.assertEqual(global_step, 10)

        ret = data_buffer.get_batch_by_indices(
            torch.arange(data_buffer.current_size - 100,
                         data_buffer.current_size))
        self.assertEqual(ret[0], batch[0])
        self.assertEqual(ret[1], batch[1])
        self.assertEqual(ret[2], batch[2][-capacity:])

        data_buffer.clear()
        self.assertEqual(int(data_buffer.current_size), 0)
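
The test above covers the main ``DataBuffer`` behaviors: batched adds, gradient detaching, capacity-bounded overwriting (only the most recent ``capacity`` entries are kept, readable oldest-to-newest via ``get_batch_by_indices``), and checkpointing. A smaller sketch of just the overwrite behavior is below; the import paths ``alf.tensor_specs.TensorSpec`` and ``alf.utils.data_buffer.DataBuffer`` are assumptions, since the snippet above does not show its imports.

import torch

from alf.tensor_specs import TensorSpec       # assumed import path
from alf.utils.data_buffer import DataBuffer  # assumed import path

spec = TensorSpec(shape=(3, ))
buf = DataBuffer(data_spec=spec, capacity=4)

buf.add_batch(torch.arange(12.).reshape(4, 3))       # fill the buffer (4 rows)
buf.add_batch(torch.arange(12., 18.).reshape(2, 3))  # overwrite the 2 oldest rows

# The size is capped at `capacity`; indices into the buffer run from the
# oldest retained entry to the newest one, as the test above demonstrates.
assert int(buf.current_size) == 4
recent = buf.get_batch_by_indices(torch.arange(buf.current_size))
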
Example #4
def play(root_dir,
         env,
         algorithm,
         checkpoint_step="latest",
         epsilon_greedy=0.,
         num_episodes=10,
         max_episode_length=0,
         sleep_time_per_step=0.01,
         record_file=None,
         future_steps=0,
         append_blank_frames=0,
         render=True,
         render_prediction=False,
         ignored_parameter_prefixes=[]):
    """Play using the latest checkpoint under `train_dir`.

    The following example records the play of a trained model to an mp4 video:
    .. code-block:: bash

        python -m alf.bin.play \
        --root_dir=~/tmp/bullet_humanoid/ppo2/ppo2-11 \
        --num_episodes=1 \
        --record_file=ppo_bullet_humanoid.mp4

    Args:
        root_dir (str): same as the root_dir used for `train()`
        env (AlfEnvironment): the environment
        algorithm (RLAlgorithm): the training algorithm
        checkpoint_step (int|str): the number of training steps which is used to
            specify the checkpoint to be loaded. If checkpoint_step is 'latest',
            the most recent checkpoint named 'latest' will be loaded.
        epsilon_greedy (float): a floating value in [0, 1], representing the
            probability of sampling an action instead of taking the argmax
            action. This can help prevent getting stuck in a loop in
            deterministic environments like Breakout.
        num_episodes (int): number of episodes to play
        max_episode_length (int): if >0, each episode is limited to at most
            this many steps.
        sleep_time_per_step (float): sleep this many seconds after each step
        record_file (str): if provided, video will be recorded to a file
            instead of shown on the screen.
        future_steps (int): if larger than zero, information from future steps
            (e.g. observation, reward, action, etc.) will be encoded into the
            current frame. The related information is cached, and its encoding
            into video frames is deferred until ``future_steps`` future frames
            are available. This deferred mode is useful for displaying, on
            each frame, information that extends beyond a single time step
            into the future. Currently this mode only supports offline
            rendering, i.e. rendering and saving the video to ``record_file``.
            If a non-positive value is provided, the deferred mode is not used
            and the plots displaying future information will not be rendered.
        append_blank_frames (int): if >0, this many blank frames will be
            appended at the end of each episode in the rendered video file.
            A negative value has the same effect as 0: no blank frames will
            be appended. This option has no effect when displaying the frames
            on the screen instead of recording to a file.
        render (bool): If False, then this function only evaluates the trained
            model without calling rendering functions. This value will be ignored
            if a ``record_file`` argument is provided.
        render_prediction (bool): If True, when using ``VideoRecorder`` to render
            a video, extra prediction info (returned by ``predict_step()``) will
            also be rendered by the side of video frames.
        ignored_parameter_prefixes (list[str]): ignore the parameters whose
            name has one of these prefixes in the checkpoint.
"""
    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')

    ckpt_dir = os.path.join(train_dir, 'algorithm')
    checkpointer = Checkpointer(ckpt_dir=ckpt_dir, algorithm=algorithm)
    checkpointer.load(checkpoint_step,
                      ignored_parameter_prefixes=ignored_parameter_prefixes,
                      including_optimizer=False,
                      including_replay_buffer=False)

    recorder = None
    if record_file is not None:
        recorder = VideoRecorder(env,
                                 future_steps=future_steps,
                                 append_blank_frames=append_blank_frames,
                                 render_prediction=render_prediction,
                                 path=record_file)
    elif render:
        # pybullet_envs need to render() before reset() to enable mode='human'
        env.render(mode='human')
    env.reset()

    time_step = common.get_initial_time_step(env)
    algorithm.eval()
    policy_state = algorithm.get_initial_predict_state(env.batch_size)
    trans_state = algorithm.get_initial_transform_state(env.batch_size)
    episode_reward = 0.
    episode_length = 0
    episodes = 0
    metrics = [
        alf.metrics.AverageReturnMetric(buffer_size=num_episodes,
                                        reward_shape=env.reward_spec().shape),
        alf.metrics.AverageEpisodeLengthMetric(buffer_size=num_episodes),
    ]
    while episodes < num_episodes:
        time_step, policy_step, trans_state = _step(
            algorithm=algorithm,
            env=env,
            time_step=time_step,
            policy_state=policy_state,
            trans_state=trans_state,
            epsilon_greedy=epsilon_greedy,
            metrics=metrics)
        policy_state = policy_step.state
        episode_length += 1

        is_last_step = time_step.is_last() or (episode_length >=
                                               max_episode_length > 0)

        if recorder:
            recorder.capture_frame(time_step, policy_step, is_last_step)
        elif render:
            env.render(mode='human')
            time.sleep(sleep_time_per_step)

        time_step_reward = time_step.reward.view(-1).float().cpu().numpy()

        episode_reward += time_step_reward

        if is_last_step:
            logging.info("episode_length=%s episode_reward=%s" %
                         (episode_length, episode_reward))
            episode_reward = 0.
            episode_length = 0.
            episodes += 1
            # observe the last step
            for m in metrics:
                m(time_step.cpu())
            time_step = env.reset()

    for m in metrics:
        logging.info(
            "%s: %s", m.name,
            map_structure(
                lambda x: x.cpu().numpy().item()
                if x.ndim == 0 else x.cpu().numpy(), m.result()))
    if recorder:
        recorder.close()
    env.reset()
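
In the loop above, ``epsilon_greedy`` is forwarded to ``_step`` and controls how actions are chosen: per the docstring, with probability ``epsilon_greedy`` the action is sampled from the policy distribution, otherwise the argmax action is taken. A minimal, self-contained illustration of that selection rule in plain PyTorch follows; it only sketches the idea and is not ALF's ``_step`` implementation, and ``choose_action`` is a made-up helper name.

import torch
import torch.distributions as td


def choose_action(action_dist: td.Categorical,
                  epsilon_greedy: float) -> torch.Tensor:
    """Sample with probability `epsilon_greedy`, otherwise take the argmax."""
    greedy = torch.argmax(action_dist.probs, dim=-1)
    sampled = action_dist.sample()
    use_sample = torch.rand(greedy.shape) < epsilon_greedy
    return torch.where(use_sample, sampled, greedy)


# Example: a batch of 2 environments with 4 discrete actions each.
dist = td.Categorical(logits=torch.randn(2, 4))
actions = choose_action(dist, epsilon_greedy=0.1)
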
Example #5
def play(root_dir,
         env,
         algorithm,
         checkpoint_step="latest",
         epsilon_greedy=0.1,
         num_episodes=10,
         max_episode_length=0,
         sleep_time_per_step=0.01,
         record_file=None,
         ignored_parameter_prefixes=[]):
    """Play using the latest checkpoint under `train_dir`.

    The following example records the play of a trained model to an mp4 video:
    .. code-block:: bash

        python -m alf.bin.play \
        --root_dir=~/tmp/bullet_humanoid/ppo2/ppo2-11 \
        --num_episodes=1 \
        --record_file=ppo_bullet_humanoid.mp4

    Args:
        root_dir (str): same as the root_dir used for `train()`
        env (AlfEnvironment): the environment
        algorithm (RLAlgorithm): the training algorithm
        checkpoint_step (int|str): the number of training steps which is used to
            specify the checkpoint to be loaded. If checkpoint_step is 'latest',
            the most recent checkpoint named 'latest' will be loaded.
        epsilon_greedy (float): a floating value in [0, 1], representing the
            probability of sampling an action instead of taking the argmax
            action. This can help prevent getting stuck in a loop in
            deterministic environments like Breakout.
        num_episodes (int): number of episodes to play
        max_episode_length (int): if >0, each episode is limited to at most
            this many steps.
        sleep_time_per_step (float): sleep this many seconds after each step
        record_file (str): if provided, video will be recorded to a file
            instead of shown on the screen.
        ignored_parameter_prefixes (list[str]): ignore the parameters whose
            name has one of these prefixes in the checkpoint.
"""
    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')

    ckpt_dir = os.path.join(train_dir, 'algorithm')
    checkpointer = Checkpointer(ckpt_dir=ckpt_dir, algorithm=algorithm)
    checkpointer.load(
        checkpoint_step,
        ignored_parameter_prefixes=ignored_parameter_prefixes,
        including_optimizer=False,
        including_replay_buffer=False)

    recorder = None
    if record_file is not None:
        recorder = VideoRecorder(env, path=record_file)
    else:
        # pybullet_envs need to render() before reset() to enable mode='human'
        env.render(mode='human')
    env.reset()

    time_step = common.get_initial_time_step(env)
    algorithm.eval()
    policy_state = algorithm.get_initial_predict_state(env.batch_size)
    trans_state = algorithm.get_initial_transform_state(env.batch_size)
    episode_reward = 0.
    episode_length = 0
    episodes = 0
    metrics = [
        alf.metrics.AverageReturnMetric(
            buffer_size=num_episodes, reward_shape=env.reward_spec().shape),
        alf.metrics.AverageEpisodeLengthMetric(buffer_size=num_episodes),
    ]
    while episodes < num_episodes:
        time_step, policy_state, trans_state, info = _step(
            algorithm=algorithm,
            env=env,
            time_step=time_step,
            policy_state=policy_state,
            trans_state=trans_state,
            epsilon_greedy=epsilon_greedy,
            metrics=metrics)
        episode_length += 1
        if recorder:
            recorder.capture_frame(info)
        else:
            env.render(mode='human')
            time.sleep(sleep_time_per_step)

        time_step_reward = time_step.reward.view(-1).float().cpu().numpy()

        episode_reward += time_step_reward

        if time_step.is_last() or episode_length >= max_episode_length > 0:
            logging.info("episode_length=%s episode_reward=%s" %
                         (episode_length, episode_reward))
            episode_reward = 0.
            episode_length = 0.
            episodes += 1
            # observe the last step
            for m in metrics:
                m(time_step.cpu())
            time_step = env.reset()

    for m in metrics:
        logging.info("%s: %f", m.name, m.result())
    if recorder:
        recorder.close()
    env.reset()