Example #1
    def evaluate(self,
                 policy,
                 n_episodes=1,
                 initial_state=None,
                 transform_to_internal_state=None,
                 render=False,
                 results_dir='/tmp/'):
        """
        Evaluate a policy on the mdp.

        :param policy: a policy outputing an action
        :type policy: Policy
        :param n_episodes: number of episodes to evaluate
        :type n_episodes: int
        :param initial_state: the initial state to evaluate from
        :type initial_state: int
        :param transform_to_internal_state: transform the states or initial state to an internal state of the mdp
        :type transform_to_internal_state: function
        :param render: reder the environment
        :type render: bool
        :return: a Dataset with the trajectories
        """
        dataset = Dataset(results_dir=results_dir)
        for episode in range(n_episodes):
            trajectory = []
            state = self._env.reset()
            if initial_state is not None:
                # Override the sampled initial state and write it into the
                # simulator's internal state representation.
                state = initial_state
                if self._quanser_robots:
                    self._env.env._sim_state = np.copy(
                        transform_to_internal_state(state))
                else:
                    self._env.env.state = np.copy(
                        transform_to_internal_state(state))
            for j in range(self._env._max_episode_steps):
                # Roll out the policy; gradients are not needed for evaluation.
                with torch.no_grad():
                    state = torch.tensor(state,
                                         device=policy.device,
                                         dtype=TORCH_DTYPE)
                    action = policy(state).to('cpu').numpy().reshape((-1, ))
                    state_next, rew, done, _ = self._env.step(action)
                    trajectory.append((state.to('cpu').numpy(), action, rew,
                                       state_next, done))
                    state = state_next
                    if render:
                        self._env.render()
                    if done:
                        break
            dataset.add_trajectory(trajectory)
        self._env.close()
        return dataset
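
A short usage sketch (not part of the example above): mdp stands for an
instance of the class that defines evaluate(), and policy for a trained
Policy; both are hypothetical here, as is the pendulum-style to_internal
helper, which only illustrates the kind of function expected for
transform_to_internal_state.

import numpy as np

# Hypothetical helper: recover the simulator's internal state
# [theta, theta_dot] from a pendulum observation
# [cos(theta), sin(theta), theta_dot].
def to_internal(obs):
    theta = np.arctan2(obs[1], obs[0])
    return np.array([theta, obs[2]])

# Both objects below are assumed to exist already.
dataset = mdp.evaluate(policy,
                       n_episodes=5,
                       initial_state=np.array([-1.0, 0.0, 0.0]),
                       transform_to_internal_state=to_internal,
                       render=False)
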
Example #2
# Imports needed by this script. Dataset is project-local, so its exact
# import path may differ from the one assumed here.
import gym
import numpy as np

from quanser_robots import GentlyTerminating
from quanser_robots.qube import SwingUpCtrl

from dataset import Dataset  # hypothetical path; Dataset comes from the project

# The snippet originally opened with the tail of an environment-registration
# call; only its kwargs survive (simulation and control rates, in Hz):
#          kwargs={
#              'fs': 200.0,
#              'fs_ctrl': 200.0
#          })

env = GentlyTerminating(gym.make('Qube-100-v1'))

dataset = Dataset()
n_trajectories = 1
for traj in range(n_trajectories):
    trajectory = []
    ctrl = SwingUpCtrl()  # energy-based swing-up controller for the Qube
    obs = env.reset()
    done = False
    while not done:
        env.render()
        act = ctrl(obs)
        obs_n, r, done, _ = env.step(act)
        trajectory.append((obs, act, r, obs_n, done))
        obs = np.copy(obs_n)
    dataset.add_trajectory(trajectory)

# Close the environment once all trajectories have been collected.
env.close()

dataset.update_dataset_internal()
print(np.sum(dataset._rewards))  # total reward accumulated over all trajectories

filename = "../datasets/qube/{}_trajectories.npy".format(n_trajectories)
dataset.save_trajectories_to_file(filename)
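
To sanity-check the saved file, a minimal loading sketch, assuming that
save_trajectories_to_file stores the trajectory list via np.save (an object
array, hence allow_pickle=True); the exact on-disk format depends on the
Dataset implementation.

import numpy as np

filename = "../datasets/qube/1_trajectories.npy"
# allow_pickle is required because each transition is a tuple of mixed types.
trajectories = np.load(filename, allow_pickle=True)
for trajectory in trajectories:
    print("transitions per trajectory:", len(trajectory))
    obs, act, r, obs_n, done = trajectory[0]  # unpack the first transition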