Example 1
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Collect samples for the given iteration number.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of environment steps in one batch.
            whole_paths (bool): Whether to return whole paths or paths
                truncated to exactly batch_size total samples.

        Returns:
            list[dict]: A list of paths.

        """
        if not batch_size:
            batch_size = self.algo.max_path_length * self.n_envs

        cur_policy_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_policy_params,
            max_samples=batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )
        if whole_paths:
            return paths
        else:
            paths_truncated = truncate_paths(paths, batch_size)
            return paths_truncated
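
For context, a self-contained sketch of the whole_paths contract. Only the obtain_samples(itr, batch_size, whole_paths) signature is taken from the example above; the stub sampler and its data are invented for illustration.

    import numpy as np

    class StubSampler:
        """Stands in for the sampler above; same signature, fake data."""

        max_path_length = 100

        def obtain_samples(self, itr, batch_size=None, whole_paths=True):
            if not batch_size:
                batch_size = self.max_path_length
            # Pretend collection overshot the budget by 25 steps, then
            # apply the same whole-vs-truncated choice as the real code.
            paths = [dict(rewards=np.zeros(batch_size + 25))]
            if whole_paths:
                return paths
            return [dict(rewards=p['rewards'][:batch_size]) for p in paths]

    sampler = StubSampler()
    paths = sampler.obtain_samples(itr=0, batch_size=50, whole_paths=False)
    assert sum(len(p['rewards']) for p in paths) == 50
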
Example 2
    def test_truncate_paths(self):
        paths = [
            dict(
                observations=np.zeros((100, 1)),
                actions=np.zeros((100, 1)),
                rewards=np.zeros(100),
                env_infos=dict(),
                agent_infos=dict(lala=np.zeros(100)),
            ),
            dict(
                observations=np.zeros((50, 1)),
                actions=np.zeros((50, 1)),
                rewards=np.zeros(50),
                env_infos=dict(),
                agent_infos=dict(lala=np.zeros(50)),
            ),
        ]

        truncated = truncate_paths(paths, 130)
        assert len(truncated) == 2
        assert len(truncated[-1]['observations']) == 30
        assert len(truncated[0]['observations']) == 100
        # make sure not to change the original one
        assert len(paths) == 2
        assert len(paths[-1]['observations']) == 50
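
Taken together with the dict and invalid-key tests later in this section (Examples 7 and 8), these assertions pin down the contract of truncate_paths: keep whole paths while the budget allows, cut the path that crosses the budget, truncate info dicts recursively, reject unknown keys, and never mutate the input. Below is a minimal sketch consistent with those tests; the library's actual implementation may differ in detail.

    import numpy as np

    def truncate_tensor_dict(d, length):
        """Recursively truncate every array in a (possibly nested) dict."""
        return {k: truncate_tensor_dict(v, length) if isinstance(v, dict)
                else v[:length]
                for k, v in d.items()}

    def truncate_paths(paths, max_samples):
        """Truncate paths so they hold at most max_samples total samples.

        Whole paths are kept until the budget would be exceeded, the path
        that crosses the budget is cut, and later paths are dropped. The
        input list and its paths are not mutated.
        """
        # The accepted key set is an assumption; the real implementation
        # may also handle extra keys such as batch_idx (see Example 6).
        valid_keys = {'observations', 'actions', 'rewards', 'dones',
                      'env_infos', 'agent_infos'}
        truncated, remaining = [], max_samples
        for path in paths:
            if remaining <= 0:
                break
            unknown = set(path) - valid_keys
            if unknown:
                raise ValueError('Unexpected keys in path: %s' % unknown)
            keep = min(len(path['rewards']), remaining)
            truncated.append({
                k: truncate_tensor_dict(v, keep) if isinstance(v, dict)
                else v[:keep]
                for k, v in path.items()
            })
            remaining -= keep
        return truncated
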
Example 3
    def _obtain_is_samples(self, _itr, batch_size=None, whole_paths=True):
        """Collect IS samples for the given iteration number.

        Args:
            _itr (int): Iteration number (unused).
            batch_size (int): Number of samples in one batch.
            whole_paths (bool): Whether to return whole paths or paths
                truncated to exactly batch_size total samples.

        Returns:
            list: A list of paths.

        """
        if batch_size is None:
            batch_size = self.algo.max_path_length

        paths = []
        for hist_policy_distribution, hist_paths in self.get_history_list(
                self.n_backtrack):
            h_paths = self._sample_isweighted_paths(
                policy=self.algo.policy,
                hist_policy_distribution=hist_policy_distribution,
                max_samples=batch_size,
                paths=hist_paths,
                hist_variance_penalty=self.hist_variance_penalty,
                max_is_ratio=self.max_is_ratio,
                ess_threshold=self.ess_threshold,
            )
            paths.extend(h_paths)

        if len(paths) > batch_size:
            paths = random.sample(paths, batch_size)

        return paths if whole_paths else truncate_paths(paths, batch_size)
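
_sample_isweighted_paths is not shown here. As background, here is a self-contained sketch of the usual ingredients behind the parameters above: a per-path likelihood ratio capped at max_is_ratio and an effective-sample-size check against ess_threshold. All names below are hypothetical, and how hist_variance_penalty enters the historical distribution is left out of the sketch.

    import numpy as np

    def is_weight(new_logps, old_logps, max_is_ratio):
        """One importance weight for a path.

        new_logps / old_logps are per-step log-probabilities of the
        path's actions under the current and the historical policy.
        """
        ratio = np.exp(np.sum(new_logps) - np.sum(old_logps))
        # Cap extreme ratios to keep the estimator's variance bounded.
        return min(ratio, max_is_ratio)

    def effective_sample_size(weights):
        """Kish's ESS: (sum w)^2 / sum(w^2)."""
        w = np.asarray(weights, dtype=np.float64)
        return w.sum() ** 2 / (w ** 2).sum()

    # Reweight a batch of historical paths; discard the whole batch if
    # the weights are too degenerate to be useful.
    rng = np.random.default_rng(0)
    hist_paths = [dict(new_logps=rng.normal(-1.0, 0.1, 50),
                       old_logps=rng.normal(-1.0, 0.1, 50))
                  for _ in range(8)]
    weights = [is_weight(p['new_logps'], p['old_logps'], max_is_ratio=10.0)
               for p in hist_paths]
    if effective_sample_size(weights) < 4.0:  # ess_threshold analogue
        hist_paths = []
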
Example 4
    def test_truncates(self):
        truncated = utils.truncate_paths(self.paths, 130)
        assert len(truncated) == 2
        assert len(truncated[-1]['observations']) == 30
        assert len(truncated[0]['observations']) == 100
        # make sure not to change the original one
        assert len(self.paths) == 2
        assert len(self.paths[-1]['observations']) == 50
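
This test references a self.paths fixture that is not shown. A setup consistent with its assertions, mirroring the inline data from Example 2, could look as follows (the class name and pytest-style setup_method are assumptions):

    import numpy as np

    class TestTruncatePaths:
        def setup_method(self):
            # Two paths of 100 and 50 steps, as in Example 2, so that
            # truncate_paths(self.paths, 130) keeps the first path whole
            # and cuts the second to 30 steps.
            self.paths = [
                dict(observations=np.zeros((100, 1)),
                     actions=np.zeros((100, 1)),
                     rewards=np.zeros(100),
                     env_infos=dict(),
                     agent_infos=dict(lala=np.zeros(100))),
                dict(observations=np.zeros((50, 1)),
                     actions=np.zeros((50, 1)),
                     rewards=np.zeros(50),
                     env_infos=dict(),
                     agent_infos=dict(lala=np.zeros(50))),
            ]
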
Example 5
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of environment steps in one batch.
            whole_paths (bool): Whether to return whole paths or paths
                truncated to exactly batch_size total samples.

        Returns:
            list[dict]: A list of paths.

        """
        if not batch_size:
            batch_size = self.algo.max_path_length

        cur_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_params,
            max_samples=batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )

        return paths if whole_paths else truncate_paths(paths, batch_size)
Example 6
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        If batch_size is not specified, the default is one episode per
        task, so batch_size will be meta_batch_size * max_path_length.

        When the number of workers is less than the meta batch size,
        sampling is performed for each entry of self._vec_envs_indices in
        series. The i-th entry of self._vec_envs_indices holds the indices
        of the environments/tasks to be sampled in the i-th round.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of samples to be collected. If None,
                it defaults to algo.max_path_length * meta_batch_size.
            whole_paths (bool): Whether to return all the paths. True by
                default. The total number of samples collected can exceed
                batch_size, and the paths are truncated to batch_size
                samples when this flag is False.

        Returns:
            list[dict]: Sample paths. Each path carries a batch_idx key
                giving the index of the environment/task it was sampled
                from.

        Note:
            Each path is a dictionary, with keys and values as following:
                * observations: numpy.ndarray with shape :math:`[N, S^*]`
                * actions: numpy.ndarray with shape :math:`[N, S^*]`
                * rewards: numpy.ndarray with shape :math:`[N]`
                * dones: numpy.ndarray with shape :math:`[N]`
                * env_infos: A dictionary with each key representing one
                  environment info, value being a numpy.ndarray with shape
                  :math:`[N, S^*]`. One example is "ale.lives" for atari
                  environments.
                * agent_infos: A dictionary with each key representing one
                  agent info, value being a numpy.ndarray with shape
                  :math:`[N, S^*]`. One example is "prev_action", which is used
                  for recurrent policy as previous action input, merged with
                  the observation input as the state input.

        """
        logger.log('Obtaining samples for iteration %d...' % itr)

        if batch_size is None:
            batch_size = self.algo.max_path_length * self._meta_batch_size

        paths = []

        tasks = self.env.sample_tasks(self._meta_batch_size)

        # Start main loop
        batch_size_per_loop = batch_size // len(self._vec_envs_indices)
        policy_time = 0
        env_time = 0
        process_time = 0
        for vec_envs_indices in self._vec_envs_indices:
            self._setup_worker(vec_envs_indices, tasks)

            n_samples = 0
            obses = self._vec_env.reset()
            dones = np.asarray([True] * self._vec_env.num_envs)
            running_paths = [None] * self._vec_env.num_envs

            pbar = ProgBarCounter(batch_size)

            policy = self.algo.policy
            # Only reset policies at the beginning of a meta batch
            policy.reset(dones)

            while n_samples < batch_size_per_loop:
                t = time.time()

                actions, agent_infos = policy.get_actions(obses)

                policy_time += time.time() - t
                t = time.time()
                next_obses, rewards, dones, env_infos = self._vec_env.step(
                    actions)
                env_time += time.time() - t
                t = time.time()

                agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
                env_infos = tensor_utils.split_tensor_dict_list(env_infos)
                if env_infos is None:
                    env_infos = [dict() for _ in range(self._vec_env.num_envs)]
                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self._vec_env.num_envs)
                    ]
                for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                        itertools.count(), obses, actions, rewards, env_infos,
                        agent_infos, dones):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(
                            observations=[],
                            actions=[],
                            rewards=[],
                            dones=[],
                            env_infos=[],
                            agent_infos=[],
                        )
                    running_paths[idx]['observations'].append(observation)
                    running_paths[idx]['actions'].append(action)
                    running_paths[idx]['rewards'].append(reward)
                    running_paths[idx]['dones'].append(done)
                    running_paths[idx]['env_infos'].append(env_info)
                    running_paths[idx]['agent_infos'].append(agent_info)
                    if done:
                        obs = np.asarray(running_paths[idx]['observations'])
                        actions = np.asarray(running_paths[idx]['actions'])
                        paths.append(
                            dict(
                                observations=obs,
                                actions=actions,
                                rewards=np.asarray(
                                    running_paths[idx]['rewards']),
                                dones=np.asarray(running_paths[idx]['dones']),
                                env_infos=tensor_utils.stack_tensor_dict_list(
                                    running_paths[idx]['env_infos']),
                                agent_infos=tensor_utils.stack_tensor_dict_list(
                                    running_paths[idx]['agent_infos']),
                                batch_idx=idx))
                        n_samples += len(running_paths[idx]['rewards'])
                        running_paths[idx] = None

                process_time += time.time() - t
                pbar.inc(len(obses))
                obses = next_obses

            pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        return paths if whole_paths else truncate_paths(paths, batch_size)
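
The round-based sampling described in the docstring amounts to chunking environment/task indices so each round fits the available workers. A hypothetical construction of _vec_envs_indices (only the attribute name comes from the code above):

    import numpy as np

    def make_vec_envs_indices(meta_batch_size, n_workers):
        """Split meta_batch_size task indices into rounds of <= n_workers."""
        n_rounds = -(-meta_batch_size // n_workers)  # ceiling division
        return np.array_split(np.arange(meta_batch_size), n_rounds)

    # e.g. 10 tasks on 4 workers -> 3 rounds: [0..3], [4..6], [7..9]
    print(make_vec_envs_indices(10, 4))
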
Example 7
    def test_invalid_path(self):
        self.paths[0]['invalid'] = None
        with pytest.raises(ValueError):
            utils.truncate_paths(self.paths, 3)
Example 8
    def test_truncates_dict(self):
        truncated = utils.truncate_paths(self.paths_dict, 130)
        assert len(truncated) == 2
        assert len(truncated[-1]['agent_infos']['lala']['baba']) == 30
        assert len(truncated[0]['agent_infos']['lala']['baba']) == 100
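
As with Example 4, the self.paths_dict fixture is not shown. Judging from the ['lala']['baba'] lookups above, it nests a dict inside agent_infos. A sketch of such a fixture (the class name and pytest-style setup_method are assumptions):

    import numpy as np

    class TestTruncatePaths:
        def setup_method(self):
            # Nested agent_infos, which truncate_paths must cut
            # recursively: ['lala']['baba'] holds 100 and 50 steps.
            self.paths_dict = [
                dict(observations=np.zeros((100, 1)),
                     actions=np.zeros((100, 1)),
                     rewards=np.zeros(100),
                     env_infos=dict(),
                     agent_infos=dict(lala=dict(baba=np.zeros(100)))),
                dict(observations=np.zeros((50, 1)),
                     actions=np.zeros((50, 1)),
                     rewards=np.zeros(50),
                     env_infos=dict(),
                     agent_infos=dict(lala=dict(baba=np.zeros(50)))),
            ]
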
Example 9
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of samples to be collected. If None,
                it defaults to algo.max_path_length * n_envs.
            whole_paths (bool): Whether to return all the paths. True by
                default. The total number of samples collected can exceed
                batch_size, and the paths are truncated to batch_size
                samples when this flag is False.

        Returns:
            list[dict]: Sample paths.

        Note:
            Each path is a dictionary, with keys and values as following:
                * observations: numpy.ndarray with shape [Batch, *obs_dims]
                * actions: numpy.ndarray with shape [Batch, *act_dims]
                * rewards: numpy.ndarray with shape [Batch, ]
                * env_infos: A dictionary with each key representing one
                  environment info, value being a numpy.ndarray with shape
                  [Batch, ?]. One example is "ale.lives" for atari
                  environments.
                * agent_infos: A dictionary with each key representing one
                  agent info, value being a numpy.ndarray with shape
                  [Batch, ?]. One example is "prev_action", which is used
                  for recurrent policy as previous action input, merged with
                  the observation input as the state input.
                * dones: numpy.ndarray with shape [Batch, ]

        """
        logger.log('Obtaining samples for iteration %d...' % itr)

        if not batch_size:
            batch_size = self.algo.max_path_length * self._n_envs

        paths = []
        n_samples = 0
        obses = self._vec_env.reset()
        dones = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs

        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        with click.progressbar(length=batch_size, label='Sampling') as pbar:
            while n_samples < batch_size:
                t = time.time()
                policy.reset(dones)

                actions, agent_infos = policy.get_actions(obses)

                policy_time += time.time() - t
                t = time.time()
                next_obses, rewards, dones, env_infos = \
                    self._vec_env.step(actions)
                env_time += time.time() - t
                t = time.time()

                agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
                env_infos = tensor_utils.split_tensor_dict_list(env_infos)
                if env_infos is None:
                    env_infos = [dict() for _ in range(self._vec_env.num_envs)]
                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self._vec_env.num_envs)
                    ]
                for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                        itertools.count(), obses, actions, rewards, env_infos,
                        agent_infos, dones):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(observations=[],
                                                  actions=[],
                                                  rewards=[],
                                                  env_infos=[],
                                                  agent_infos=[],
                                                  dones=[])
                    running_paths[idx]['observations'].append(observation)
                    running_paths[idx]['actions'].append(action)
                    running_paths[idx]['rewards'].append(reward)
                    running_paths[idx]['env_infos'].append(env_info)
                    running_paths[idx]['agent_infos'].append(agent_info)
                    running_paths[idx]['dones'].append(done)
                    if done:
                        obs = np.asarray(running_paths[idx]['observations'])
                        actions = np.asarray(running_paths[idx]['actions'])
                        paths.append(
                            dict(
                                observations=obs,
                                actions=actions,
                                rewards=np.asarray(
                                    running_paths[idx]['rewards']),
                                env_infos=tensor_utils.stack_tensor_dict_list(
                                    running_paths[idx]['env_infos']),
                                agent_infos=tensor_utils.stack_tensor_dict_list(
                                    running_paths[idx]['agent_infos']),
                                dones=np.asarray(
                                    running_paths[idx]['dones'])))
                        n_samples += len(running_paths[idx]['rewards'])
                        running_paths[idx] = None

                process_time += time.time() - t
                pbar.update(len(obses))
                obses = next_obses

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        return paths if whole_paths else truncate_paths(paths, batch_size)
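
The split/stack helpers used in Examples 6 and 9 convert a dict of per-env arrays into a list of per-env dicts, and back again when a finished path is assembled. A minimal sketch of the round trip, assuming flat (non-nested) dicts; the library versions presumably also handle nesting:

    import numpy as np

    def split_tensor_dict_list(tensor_dict):
        """dict of arrays [num_envs, ...] -> list of per-env dicts."""
        if not tensor_dict:
            return None
        n = len(next(iter(tensor_dict.values())))
        return [{k: v[i] for k, v in tensor_dict.items()} for i in range(n)]

    def stack_tensor_dict_list(tensor_dict_list):
        """list of per-step dicts -> dict of stacked arrays."""
        keys = tensor_dict_list[0].keys()
        return {k: np.asarray([d[k] for d in tensor_dict_list])
                for k in keys}

    # Round trip over two envs:
    agent_infos = dict(prev_action=np.zeros((2, 3)))
    per_env = split_tensor_dict_list(agent_infos)  # two per-env dicts
    stacked = stack_tensor_dict_list(per_env)      # back to shape (2, 3)
    assert stacked['prev_action'].shape == (2, 3)
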