Example #1
    def _stack_path_data(self, paths):
        max_path = max([len(path['observations']) for path in paths])

        observations = self._stack_padding(paths, 'observations', max_path)
        actions = self._stack_padding(paths, 'actions', max_path)
        rewards = self._stack_padding(paths, 'rewards', max_path)
        dones = self._stack_padding(paths, 'dones', max_path)
        returns = self._stack_padding(paths, 'returns', max_path)
        advantages = self._stack_padding(paths, 'advantages', max_path)
        env_infos = utils.stack_tensor_dict_list(
            [path["env_infos"] for path in paths], max_path)
        agent_infos = utils.stack_tensor_dict_list(
            [path["agent_infos"] for path in paths], max_path)

        return observations, actions, rewards, dones, returns, advantages, env_infos, agent_infos
Example #2
    def _pad_and_stack_paths(self, paths):
        mask, observations, actions, rewards, baselines, adjusted_rewards, env_infos, agent_infos = [], [], [], [], [], [], [], []
        for path in paths:
            # zero-pad paths that are shorter than max_path_length and create a corresponding mask
            path_length = path["observations"].shape[0]
            assert self.max_path_length >= path_length

            mask.append(self._pad(np.ones(path_length), path_length))
            observations.append(self._pad(path["observations"], path_length))
            actions.append(self._pad(path["actions"], path_length))
            rewards.append(self._pad(path["discounted_rewards"], path_length))
            baselines.append(self._pad(path["baselines"], path_length))
            adjusted_rewards.append(
                self._pad(path["adjusted_rewards"], path_length))
            env_infos.append(
                {key: self._pad(array, path_length)
                 for key, array in path["env_infos"].items()})
            agent_infos.append(
                {key: self._pad(array, path_length)
                 for key, array in path["agent_infos"].items()})

        # stack
        mask = np.stack(mask, axis=0)  # shape: (batch_size, max_path_length)
        observations = np.stack(
            observations,
            axis=0)  # shape: (batch_size, max_path_length, ndim_obs)
        actions = np.stack(
            actions, axis=0)  # shape: (batch_size, max_path_length, ndim_act)
        rewards = np.stack(rewards,
                           axis=0)  # shape: (batch_size, max_path_length)
        baselines = np.stack(baselines, axis=0)
        adjusted_rewards = np.stack(
            adjusted_rewards, axis=0)  # shape: (batch_size, max_path_length)
        env_infos = utils.stack_tensor_dict_list(
            env_infos
        )  # dict of ndarrays of shape: (batch_size, max_path_length, ?)
        agent_infos = utils.stack_tensor_dict_list(
            agent_infos
        )  # dict of ndarrays of shape: (batch_size, max_path_length, ?)

        return mask, observations, actions, rewards, baselines, adjusted_rewards, env_infos, agent_infos
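Both padding methods above depend on `_pad` and `_stack_padding` helpers that are not shown here. As a minimal sketch only (the bodies below are assumptions, written as free functions; in Example #2, `_pad(array, path_length)` instead reads the target length from `self.max_path_length`), zero-padding along the time axis could look like this:

import numpy as np

def _pad(array, max_path_length):
    # hypothetical helper: zero-pad `array` along its first (time) axis up to max_path_length
    pad_width = ((0, max_path_length - array.shape[0]),) + ((0, 0),) * (array.ndim - 1)
    return np.pad(array, pad_width, mode='constant')

def _stack_padding(paths, key, max_path):
    # hypothetical helper: pad every path's `key` array to max_path steps and stack them
    # into a single array of shape (num_paths, max_path, ...)
    return np.stack([_pad(path[key], max_path) for path in paths], axis=0)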
Example #3
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (dict): maps each of the meta_batch_size tasks to a list of paths;
                each path is a dict of per-step arrays/info dicts of length up to max_path_length
        """

        # initial setup / preparation
        paths = OrderedDict()
        for i in range(self.meta_batch_size):
            paths[i] = []

        n_samples = 0
        running_paths = [
            _get_empty_running_paths_dict()
            for _ in range(self.vec_env.num_envs)
        ]

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True] * self.meta_batch_size)

        # initial reset of meta_envs
        obses = self.vec_env.reset()

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            obs_per_task = np.split(np.asarray(obses), self.meta_batch_size)
            if random:
                actions = np.stack([[self.env.action_space.sample()]
                                    for _ in range(len(obses))],
                                   axis=0)
                agent_infos = [[{
                    'mean':
                    np.zeros_like(self.env.action_space.sample()),
                    'log_std':
                    np.zeros_like(self.env.action_space.sample())
                }] * self.envs_per_task] * self.meta_batch_size
            else:
                actions, agent_infos = policy.get_actions(obs_per_task)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            actions = np.concatenate(actions)  # stack meta batch
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            # stack agent_infos; if no infos were provided (None), create empty dicts
            agent_infos, env_infos = self._handle_info_dicts(
                agent_infos, env_infos)

            new_samples = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["dones"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths[idx // self.envs_per_task].append(
                        dict(
                            observations=np.asarray(
                                running_paths[idx]["observations"]),
                            actions=np.asarray(running_paths[idx]["actions"]),
                            rewards=np.asarray(running_paths[idx]["rewards"]),
                            dones=np.asarray(running_paths[idx]["dones"]),
                            env_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()

            pbar.update(new_samples)
            n_samples += new_samples
            obses = next_obses
        pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
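The samplers accumulate transitions in dictionaries created by `_get_empty_running_paths_dict`, whose definition is not part of these examples. A minimal sketch consistent with the keys appended above would be:

def _get_empty_running_paths_dict():
    # one empty list per field that the sampler appends to while a rollout is in progress
    return dict(observations=[], actions=[], rewards=[], dones=[],
                env_infos=[], agent_infos=[])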
Example #4
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (dict): maps each of the meta_batch_size tasks to a list of paths;
                each path is a dict of per-step arrays/info dicts of length up to max_path_length
        """

        # initial setup / preparation
        paths = OrderedDict()
        for i in range(self.meta_batch_size):
            paths[i] = []

        running_paths = _get_empty_running_paths_dict()

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy

        for idx in range(self.meta_batch_size):
            ts = 0
            n_samples = 0

            init_obs = np.expand_dims(self.env.reset(), 0).copy()
            obses = [init_obs for _ in range(self.meta_batch_size)]
            policy.reset(dones=[True] * self.meta_batch_size)
            while n_samples < self.samples_per_task:
                # execute policy
                t = time.time()

                if random:
                    actions = np.stack([[self.env.action_space.sample()]
                                        for _ in range(len(obses))],
                                       axis=0)
                    agent_infos = [[{
                        'mean':
                        np.zeros_like(self.env.action_space.sample()),
                        'log_std':
                        np.zeros_like(self.env.action_space.sample())
                    }] * self.envs_per_task] * self.meta_batch_size
                else:
                    actions, agent_infos = policy.get_actions(obses)

                policy_time += time.time() - t

                # step environments
                t = time.time()
                action, agent_info = actions[idx][0], agent_infos[idx][0]
                observation = obses[idx][0].copy()

                next_obs, reward, done, env_info = self.env.step(action)

                ts += 1
                done = done or ts >= self.max_path_length
                if done:
                    next_obs = self.env.reset()
                    # time.sleep(1)
                    ts = 0

                env_time += time.time() - t

                new_samples = 0
                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths["observations"].append(observation)
                running_paths["actions"].append(action)
                running_paths["rewards"].append(reward)
                running_paths["dones"].append(done)
                running_paths["env_infos"].append(env_info)
                running_paths["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths[idx].append(
                        dict(
                            observations=np.asarray(
                                running_paths["observations"]),
                            actions=np.asarray(running_paths["actions"]),
                            rewards=np.asarray(running_paths["rewards"]),
                            dones=np.asarray(running_paths["dones"]),
                            env_infos=utils.stack_tensor_dict_list(
                                running_paths["env_infos"]),
                            agent_infos=utils.stack_tensor_dict_list(
                                running_paths["agent_infos"]),
                        ))
                    new_samples += len(running_paths["rewards"])
                    running_paths = _get_empty_running_paths_dict()

                pbar.update(new_samples)
                n_samples += new_samples
                obses[idx][0] = next_obs

            self.total_timesteps_sampled += n_samples

        pbar.stop()
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
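Examples #3 and #5 call `self._handle_info_dicts` to normalize the info dictionaries before the per-environment loop. The implementation is not shown; one plausible sketch (an assumption, written as a free function) that matches how the results are consumed would be:

def _handle_info_dicts(agent_infos, env_infos, num_envs):
    # if the environment or policy returned no infos, substitute one empty dict per env;
    # otherwise flatten a [meta_batch_size][envs_per_task] nesting into a flat per-env list
    if not env_infos:
        env_infos = [dict() for _ in range(num_envs)]
    if not agent_infos:
        agent_infos = [dict() for _ in range(num_envs)]
    elif isinstance(agent_infos[0], list):
        agent_infos = sum(agent_infos, [])  # concatenate the per-task sub-lists
    return agent_infos, env_infos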
Example #5
    def obtain_samples(self, log=False, log_prefix='', random=False, deterministic=False, sinusoid=False,
                       verbose=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str): prefix for logger
            random (boolean): whether the actions are random
            deterministic (boolean): whether to use the mean of the policy's action distribution
            sinusoid (boolean): whether to use sinusoidal test actions instead of the policy
            verbose (boolean): whether to display a progress bar

        Returns:
            (list): a list of paths; each path is a dict of per-step arrays/info dicts of length up to max_path_length
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        running_paths = [_get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)]

        if verbose: pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True] * self.vec_env.num_envs)

        # initial reset of meta_envs
        obses = np.asarray(self.vec_env.reset())

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            if self.vae is not None:
                obses = np.array(obses)
                obses = self.vae.encode(obses)
            if random:
                actions = np.stack([self.env.action_space.sample() for _ in range(self.vec_env.num_envs)], axis=0)
                agent_infos = {}
            elif deterministic:
                actions, agent_infos = policy.get_actions(obses)
                actions = [a_i['mean'] for a_i in agent_infos]
            elif sinusoid:
                action_space = self.env.action_space.shape[0]
                num_envs = self.vec_env.num_envs
                actions = np.stack([policy.get_sinusoid_actions(action_space, t/policy.horizon * 2 * np.pi) for _ in range(num_envs)], axis=0)
                agent_infos = dict()
            else:
                obses = np.array(obses)
                actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            # stack agent_infos; if no infos were provided (None), create empty dicts
            agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

            new_samples = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["dones"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths.append(dict(
                        observations=np.asarray(running_paths[idx]["observations"]),
                        actions=np.asarray(running_paths[idx]["actions"]),
                        rewards=np.asarray(running_paths[idx]["rewards"]),
                        dones=np.asarray(running_paths[idx]["dones"]),
                        env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()

            if verbose: pbar.update(self.vec_env.num_envs)
            n_samples += new_samples
            obses = next_obses
        if verbose: pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "TimeStepsCtr", self.total_timesteps_sampled)
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
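The examples above stack the per-step info dicts with `utils.stack_tensor_dict_list`. As a rough sketch of what such a utility typically does (an assumption, not the repository's actual implementation, and omitting the optional padding length used in Example #1), it converts a list of dicts into a dict of stacked arrays, recursing into nested dicts:

import numpy as np

def stack_tensor_dict_list(tensor_dict_list):
    # turn a list of dicts (one per time step or per path) into a single dict of
    # stacked arrays, recursing into nested dicts such as distribution parameters
    if not tensor_dict_list:
        return {}
    result = {}
    for key in tensor_dict_list[0].keys():
        values = [d[key] for d in tensor_dict_list]
        if isinstance(values[0], dict):
            result[key] = stack_tensor_dict_list(values)
        else:
            result[key] = np.asarray(values)
    return result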
Example #6
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (list): a list of paths; each path is a dict of per-step arrays/info dicts of length up to max_path_length
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        running_paths = _get_empty_running_paths_dict()

        if log: pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True])

        # initial reset of meta_envs
        obs = np.asarray(self.env.reset())

        ts = 0

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            if random:
                action = self.env.action_space.sample()
                agent_info = {}
            else:
                action, agent_info = policy.get_action(obs)
                if action.ndim == 2:
                    action = action[0]
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obs, reward, done, env_info = self.env.step(action)

            ts += 1
            done = done or ts >= self.max_path_length
            if done:
                next_obs = self.env.reset()
                ts = 0

            env_time += time.time() - t

            new_samples = 0

            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths["observations"].append(obs)
            running_paths["actions"].append(action)
            running_paths["rewards"].append(reward)
            running_paths["dones"].append(done)
            running_paths["env_infos"].append(env_info)
            running_paths["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths.append(
                    dict(
                        observations=np.asarray(running_paths["observations"]),
                        actions=np.asarray(running_paths["actions"]),
                        rewards=np.asarray(running_paths["rewards"]),
                        dones=np.asarray(running_paths["dones"]),
                        env_infos=utils.stack_tensor_dict_list(
                            running_paths["env_infos"]),
                        agent_infos=utils.stack_tensor_dict_list(
                            running_paths["agent_infos"]),
                    ))
                new_samples += len(running_paths["rewards"])
                running_paths = _get_empty_running_paths_dict()

            if log: pbar.update(new_samples)
            n_samples += new_samples
            obs = next_obs
        if log: pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
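The `paths` returned by these samplers are typically handed to a sample processor (see the import in Example #7) that computes discounted returns and advantages. Purely as an illustration (not code from this repository, and the discount value is an assumption), per-path discounted returns could be computed like this:

import numpy as np

def discount_cumsum(rewards, discount):
    # returns[t] = rewards[t] + discount * rewards[t+1] + discount**2 * rewards[t+2] + ...
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns

# illustrative usage on the list returned by obtain_samples:
# for path in paths:
#     path["returns"] = discount_cumsum(path["rewards"], discount=0.99)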
Example #7
from asynch_mb.samplers.base import SampleProcessor