Code Example #1
    def obtain_samples(self, itr, log=True, log_prefix='', show_pbar=True):
        """
        Collect batch_size trajectories from each task

        Args:
            itr (int) : current iteration number
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            show_pbar (boolean): whether to show progress bar

        Returns: 
            (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
        """

        # initial setup / preparation
        paths = OrderedDict()
        for i in range(self.meta_batch_size):
            paths[i] = []

        n_samples = 0
        running_paths = [_get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)]

        if show_pbar:
            pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy

        # initial reset of envs
        obses = self.vec_env.reset()
        
        while n_samples < self.total_samples:
            
            # execute policy
            t = time.time()
            obs_per_task = np.split(np.asarray(obses), self.meta_batch_size)

            actions, agent_infos = policy.get_actions(obs_per_task)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            actions = np.concatenate(actions) # stack meta batch
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            #  stack agent_infos and if no infos were provided (--> None) create empty dicts
            agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

            new_samples = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                # append new samples to running paths
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["dones"].append(int(done))
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths[idx // self.envs_per_task].append(dict(
                        observations=np.asarray(running_paths[idx]["observations"]),
                        actions=np.asarray(running_paths[idx]["actions"]),
                        rewards=np.asarray(running_paths[idx]["rewards"]),
                        dones=np.asarray(running_paths[idx]["dones"], dtype=np.float64),
                        env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()
            if show_pbar:
                pbar.update(new_samples)
            n_samples += new_samples
            obses = next_obses
        if show_pbar:
            pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            tabular.record(log_prefix + "PolicyExecTime", policy_time)
            tabular.record(log_prefix + "EnvExecTime", env_time)

        return paths
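
Note: every example on this page calls a helper `_get_empty_running_paths_dict()` that is not reproduced here. Judging from how the samplers use it, it simply returns a fresh container whose values are empty lists; a minimal sketch (the defaultdict variant is an assumption, and the exact key set differs per project) could be:

from collections import defaultdict

def _get_empty_running_paths_dict():
    # Any key the sampler touches ("observations", "actions", "rewards",
    # "dones", "env_infos", "agent_infos", and in some examples
    # "cp_obs"/"cp_act") starts out as an empty list to append to.
    return defaultdict(list)
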
Code Example #2
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns: 
            (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
        """

        # initial setup / preparation
        paths = OrderedDict()
        for i in range(self.meta_batch_size):
            paths[i] = []

        n_samples = 0
        running_paths = [
            _get_empty_running_paths_dict()
            for _ in range(self.vec_env.num_envs)
        ]

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True] * self.meta_batch_size)

        # initial reset of meta_envs
        obses = self.vec_env.reset()

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            obs_per_task = np.split(np.asarray(obses), self.meta_batch_size)
            if random:
                actions = np.stack([[self.env.action_space.sample()]
                                    for _ in range(len(obses))],
                                   axis=0)
                agent_infos = [[{
                    'mean':
                    np.zeros_like(self.env.action_space.sample()),
                    'log_std':
                    np.zeros_like(self.env.action_space.sample())
                }] * self.envs_per_task] * self.meta_batch_size
            else:
                actions, agent_infos = policy.get_actions(obs_per_task)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            actions = np.concatenate(actions)  # stack meta batch
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            #  stack agent_infos and if no infos were provided (--> None) create empty dicts
            agent_infos, env_infos = self._handle_info_dicts(
                agent_infos, env_infos)

            new_samples = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["dones"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths[idx // self.envs_per_task].append(
                        dict(
                            observations=np.asarray(
                                running_paths[idx]["observations"]),
                            actions=np.asarray(running_paths[idx]["actions"]),
                            rewards=np.asarray(running_paths[idx]["rewards"]),
                            dones=np.asarray(running_paths[idx]["dones"]),
                            env_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()

            pbar.update(new_samples)
            n_samples += new_samples
            obses = next_obses
        pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
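
The per-path `env_infos` and `agent_infos` are converted from lists of dicts into dicts of stacked arrays via `utils.stack_tensor_dict_list`, whose source is also not shown on this page. A sketch of the behaviour these samplers rely on (assuming rllab-style semantics, including recursion into nested info dicts) is:

import numpy as np

def stack_tensor_dict_list(tensor_dict_list):
    # Turn [{"a": x1, "b": {...}}, {"a": x2, "b": {...}}, ...] into
    # {"a": np.stack([x1, x2, ...]), "b": {... stacked recursively ...}}.
    stacked = {}
    for key in tensor_dict_list[0].keys():
        values = [d[key] for d in tensor_dict_list]
        if isinstance(values[0], dict):
            stacked[key] = stack_tensor_dict_list(values)
        else:
            stacked[key] = np.asarray(values)
    return stacked
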
Code Example #3
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (list): A list of dicts with the samples
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        num_envs = self.vec_env.num_envs
        running_paths = [
            _get_empty_running_paths_dict() for _ in range(num_envs)
        ]

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True] * self.vec_env.num_envs)

        # initial reset of meta_envs
        obses = np.asarray(self.vec_env.reset())

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            if random:
                actions = np.stack(
                    [self.env.action_space.sample() for _ in range(num_envs)],
                    axis=0)
                agent_infos = {}
            else:
                a_bs = self.adapt_batch_size
                if a_bs is not None and len(
                        running_paths[0]['observations']) > a_bs + 1:
                    adapt_obs = [
                        np.stack(running_paths[idx]['observations'][-a_bs -
                                                                    1:-1])
                        for idx in range(num_envs)
                    ]
                    adapt_act = [
                        np.stack(running_paths[idx]['actions'][-a_bs - 1:-1])
                        for idx in range(num_envs)
                    ]
                    adapt_next_obs = [
                        np.stack(running_paths[idx]['observations'][-a_bs:])
                        for idx in range(num_envs)
                    ]
                    policy.dynamics_model.switch_to_pre_adapt()
                    policy.dynamics_model.adapt(adapt_obs, adapt_act,
                                                adapt_next_obs)
                actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            #  stack agent_infos and if no infos were provided (--> None) create empty dicts
            agent_infos, env_infos = self._handle_info_dicts(
                agent_infos, env_infos)

            new_samples = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["dones"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths.append(
                        dict(
                            observations=np.asarray(
                                running_paths[idx]["observations"]),
                            actions=np.asarray(running_paths[idx]["actions"]),
                            rewards=np.asarray(running_paths[idx]["rewards"]),
                            dones=np.asarray(running_paths[idx]["dones"]),
                            env_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()

            pbar.update(self.vec_env.num_envs)
            n_samples += new_samples
            obses = next_obses
        pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
Code Example #4
    def obtain_samples(self,
                       log=False,
                       log_prefix='',
                       random=False,
                       deterministic=False,
                       eval=False,
                       multiple_trajectory=1,
                       dynamics_model=None):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random
            deterministic (boolean): whether to use the mean of the action distribution
            eval (boolean): whether to plan actions through the learned dynamics model with the MPC controller
            multiple_trajectory (int): number of independent batches of trajectories to collect
            dynamics_model: learned dynamics model used when eval is True

        Returns:
            (list) : A list of length multiple_trajectory, each entry a list of path dicts
        """

        # initial setup / preparation
        multiple_trajectories = []

        for _ in range(multiple_trajectory):
            paths = []
            n_samples = 0
            running_paths = _get_empty_running_paths_dict()

            if log: pbar = ProgBar(self.total_samples)
            policy_time, env_time = 0, 0

            policy = self.policy
            policy.reset(dones=[True])

            # initial reset of meta_envs
            obs = np.asarray(self.env.reset())

            ts = 0

            while n_samples < self.total_samples:

                # execute policy
                t = time.time()
                if eval:
                    H = self.mpc.horizon
                    mean_list = []
                    std_list = []
                    observation = obs
                    for h in range(H + 1):  # unroll the policy through the learned dynamics model over the MPC horizon
                        action, agent_info = policy.get_action(observation)
                        action = agent_info['mean']
                        mean_list.append(action)
                        std_list.append(agent_info['log_std'])
                        if self.policy.squashed:
                            action = np.tanh(action)
                        if observation.ndim == 1:
                            observation = observation[None]
                        if action.ndim == 1:
                            action = action[None]
                        observation = dynamics_model.predict(
                            observation, action)
                        observation = observation.reshape((-1))
                    action, _ = self.mpc.get_actions(obs[None], mean_list,
                                                     std_list)
                    if action.ndim == 2:
                        action = action[0]
                else:
                    obs = obs.reshape((-1))
                    if random:
                        action = self.env.action_space.sample()
                        agent_info = {}
                    elif deterministic:
                        action, agent_info = policy.get_action(obs)
                        action = agent_info['mean']
                        if self.policy.squashed:
                            action = np.tanh(action)
                    else:
                        action, agent_info = policy.get_action(obs)
                        if action.ndim == 2:
                            action = action[0]
                    policy_time += time.time() - t

                # step environments
                t = time.time()
                next_obs, reward, done, env_info = self.env.step(action)

                ts += 1

                env_time += time.time() - t

                new_samples = 0

                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths["observations"].append(obs)
                running_paths["actions"].append(action)
                running_paths["rewards"].append(reward)
                running_paths["dones"].append(done)
                running_paths["env_infos"].append(env_info)
                running_paths["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done or ts >= self.max_path_length:
                    paths.append(
                        dict(
                            observations=np.asarray(
                                running_paths["observations"]),
                            actions=np.asarray(running_paths["actions"]),
                            rewards=np.asarray(running_paths["rewards"]),
                            dones=np.asarray(running_paths["dones"]),
                            env_infos=[],
                            agent_infos=[],
                            # env_infos=utils.stack_tensor_dict_list(running_paths["env_infos"]),
                            # agent_infos=utils.stack_tensor_dict_list(running_paths["agent_infos"]),
                        ))
                    new_samples += len(running_paths["rewards"])
                    running_paths = _get_empty_running_paths_dict()

                if done or ts >= self.max_path_length:
                    next_obs = self.env.reset()
                    ts = 0

                if log: pbar.update(new_samples)
                n_samples += new_samples
                obs = next_obs
            multiple_trajectories.append(paths)
        if log: pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return multiple_trajectories
Code Example #5
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (list) : A list of path dicts
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        running_paths = _get_empty_running_paths_dict()

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True])

        # initial reset of meta_envs
        obs = np.asarray(self.env.reset())

        ts = 0

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            if random:
                action = self.env.action_space.sample()
                agent_info = {}
            else:
                action, agent_info = policy.get_action(obs)
                if action.ndim == 2:
                    action = action[0]
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obs, reward, done, env_info = self.env.step(action)

            ts += 1
            done = done or ts >= self.max_path_length
            if done:
                next_obs = self.env.reset()
                ts = 0

            env_time += time.time() - t

            new_samples = 0

            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths["observations"].append(obs)
            running_paths["actions"].append(action)
            running_paths["rewards"].append(reward)
            running_paths["dones"].append(done)
            running_paths["env_infos"].append(env_info)
            running_paths["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths.append(
                    dict(
                        observations=np.asarray(running_paths["observations"]),
                        actions=np.asarray(running_paths["actions"]),
                        rewards=np.asarray(running_paths["rewards"]),
                        dones=np.asarray(running_paths["dones"]),
                        env_infos=utils.stack_tensor_dict_list(
                            running_paths["env_infos"]),
                        agent_infos=utils.stack_tensor_dict_list(
                            running_paths["agent_infos"]),
                    ))
                new_samples += len(running_paths["rewards"])
                running_paths = _get_empty_running_paths_dict()

            pbar.update(new_samples)
            n_samples += new_samples
            obs = next_obs
        pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
Code Example #6
    def obtain_samples(self, log=False, log_prefix=''):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger

        Returns:
            (list) : A list of per-step dicts with observations, actions, logits, rewards, values and finish_time
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        running_paths = dict()

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy

        # initial reset of envs
        obses = self.env.reset()

        while n_samples < self.total_samples:
            # execute policy
            t = time.time()
            obs_per_task = np.array(obses)

            actions, logits, values = policy.get_actions(obs_per_task)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obses, rewards, dones, env_infos = self.env.step(actions)

            env_time += time.time() - t

            # build one sample dict per environment step
            new_samples = 0
            for observation, action, logit, reward, value, finish_time in zip(
                    obses, actions, logits, rewards, values, env_infos):
                running_paths["observations"] = observation
                running_paths["actions"] = action
                running_paths["logits"] = logit
                running_paths["rewards"] = reward
                running_paths["values"] = value
                running_paths["finish_time"] = finish_time
                # handling

                paths.append(
                    dict(
                        observations=np.squeeze(
                            np.asarray(running_paths["observations"])),
                        actions=np.squeeze(np.asarray(
                            running_paths["actions"])),
                        logits=np.squeeze(np.asarray(running_paths["logits"])),
                        rewards=np.squeeze(np.asarray(
                            running_paths["rewards"])),
                        values=np.squeeze(np.asarray(running_paths["values"])),
                        finish_time=np.squeeze(
                            np.asarray(running_paths["finish_time"]))))

                # count the new samples and reset the running path container
                new_samples += len(running_paths["rewards"])
                running_paths = _get_empty_running_paths_dict()

            pbar.update(new_samples)
            n_samples += new_samples
            obses = next_obses
        pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)
        return paths
Code Example #7
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns: 
            (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
        """

        # initial setup / preparation
        paths = OrderedDict()
        for i in range(self.meta_batch_size):
            paths[i] = []

        running_paths = _get_empty_running_paths_dict()

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy

        for idx in range(self.meta_batch_size):
            ts = 0
            n_samples = 0

            init_obs = np.expand_dims(self.env.reset(), 0).copy()
            obses = [init_obs.copy() for _ in range(self.meta_batch_size)]  # independent observation buffer per task
            policy.reset(dones=[True] * self.meta_batch_size)
            while n_samples < self.samples_per_task:
                # execute policy
                t = time.time()

                if random:
                    actions = np.stack([[self.env.action_space.sample()]
                                        for _ in range(len(obses))],
                                       axis=0)
                    agent_infos = [[{
                        'mean':
                        np.zeros_like(self.env.action_space.sample()),
                        'log_std':
                        np.zeros_like(self.env.action_space.sample())
                    }] * self.envs_per_task] * self.meta_batch_size
                else:
                    actions, agent_infos = policy.get_actions(obses)

                policy_time += time.time() - t

                # step environments
                t = time.time()
                action, agent_info = actions[idx][0], agent_infos[idx][0]
                observation = obses[idx][0].copy()

                next_obs, reward, done, env_info = self.env.step(action)

                ts += 1
                done = done or ts >= self.max_path_length
                if done:
                    next_obs = self.env.reset()
                    # time.sleep(1)
                    ts = 0

                env_time += time.time() - t

                new_samples = 0
                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths["observations"].append(observation)
                running_paths["actions"].append(action)
                running_paths["rewards"].append(reward)
                running_paths["dones"].append(done)
                running_paths["env_infos"].append(env_info)
                running_paths["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths[idx].append(
                        dict(
                            observations=np.asarray(
                                running_paths["observations"]),
                            actions=np.asarray(running_paths["actions"]),
                            rewards=np.asarray(running_paths["rewards"]),
                            dones=np.asarray(running_paths["dones"]),
                            env_infos=utils.stack_tensor_dict_list(
                                running_paths["env_infos"]),
                            agent_infos=utils.stack_tensor_dict_list(
                                running_paths["agent_infos"]),
                        ))
                    new_samples += len(running_paths["rewards"])
                    running_paths = _get_empty_running_paths_dict()

                pbar.update(new_samples)
                n_samples += new_samples
                obses[idx][0] = next_obs

            self.total_timesteps_sampled += n_samples

        pbar.stop()
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
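
For the meta-batch variants (Code Examples #1, #2, #7 and #9) the return value is an OrderedDict mapping each task index to a list of path dicts. A hypothetical consumer that reports the average undiscounted return per task (the `sampler` object, its configuration, and the keyword arguments are assumptions; the exact signature differs slightly between examples) might look like:

import numpy as np

# Hypothetical usage: `sampler` is any of the meta-batch samplers above.
paths = sampler.obtain_samples(log=True, log_prefix='Train-')

for task_idx, task_paths in paths.items():
    # Each path dict stores "rewards" as an array of shape (path_length,).
    avg_return = np.mean([path["rewards"].sum() for path in task_paths])
    print(f"task {task_idx}: {len(task_paths)} paths, average return {avg_return:.2f}")
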
Code Example #8
File: sampler.py  Project: iclavera/meta-mb
    def obtain_samples(self,
                       log=False,
                       log_prefix='',
                       random=False,
                       deterministic=False,
                       sinusoid=False,
                       verbose=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random
            deterministic (boolean): whether to use the mean of the action distribution
            sinusoid (boolean): whether to use open-loop sinusoidal actions
            verbose (boolean): whether to display a progress bar

        Returns:
            (list) : A list of path dicts
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        running_paths = [
            _get_empty_running_paths_dict()
            for _ in range(self.vec_env.num_envs)
        ]

        if verbose: pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True] * self.vec_env.num_envs)

        # initial reset of meta_envs
        obses = np.asarray(self.vec_env.reset())

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            if self.vae is not None:
                obses = np.array(obses)
                obses = self.vae.encode(obses)
            if random:
                actions = np.stack([
                    self.env.action_space.sample()
                    for _ in range(self.vec_env.num_envs)
                ],
                                   axis=0)
                agent_infos = {}
            elif deterministic:
                actions, agent_infos = policy.get_actions(obses)
                actions = [a_i['mean'] for a_i in agent_infos]
            elif sinusoid:
                action_space = self.env.action_space.shape[0]
                num_envs = self.vec_env.num_envs
                actions = np.stack([
                    policy.get_sinusoid_actions(action_space,
                                                t / policy.horizon * 2 * np.pi)
                    for _ in range(num_envs)
                ],
                                   axis=0)
                agent_infos = dict()
            else:
                obses = np.array(obses)
                actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            #  stack agent_infos and if no infos were provided (--> None) create empty dicts
            agent_infos, env_infos = self._handle_info_dicts(
                agent_infos, env_infos)

            new_samples = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["dones"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths.append(
                        dict(
                            observations=np.asarray(
                                running_paths[idx]["observations"]),
                            actions=np.asarray(running_paths[idx]["actions"]),
                            rewards=np.asarray(running_paths[idx]["rewards"]),
                            dones=np.asarray(running_paths[idx]["dones"]),
                            env_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()

            if verbose: pbar.update(self.vec_env.num_envs)
            n_samples += new_samples
            obses = next_obses
        if verbose: pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "TimeStepsCtr",
                         self.total_timesteps_sampled)
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
Code Example #9
    def obtain_samples(self,
                       log=False,
                       log_prefix='',
                       random=False,
                       advance_curriculum=False,
                       policy=None,
                       teacher_dict={},
                       max_action=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random
            advance_curriculum (boolean): whether to advance the environment curriculum before sampling
            policy: policy to sample with; defaults to self.policy when None
            teacher_dict (dict): options forwarded to the observation preprocessor
            max_action (boolean): whether to take the argmax action instead of sampling

        Returns: 
            (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
        """

        # initial setup / preparation
        paths = OrderedDict()
        for i in range(self.meta_batch_size):
            paths[i] = []

        n_samples = 0
        running_paths = [
            _get_empty_running_paths_dict()
            for _ in range(self.vec_env.num_envs)
        ]

        total_paths = self.rollouts_per_meta_task * self.meta_batch_size * self.envs_per_task
        pbar = ProgBar(total_paths)
        policy_time, env_time = 0, 0

        if policy is None:
            policy = self.policy
        policy.reset(dones=[True] * self.meta_batch_size)
        if self.reward_predictor is not None:
            self.reward_predictor.reset(dones=[True] * self.meta_batch_size)
        if self.supervised_model is not None:
            self.supervised_model.reset(dones=[True] * self.meta_batch_size)
        # initial reset of meta_envs
        if advance_curriculum:
            self.vec_env.advance_curriculum()
        self.update_tasks()

        obses = self.vec_env.reset()

        num_paths = 0
        itrs = 0
        while num_paths < total_paths:
            print("Loop", num_paths, total_paths, itrs)
            itrs += 1
            t = time.time()
            obses = self.obs_preprocessor(obses, teacher_dict)
            if random:
                actions = np.stack([[self.env.action_space.sample()]
                                    for _ in range(len(obses))],
                                   axis=0)
                agent_infos = [[{
                    'mean':
                    np.zeros_like(self.env.action_space.sample()),
                    'log_std':
                    np.zeros_like(self.env.action_space.sample())
                }] * self.envs_per_task] * self.meta_batch_size
            else:
                actions, agent_infos = policy.get_actions_t(obses)
                if max_action:  # TODO: double check this still works
                    assert False, "We haven't checked this still works with the new model; if it does, feel free to delete."
                    original_action_shape = actions.shape
                    actions = [[[np.argmax(d['probs'])] for d in agent_info]
                               for agent_info in agent_infos]
                    actions = np.array(actions, dtype=np.int32)
                    if not actions.shape == original_action_shape:
                        assert False, (actions.shape, original_action_shape)

            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            new_samples = 0
            new_paths = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["dones"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    curr_path = paths[idx // self.envs_per_task]
                    if len(curr_path) >= self.rollouts_per_meta_task:
                        continue
                    paths[idx // self.envs_per_task].append(
                        dict(
                            observations=np.asarray(
                                running_paths[idx]["observations"]),
                            actions=np.asarray(running_paths[idx]["actions"]),
                            rewards=np.asarray(running_paths[idx]["rewards"]),
                            dones=np.asarray(running_paths[idx]["dones"]),
                            env_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    num_paths += 1
                    new_paths += 1
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()

            pbar.update(new_paths)
            n_samples += new_samples
            obses = next_obses
        pbar.stop()

        self.total_timesteps_sampled += n_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
Code Example #10
    def obtain_samples(self, log=False, log_prefix='', random=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (list): A list of dicts with the samples
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        num_envs = self.vec_env.num_envs
        running_paths = [
            _get_empty_running_paths_dict() for _ in range(num_envs)
        ]

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        if self.use_cem:
            for i in range(num_envs):
                self.reset_cem(i)

        # initial reset of meta_envs
        obses = np.asarray(self.vec_env.reset())
        state_counts = [0] * self.vec_env.num_envs

        # history
        self.obs_dim = obses.shape[1]
        history_state = np.zeros(
            (obses.shape[0], self.obs_dim * self.history_length))
        history_act = np.zeros(
            (obses.shape[0], self.act_dim * self.history_length))

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            if random:
                actions = np.stack(
                    [self.env.action_space.sample() for _ in range(num_envs)],
                    axis=0)
                agent_infos = {}
            else:
                if self.use_cem:
                    if self.context:
                        cem_solutions, agent_infos = policy.get_actions(
                            obses,
                            init_mean=self.prev_sol,
                            init_var=self.init_var,
                            cp_obs=history_state,
                            cp_act=history_act)
                    else:
                        cem_solutions, agent_infos = policy.get_actions(
                            obses,
                            init_mean=self.prev_sol,
                            init_var=self.init_var)
                    self.prev_sol[:, :-1] = cem_solutions[:, 1:].copy()
                    self.prev_sol[:, -1:] = 0.
                    actions = cem_solutions[:, 0].copy()
                else:
                    if self.context:
                        actions, agent_infos = policy.get_actions(
                            obses, cp_obs=history_state, cp_act=history_act)
                    else:
                        actions, agent_infos = policy.get_actions(obses)
                if len(self.env.action_space.shape) == 0:
                    actions = actions.reshape(-1)

            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            #  stack agent_infos and if no infos were provided (--> None) create empty dicts
            agent_infos, env_infos = self._handle_info_dicts(
                agent_infos, env_infos)

            new_samples = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if len(self.env.action_space.shape) == 0:
                    action = np.eye(self.act_dim)[action]
                else:
                    if action.ndim == 0:
                        action = np.expand_dims(action, 0)
                assert action.ndim == 1, (action, action.shape)

                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["dones"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                running_paths[idx]["cp_obs"].append(history_state[idx].copy())
                running_paths[idx]["cp_act"].append(history_act[idx].copy())

                # making a history buffer
                if state_counts[idx] < self.history_length:
                    if self.state_diff:
                        history_state[idx][state_counts[idx] * self.obs_dim:(
                            state_counts[idx] +
                            1) * self.obs_dim] = next_obses[idx] - observation
                    else:
                        history_state[idx][state_counts[idx] *
                                           self.obs_dim:(state_counts[idx] +
                                                         1) *
                                           self.obs_dim] = observation
                    history_act[idx][state_counts[idx] *
                                     self.act_dim:(state_counts[idx] + 1) *
                                     self.act_dim] = action
                else:
                    history_state[idx][:-self.obs_dim] = history_state[idx][
                        self.obs_dim:]
                    if self.state_diff:
                        history_state[idx][
                            -self.obs_dim:] = next_obses[idx] - observation
                    else:
                        history_state[idx][-self.obs_dim:] = observation
                    history_act[idx][:-self.
                                     act_dim] = history_act[idx][self.act_dim:]
                    history_act[idx][-self.act_dim:] = action

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths.append(
                        dict(
                            observations=np.asarray(
                                running_paths[idx]["observations"]),
                            actions=np.asarray(running_paths[idx]["actions"]),
                            rewards=np.asarray(running_paths[idx]["rewards"]),
                            dones=np.asarray(running_paths[idx]["dones"]),
                            env_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                            cp_obs=np.asarray(running_paths[idx]["cp_obs"]),
                            cp_act=np.asarray(running_paths[idx]["cp_act"]),
                        ))
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()
                    if not random and self.use_cem:
                        self.reset_cem(idx)

                    state_counts[idx] = 0
                    history_state[idx] = np.zeros(
                        (self.obs_dim * self.history_length))
                    history_act[idx] = np.zeros(
                        (self.act_dim * self.history_length))
                else:
                    state_counts[idx] += 1
            pbar.update(self.vec_env.num_envs)
            n_samples += new_samples
            obses = next_obses
        pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
Code Example #11
    def obtain_samples(self, log=False, log_prefix=''):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger

        Returns:
            (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
        """

        # initial setup / preparation
        paths = OrderedDict()
        for i in range(self.meta_batch_size):
            paths[i] = []

        n_samples = 0
        running_paths = [
            _get_empty_running_paths_dict()
            for _ in range(self.vec_env.num_envs)
        ]

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy

        # initial reset of envs
        obses = self.vec_env.reset()

        while n_samples < self.total_samples:
            # execute policy
            t = time.time()
            # obs_per_task = np.split(np.asarray(obses), self.meta_batch_size)
            obs_per_task = np.array(obses)

            actions, logits, values = policy.get_actions(obs_per_task)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            # actions = np.concatenate(actions)

            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)

            # print("rewards shape is: ", np.array(rewards).shape)
            # print("finish time shape is: ", np.array(env_infos).shape)

            env_time += time.time() - t

            # unpack each environment's per-step outputs into individual sample dicts
            new_samples = 0
            for idx, observation, action, logit, reward, value, done, task_finish_times in zip(
                    itertools.count(), obses, actions, logits, rewards, values,
                    dones, env_infos):
                # append new samples to running paths

                # handling
                for single_ob, single_ac, single_logit, single_reward, single_value, single_task_finish_time \
                        in zip(observation, action, logit, reward, value, task_finish_times):
                    running_paths[idx]["observations"] = single_ob
                    running_paths[idx]["actions"] = single_ac
                    running_paths[idx]["logits"] = single_logit
                    running_paths[idx]["rewards"] = single_reward
                    running_paths[idx]["finish_time"] = single_task_finish_time
                    running_paths[idx]["values"] = single_value

                    paths[idx // self.envs_per_task].append(
                        dict(observations=np.squeeze(
                            np.asarray(running_paths[idx]["observations"])),
                             actions=np.squeeze(
                                 np.asarray(running_paths[idx]["actions"])),
                             logits=np.squeeze(
                                 np.asarray(running_paths[idx]["logits"])),
                             rewards=np.squeeze(
                                 np.asarray(running_paths[idx]["rewards"])),
                             finish_time=np.squeeze(
                                 np.asarray(
                                     running_paths[idx]["finish_time"])),
                             values=np.squeeze(
                                 np.asarray(running_paths[idx]["values"]))))

                    # count the new samples and reset the running path container
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()

            pbar.update(new_samples)
            n_samples += new_samples
            obses = next_obses
        pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)
        return paths
Code Example #12
    def obtain_samples(self, log=False, log_prefix='', test=False):

        print("total_samples:",self.total_samples)
        print("meta_batch_size:", self.meta_batch_size)
        print("max_path_length:" ,self.max_path_length)

        print("--------------obtaining", self.total_samples//self.meta_batch_size//self.max_path_length,
              "rollouts_per_task, for", self.meta_batch_size, "tasks..--------------")

        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger

        Returns: 
            (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
        """

        # initial setup / preparation
        paths = OrderedDict()
        for i in range(self.meta_batch_size):
            paths[i] = []

        n_samples = 0

        running_paths = [_get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)]
        print("                runnng_paths length:", len(running_paths))

        pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy

        # initial reset of envs
        obses = self.vec_env.reset()
        
        while n_samples < self.total_samples:
            # execute policy
            t = time.time()
            obs_per_task = np.split(np.asarray(obses), self.meta_batch_size)
            actions, agent_infos = policy.get_actions(obs_per_task)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            actions = np.concatenate(actions) # stack meta batch
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            #  stack agent_infos and if no infos were provided (--> None) create empty dicts
            agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

            new_samples = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                # append new samples to running paths
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths[idx // self.envs_per_task].append(dict(
                        observations=np.asarray(running_paths[idx]["observations"]),
                        actions=np.asarray(running_paths[idx]["actions"]),
                        rewards=np.asarray(running_paths[idx]["rewards"]),
                        env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()

            pbar.update(new_samples)
            n_samples += new_samples
            obses = next_obses

        pbar.stop()

        if not test:
            self.total_timesteps_sampled += self.total_samples
            print("------------self.total_timesteps_sampled:", self.total_timesteps_sampled, "-----------------")
        else:
            print("------------tested on:", self.total_samples // self.max_path_length, " rollouts-----------------")

        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
Code Example #13
    def obtain_samples(self, log=False, log_prefix='', buffer=None):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            buffer: optional buffer forwarded to self.vec_env.reset

        Returns:
            (dict) : A dict of stacked arrays (observations, actions, rewards, dones) with leading dimension max_path_length, plus the per-env returns
        """

        # initial setup / preparation
        pbar = ProgBar(self.max_path_length)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True] * self.vec_env.num_envs)

        # initial reset of meta_envs
        obses = self.vec_env.reset(buffer)
        time_step = 0
        list_observations = []
        list_actions = []
        list_rewards = []
        list_dones = []
        mask = np.ones((self.vec_env.num_envs, ))

        while time_step < self.max_path_length:

            # Execute policy
            t = time.time()
            if self.vae is not None:
                obses = np.array(obses)
                obses = self.vae.encode(obses)
                obses = np.split(obses, self.vec_env.num_envs, axis=0)
            if self.dynamics_model is not None:
                actions, agent_infos = policy.get_actions_batch(
                    obses, update_filter=False)
            else:
                obses = np.array(obses)
                actions, agent_infos = policy.get_actions_batch(
                    obses, update_filter=True)
            policy_time += time.time() - t

            # Step environments
            t = time.time()
            next_obses, rewards, dones, _ = self.vec_env.step(actions)
            next_obses, rewards, dones = np.array(next_obses), np.array(
                rewards), np.array(dones)

            rewards *= mask
            dones = dones + (1 - mask)
            mask *= (1 - dones)

            env_time += time.time() - t

            list_observations.append(obses)
            list_actions.append(actions)
            list_rewards.append(rewards)
            list_dones.append(dones)

            time_step += 1
            obses = next_obses
            pbar.update(1)
        pbar.stop()
        self.total_timesteps_sampled += np.sum(1 - np.array(list_dones))

        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        samples_data = dict(observations=np.array(list_observations),
                            actions=np.array(list_actions),
                            rewards=np.array(list_rewards),
                            returns=np.sum(list_rewards, axis=0),
                            dones=np.array(list_dones))

        return samples_data