Example #1
    def test_concat_tensor_dict_list(self):
        results = concat_tensor_dict_list(self.data)
        assert results['obs'].shape == (6, )
        assert results['act'].shape == (6, )
        assert results['info']['lala'].shape == (4, )
        assert results['info']['baba'].shape == (4, )

        results = concat_tensor_dict_list(self.data2)
        assert results['obs'].shape == (6, )
        assert results['act'].shape == (6, )
        assert results['info']['lala'].shape == (4, )
        assert results['info']['baba'].shape == (2, )
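
The setUp fixture for this test is not shown. A hypothetical fixture consistent with the assertions above (values arbitrary; only the shapes and the missing 'baba' key in data2 matter, assuming entries lacking a key are skipped during concatenation) could look like this:

    def setUp(self):
        # Two dicts with matching keys: concatenation doubles every length.
        self.data = [
            dict(obs=np.zeros(3), act=np.zeros(3),
                 info=dict(lala=np.zeros(2), baba=np.zeros(2))),
            dict(obs=np.zeros(3), act=np.zeros(3),
                 info=dict(lala=np.zeros(2), baba=np.zeros(2))),
        ]
        # The second dict has no info['baba'], so only one (2,) array is
        # concatenated for that key and its result stays (2,).
        self.data2 = [
            dict(obs=np.zeros(3), act=np.zeros(3),
                 info=dict(lala=np.zeros(2), baba=np.zeros(2))),
            dict(obs=np.zeros(3), act=np.zeros(3),
                 info=dict(lala=np.zeros(2))),
        ]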
Example #2
    def from_trajectory_list(cls, env_spec, num_skills, paths):
        lengths = np.asarray([len(p['self_rewards']) for p in paths])
        if all(
            len(path['states']) == length + 1 for (path, length) in zip(paths,
                                                                        lengths)):
            last_states = np.asarray([p['states'][-1] for p in paths])
            states = np.concatenate([p['states'][:-1] for p in paths])
        else:
            # The number of states and timesteps must match.
            states = np.concatenate([p['states'] for p in paths])

            if paths[0].get('next_states') is not None:
                last_states = np.asarray([p['next_states'][-1] for p in paths])
            else:
                last_states = np.asarray([p['states'][-1] for p in paths])

        stacked_paths = tensor_utils.concat_tensor_dict_list(paths)
        return cls(env_spec=env_spec,
                   num_skills=num_skills,
                   skills=stacked_paths['skills'],
                   # skills_onehot=np.eye(num_skills)[stacked_paths['skills']],
                   states=states,
                   last_states=last_states,
                   actions=stacked_paths['actions'],
                   env_rewards=stacked_paths['env_rewards'],
                   self_rewards=stacked_paths['self_rewards'],
                   terminals=stacked_paths['dones'],
                   env_infos=stacked_paths['env_infos'],
                   agent_infos=stacked_paths['agent_infos'],
                   lengths=lengths)
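
Both snippets above rely on tensor_utils.concat_tensor_dict_list. The following is an illustrative re-implementation of the behavior they exercise, not garage's actual code: keys come from the first dict, nested dicts are handled recursively, and (to match the (2,) shape asserted for data2['info']['baba'] in Example #1) entries that lack a key are skipped.

import numpy as np

def concat_tensor_dict_list_sketch(dict_list):
    """Recursively concatenate a list of dicts of arrays along axis 0."""
    result = {}
    for key, example in dict_list[0].items():
        values = [d[key] for d in dict_list if key in d]
        if isinstance(example, dict):
            result[key] = concat_tensor_dict_list_sketch(values)
        else:
            result[key] = np.concatenate(values)
    return result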
Example #3
    def step(self, action_n):
        results = singleton_pool.run_each(
            worker_run_step,
            [(action_n, self.scope) for _ in self._alloc_env_ids],
        )
        results = [x for x in results if x is not None]
        ids, obs, rewards, dones, env_infos = list(zip(*results))
        ids = np.concatenate(ids)
        obs = self.observation_space.unflatten_n(np.concatenate(obs))
        rewards = np.concatenate(rewards)
        dones = np.concatenate(dones)
        env_infos = tensor_utils.split_tensor_dict_list(
            tensor_utils.concat_tensor_dict_list(env_infos))
        if env_infos is None:
            env_infos = [dict() for _ in range(self.num_envs)]

        items = list(zip(ids, obs, rewards, dones, env_infos))
        items = sorted(items, key=lambda x: x[0])

        ids, obs, rewards, dones, env_infos = list(zip(*items))

        obs = list(obs)
        rewards = np.asarray(rewards)
        dones = np.asarray(dones)

        self.ts += 1
        dones[self.ts >= self.max_path_length] = True

        reset_obs = self._run_reset(dones)
        for (i, done) in enumerate(dones):
            if done:
                obs[i] = reset_obs[i]
                self.ts[i] = 0
        return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
            list(env_infos))
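
worker_run_step, split_tensor_dict_list, and stack_tensor_dict_list come from the surrounding codebase. As a rough illustration (not the library implementation, and ignoring nested info dicts), the stack/split pair converts between a list of per-env info dicts and a single dict of stacked arrays:

import numpy as np

def stack_tensor_dict_list_sketch(dict_list):
    # One dict of arrays with a new leading axis of size len(dict_list).
    return {k: np.stack([d[k] for d in dict_list]) for k in dict_list[0]}

def split_tensor_dict_list_sketch(stacked):
    # Inverse: slice the leading axis back into per-env dicts.
    n = len(next(iter(stacked.values())))
    return [{k: v[i] for k, v in stacked.items()} for i in range(n)]

env_infos = [{'success': np.array(0.0)}, {'success': np.array(1.0)}]
stacked = stack_tensor_dict_list_sketch(env_infos)  # {'success': array([0., 1.])}
per_env = split_tensor_dict_list_sketch(stacked)    # back to a list of two dicts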
Example #4
    def _concatenate_paths(self, paths):
        """Concatenate paths.

        The input paths are from different rollouts but same task/environment.
        In RL^2, paths within each meta batch are all concatenated into a single
        path and fed to the policy.

        Args:
            paths (list[dict]): Input paths. All paths are from different rollouts,
                but the same task/environment.

        Returns:
            dict: Concatenated paths from the same task/environment. Shape of
                values: :math:`[max_path_length * episode_per_task, S^*]`

        """
        if self._flatten_input:
            observations = np.concatenate([
                self._env_spec.observation_space.flatten_n(
                    path['observations']) for path in paths
            ])
        else:
            observations = np.concatenate(
                [path['observations'] for path in paths])
        actions = np.concatenate([
            self._env_spec.action_space.flatten_n(path['actions'])
            for path in paths
        ])
        valids = np.concatenate(
            [np.ones_like(path['rewards']) for path in paths])
        baselines = np.concatenate(
            [np.zeros_like(path['rewards']) for path in paths])

        concatenated_path = np_tensor_utils.concat_tensor_dict_list(paths)
        concatenated_path['observations'] = observations
        concatenated_path['actions'] = actions
        concatenated_path['valids'] = valids
        concatenated_path['baselines'] = baselines

        return concatenated_path
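
For intuition, a tiny standalone NumPy example (hypothetical rollouts, not tied to any environment) of the bookkeeping above: N rollouts of length max_path_length collapse into flat arrays of length N * max_path_length, with valids of ones and baselines of zeros as placeholders.

import numpy as np

# Two hypothetical rollouts of length 4 from the same task.
paths = [
    {'rewards': np.ones(4), 'actions': np.zeros((4, 2))},
    {'rewards': np.ones(4), 'actions': np.zeros((4, 2))},
]

actions = np.concatenate([p['actions'] for p in paths])
valids = np.concatenate([np.ones_like(p['rewards']) for p in paths])
baselines = np.concatenate([np.zeros_like(p['rewards']) for p in paths])
assert actions.shape == (8, 2)
assert valids.shape == baselines.shape == (8,)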
Example #5
    def from_trajectory_list(cls, env_spec, paths):
        """Create a TrajectoryBatch from a list of trajectories.

        Args:
            env_spec (garage.envs.EnvSpec): Specification for the environment
                from which this data was sampled.
            paths (list[dict[str, np.ndarray or dict[str, np.ndarray]]]): Keys:
                * observations (np.ndarray): Non-flattened array of
                    observations. Typically has shape (T, S^*) (the unflattened
                    state space of the current environment). observations[i]
                    was used by the agent to choose actions[i]. observations
                    may instead have shape (T + 1, S^*).
                * next_observations (np.ndarray): Non-flattened array of
                    observations. Has shape (T, S^*). next_observations[i] was
                    observed by the agent after taking actions[i]. Optional.
                    Note that to ensure all information from the environment
                    was preserved, observations should have shape (T + 1,
                    S^*), or this key should be set. However, this method is
                    lenient and will "duplicate" the last observation if the
                    original last observation has been lost.
                * actions (np.ndarray): Non-flattened array of actions. Should
                    have shape (T, S^*) (the unflattened action space of the
                    current environment).
                * rewards (np.ndarray): Array of rewards of shape (T,) (1D
                    array of length timesteps).
                * dones (np.ndarray): Array of dones of shape (T,) (1D array
                    of length timesteps).
                * agent_infos (dict[str, np.ndarray]): Dictionary of stacked,
                    non-flattened `agent_info` arrays.
                * env_infos (dict[str, np.ndarray]): Dictionary of stacked,
                    non-flattened `env_info` arrays.

        """
        lengths = np.asarray([len(p["rewards"]) for p in paths])
        if all(
                len(path["observations"]) == length + 1
                for (path, length) in zip(paths, lengths)):
            last_observations = np.asarray(
                [p["observations"][-1] for p in paths])
            observations = np.concatenate(
                [p["observations"][:-1] for p in paths])
        else:
            # The number of observations and timesteps must match.
            observations = np.concatenate([p["observations"] for p in paths])
            if paths[0].get("next_observations") is not None:
                last_observations = np.asarray(
                    [p["next_observations"][-1] for p in paths])
            else:
                last_observations = np.asarray(
                    [p["observations"][-1] for p in paths])

        stacked_paths = tensor_utils.concat_tensor_dict_list(paths)
        return cls(
            env_spec=env_spec,
            observations=observations,
            last_observations=last_observations,
            actions=stacked_paths["actions"],
            rewards=stacked_paths["rewards"],
            terminals=stacked_paths["dones"],
            env_infos=stacked_paths["env_infos"],
            agent_infos=stacked_paths["agent_infos"],
            lengths=lengths,
        )
Example #6
    def from_list(cls, env_spec, paths):
        """Create a EpisodeBatch from a list of episodes.

        Args:
            env_spec (EnvSpec): Specification for the environment from which
                this data was sampled.
            paths (list[dict[str, np.ndarray or dict[str, np.ndarray]]]): Keys:
                * observations (np.ndarray): Non-flattened array of
                    observations. Typically has shape (T, S^*) (the unflattened
                    state space of the current environment). observations[i]
                    was used by the agent to choose actions[i]. observations
                    may instead have shape (T + 1, S^*).
                * next_observations (np.ndarray): Non-flattened array of
                    observations. Has shape (T, S^*). next_observations[i] was
                    observed by the agent after taking actions[i]. Optional.
                    Note that to ensure all information from the environment
                    was preserved, observations should have shape (T + 1,
                    S^*), or this key should be set. However, this method is
                    lenient and will "duplicate" the last observation if the
                    original last observation has been lost.
                * actions (np.ndarray): Non-flattened array of actions. Should
                    have shape (T, S^*) (the unflattened action space of the
                    current environment).
                * rewards (np.ndarray): Array of rewards of shape (T,) (1D
                    array of length timesteps).
                * agent_infos (dict[str, np.ndarray]): Dictionary of stacked,
                    non-flattened `agent_info` arrays.
                * env_infos (dict[str, np.ndarray]): Dictionary of stacked,
                    non-flattened `env_info` arrays.
                * step_types (numpy.ndarray): A numpy array of `StepType` with
                    shape (T,) containing the time step types for all
                    transitions in this batch.

        """
        lengths = np.asarray([len(p['rewards']) for p in paths])
        if all(
                len(path['observations']) == length + 1
                for (path, length) in zip(paths, lengths)):
            last_observations = np.asarray(
                [p['observations'][-1] for p in paths])
            observations = np.concatenate(
                [p['observations'][:-1] for p in paths])
        else:
            # The number of observations and timesteps must match.
            observations = np.concatenate([p['observations'] for p in paths])
            if paths[0].get('next_observations') is not None:
                last_observations = np.asarray(
                    [p['next_observations'][-1] for p in paths])
            else:
                last_observations = np.asarray(
                    [p['observations'][-1] for p in paths])

        stacked_paths = tensor_utils.concat_tensor_dict_list(paths)

        # Temporary solution. This logic is not needed if algorithms process
        # step_types instead of dones directly.
        if 'dones' in stacked_paths and 'step_types' not in stacked_paths:
            step_types = np.array([
                StepType.TERMINAL if done else StepType.MID
                for done in stacked_paths['dones']
            ],
                                  dtype=StepType)
            stacked_paths['step_types'] = step_types
            del stacked_paths['dones']

        return cls(env_spec=env_spec,
                   observations=observations,
                   last_observations=last_observations,
                   actions=stacked_paths['actions'],
                   rewards=stacked_paths['rewards'],
                   env_infos=stacked_paths['env_infos'],
                   agent_infos=stacked_paths['agent_infos'],
                   step_types=stacked_paths['step_types'],
                   lengths=lengths)
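
The observation handling above has two branches; here is a standalone sketch (plain NumPy, hypothetical one-path batches) of how last_observations is recovered in each case:

import numpy as np

# Case 1: each path carries T + 1 observations, so the extra final
# observation is split off and the remaining T are concatenated.
paths = [{'rewards': np.ones(3), 'observations': np.arange(4)}]
lengths = np.asarray([len(p['rewards']) for p in paths])
assert all(len(p['observations']) == l + 1 for p, l in zip(paths, lengths))
last_observations = np.asarray([p['observations'][-1] for p in paths])  # [3]
observations = np.concatenate([p['observations'][:-1] for p in paths])  # [0 1 2]

# Case 2: only T observations and no 'next_observations' key, so the
# method is lenient and duplicates the final observation.
paths = [{'rewards': np.ones(3), 'observations': np.arange(3)}]
observations = np.concatenate([p['observations'] for p in paths])       # [0 1 2]
last_observations = np.asarray([p['observations'][-1] for p in paths])  # [2]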
Example #7
    def process_samples(self, itr, paths):
        baselines = []
        returns = []

        if hasattr(self.algo.baseline, 'predict_n'):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path['rewards'] + \
                self.algo.discount * path_baselines[1:] - path_baselines[:-1]
            path['advantages'] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path['returns'] = special.discount_cumsum(path['rewards'],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path['returns'])

        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list(
                [path['observations'] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path['actions'] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path['rewards'] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path['returns'] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path['advantages'] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path['env_infos'] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path['agent_infos'] for path in paths])

            if self.algo.center_adv:
                advantages = utils.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = utils.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path['returns'][0] for path in paths])

            undiscounted_returns = [sum(path['rewards']) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path['advantages']) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path['observations'] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path['advantages'] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path['advantages'] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path['advantages'] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path['actions'] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path['rewards'] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path['returns'] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path['agent_infos'] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path['env_infos'] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path['returns']) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path['returns'][0] for path in paths])

            undiscounted_returns = [sum(path['rewards']) for path in paths]

            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log('fitting baseline...')
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log('fitted')

        tabular.record('Iteration', itr)
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('ExplainedVariance', ev)
        tabular.record('NumTrajs', len(paths))
        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))

        return samples_data
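
special.discount_cumsum is used above for both advantages and returns; it is assumed here to compute the discounted tail sum y[t] = x[t] + discount * y[t + 1]. A plain-NumPy sketch of that assumption, applied to the GAE deltas from the loop above (hypothetical numbers):

import numpy as np

def discount_cumsum_sketch(x, discount):
    # y[t] = x[t] + discount * y[t + 1], computed right to left.
    y = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y

rewards = np.array([1.0, 1.0, 1.0])
path_baselines = np.array([0.5, 0.5, 0.5, 0.0])   # with the appended 0
discount, gae_lambda = 0.99, 0.95
deltas = rewards + discount * path_baselines[1:] - path_baselines[:-1]
advantages = discount_cumsum_sketch(deltas, discount * gae_lambda)
returns = discount_cumsum_sketch(rewards, discount)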
Example #8
    def process_samples_discount(self, itr, paths):
        baselines = []
        returns = []

        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            advantages = []
            path_returns = []
            # path_baselines = all_path_baselines[idx]
            # return_so_far = 0
            # for t in range(len(path["rewards"])-1, -1, -1):
            #     return_so_far = path["rewards"][t] + self.algo.discount * return_so_far
            #     path_returns.append(return_so_far)
            #     advantage = return_so_far - path_baselines[t]
            #     advantages.append(advantage)
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            advantages = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            # correction
            discount_array = self.algo.discount**np.arange(len(
                path["rewards"]))
            path['advantages'] = advantages * discount_array
            # path_returns = special.discount_cumsum(path["rewards"],
            #                                        self.algo.discount)
            # path['returns'] = path_returns * discount_array
            path['returns'] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = utils.center_advantages(advantages)

        if self.algo.positive_adv:
            advantages = utils.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )

        logger.log("fitting Exp_paper...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
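
The "correction" step above down-weights each advantage by discount ** t before concatenation. A minimal numeric illustration (arbitrary values):

import numpy as np

discount = 0.9
advantages = np.array([1.0, 1.0, 1.0])
discount_array = discount ** np.arange(len(advantages))  # [1.0, 0.9, 0.81]
corrected = advantages * discount_array  # later time steps contribute less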