Example no. 1
    def value_step(self, dataset: TrajectoryOnPolicy):
        """
        Take an optimizer step fitting the value function
        Args:
            dataset: NamedTuple with obs, returns, and values
        Returns:
            Dict with loss of the value regression

        """

        obs, returns, old_values = dataset.obs, dataset.returns, dataset.values

        vf_losses = tensorize(0., self.cpu, self.dtype)

        for _ in range(self.val_epochs):
            splits = generate_minibatches(obs.shape[0], self.n_minibatches)

            # Minibatch SGD
            for indices in splits:
                batch = select_batch(indices, returns, old_values, obs)

                sel_returns, sel_old_values, sel_obs = batch
                vs = self.vf_model(sel_obs)

                vf_loss = self.value_loss(vs, sel_returns, sel_old_values)

                self.optimizer_vf.zero_grad()
                vf_loss.backward()
                self.optimizer_vf.step()
                vf_losses += vf_loss.detach()

        steps = self.val_epochs * (math.ceil(
            obs.shape[0] / self.n_minibatches))
        return {"vf_loss": (vf_losses / steps)}
Example no. 2
    def advantage_and_return(self, rewards: ch.Tensor, values: ch.Tensor,
                             dones: ch.Tensor, time_limit_dones: ch.Tensor):
        """
        Calculate advantage (with GAE) and discounted returns.
        Additionally, terminal states that were reached due to an artificial maximum horizon (time limit) receive special treatment.

        GAE: delta_t^V = r_t + gamma * V(s_{t+1}) - V(s_t)
        with
        V(s_{t+1}) = 0            if s_t is terminal
                     V(s_{t+1})   if s_t is not terminal and t != T (last step)
                     V(s_{T+1})   if s_t is not terminal and t == T (bootstrap from the last value estimate)

        Args:
            rewards: Rewards from environment
            values: Value estimates
            dones: Done flags for true termination
            time_limit_dones: Done flags for reaching artificial maximum time limit

        Returns:
            Advantages and Returns

        """
        returns = tensorize(
            np.zeros((self.rollout_steps + 1, self.env_runner.n_envs)),
            self.cpu, self.dtype)
        masks = ~dones
        time_limit_masks = ~time_limit_dones

        if self.use_gae:
            gae = 0
            for step in reversed(range(rewards.size(0))):
                delta = rewards[step] + self.discount_factor * values[
                    step + 1] * masks[step] - values[step]
                gae = delta + self.discount_factor * self.gae_scaling * masks[
                    step] * gae
                # when the artificial time limit is reached, use the current state's value
                # estimate V(s) in place of the bootstrapped target r + gamma * V(s')
                gae = gae * time_limit_masks[step]
                returns[step] = gae + values[step]
        else:
            returns[-1] = values[-1]
            for step in reversed(range(rewards.size(0))):
                returns[step] = (returns[step + 1] * self.discount_factor * masks[step] + rewards[step]) * \
                                time_limit_masks[step] + time_limit_dones[step] * values[step]

        returns = returns[:-1]
        advantages = returns - values[:-1]

        return advantages.clone().detach(), returns.clone().detach()
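
To make the recursion easier to follow, here is the same GAE computation for a single environment without the time-limit handling; a minimal standalone sketch (the function name and the gamma/lam defaults are illustrative):

import numpy as np

def gae_single_env(rewards, values, dones, gamma=0.99, lam=0.95):
    # values holds one more entry than rewards (the bootstrap value for the
    # final state); dones marks true terminations only.
    T = len(rewards)
    advantages = np.zeros(T)
    gae = 0.0
    for t in reversed(range(T)):
        mask = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * mask - values[t]
        gae = delta + gamma * lam * mask * gae
        advantages[t] = gae
    # As in the method above, the lambda-returns are advantage + value.
    returns = advantages + np.asarray(values[:-1])
    return advantages, returns
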
Example no. 3
    def evaluate_policy(self,
                        policy: AbstractGaussianPolicy,
                        render: bool = False,
                        deterministic: bool = True):
        """
        Evaluate a given policy
        Args:
            policy: policy to evaluate
            render: render policy behavior
            deterministic: choosing deterministic actions

        Returns:
            Dict with performance metrics.
        """
        if self.n_test_envs == 0:
            return
        n_runs = 1
        ep_rewards = np.zeros((
            n_runs,
            self.n_test_envs,
        ))
        ep_lengths = np.zeros((
            n_runs,
            self.n_test_envs,
        ))

        for i in range(n_runs):
            not_dones = np.ones((self.n_test_envs, ), dtype=bool)
            obs = self.envs.reset_test()
            while np.any(not_dones):
                ep_lengths[i, not_dones] += 1
                if render:
                    self.envs.render_test(mode="human")
                with ch.no_grad():
                    p = policy(tensorize(obs, self.cpu, self.dtype))
                    actions = p[0] if deterministic else policy.sample(p)
                    actions = policy.squash(actions)
                obs, rews, dones, infos = self.envs.step_test(
                    get_numpy(actions))
                ep_rewards[i, not_dones] += rews[not_dones]

                # once an env has terminated it stays inactive; otherwise auto-resetting envs
                # would keep accumulating reward and we would favor earlier terminating envs.
                not_dones = np.logical_and(~dones, not_dones)

        return self.get_reward_dict(ep_rewards, ep_lengths)
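
get_reward_dict is defined elsewhere; presumably it aggregates the collected episode statistics into scalar metrics. A purely hypothetical sketch of such an aggregation (the function name and dictionary keys are assumptions, not the library's API):

import numpy as np

def reward_dict_sketch(ep_rewards, ep_lengths):
    # Aggregate evaluation episodes into scalar metrics (illustrative only).
    return {
        "eval_reward_mean": float(np.mean(ep_rewards)),
        "eval_reward_std": float(np.std(ep_rewards)),
        "eval_length_mean": float(np.mean(ep_lengths)),
        "eval_length_std": float(np.std(ep_lengths)),
    }
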
Example no. 4
    def run(self,
            rollout_steps,
            policy: AbstractGaussianPolicy,
            vf_model: Union[VFNet, None] = None,
            reset_envs: bool = False) -> TrajectoryOnPolicyRaw:
        """
        Generate trajectories of the environment.
        Args:
            rollout_steps: Number of steps to generate
            policy: Policy model to generate samples for
            vf_model: vf model to generate value estimate for all states.
            reset_envs: Whether to reset all envs in the beginning.

        Returns:
            Trajectory with the respective data as torch tensors.
        """

        # Initialize the tensors that will hold the batch of experiences
        num_envs = self.n_envs

        base_shape = (rollout_steps, num_envs)
        base_shape_p1 = (rollout_steps + 1, num_envs)
        base_action_shape = base_shape + self.envs.action_space.shape

        mb_obs = ch.zeros(base_shape_p1 + self.envs.observation_space.shape,
                          dtype=self.dtype)
        mb_actions = ch.zeros(base_action_shape, dtype=self.dtype)
        mb_rewards = ch.zeros(base_shape, dtype=self.dtype)
        mb_dones = ch.zeros(base_shape, dtype=ch.bool)
        ep_infos = []

        mb_time_limit_dones = ch.zeros(base_shape, dtype=ch.bool)
        mb_means = ch.zeros(base_action_shape, dtype=self.dtype)
        mb_stds = ch.zeros(base_action_shape + self.envs.action_space.shape,
                           dtype=self.dtype)

        # continue from last state
        # Before first step we already have self.obs because env calls self.obs = env.reset() on init
        obs = self.envs.reset() if reset_envs else self.envs.last_obs
        obs = tensorize(obs, self.cpu, self.dtype)

        # For n in range number of steps
        for i in range(rollout_steps):
            # Given the current observations, sample (squashed) actions from the policy
            pds = policy(obs, train=False)
            actions = policy.sample(pds)
            squashed_actions = policy.squash(actions)

            mb_obs[i] = obs
            mb_actions[i] = squashed_actions

            obs, rewards, dones, infos = self.envs.step(
                squashed_actions.cpu().numpy())
            obs = tensorize(obs, self.cpu, self.dtype)

            mb_means[i] = pds[0]
            mb_stds[i] = pds[1]
            mb_time_limit_dones[i] = tensorize(infos["horizon"], self.cpu,
                                               ch.bool)

            if infos.get("done"):
                ep_infos.extend(infos.get("done"))

            mb_rewards[i] = tensorize(rewards, self.cpu, self.dtype)
            mb_dones[i] = tensorize(dones, self.cpu, ch.bool)

        # need the value prediction for the last obs in the rollout to bootstrap the return estimates
        mb_obs[-1] = obs

        # compute all logpacs and value estimates at once --> less computation
        mb_logpacs = policy.log_probability((mb_means, mb_stds), mb_actions)
        mb_values = (vf_model if vf_model else policy.get_value)(mb_obs,
                                                                 train=False)

        out = (mb_obs[:-1], mb_actions, mb_logpacs, mb_rewards, mb_values,
               mb_dones, mb_time_limit_dones, mb_means, mb_stds)

        if not self.cpu:
            out = tuple(map(to_gpu, out))

        if ep_infos:
            ep_infos = np.array(ep_infos)
            ep_length, ep_reward = ep_infos[:, 0], ep_infos[:, 1]
            self.total_rewards.extend(ep_reward)
            self.total_steps.extend(ep_length)

        return TrajectoryOnPolicyRaw(*out)
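
A hedged usage sketch for the method above (the sampler, policy, and vf_model objects are assumed to be constructed elsewhere; the shapes follow from the buffers allocated in run):

# Illustrative only: collect one on-policy rollout and inspect its shapes.
traj = sampler.run(rollout_steps=2048, policy=policy, vf_model=vf_model,
                   reset_envs=True)
obs, actions, logpacs, rewards, values, dones, time_limit_dones, means, stds = traj
# obs:    (rollout_steps, n_envs, *observation_shape)
# values: (rollout_steps + 1, n_envs), including the bootstrap value for the last obs
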
Example no. 5
    def __init__(
        self,
        proj_type: str = "",
        mean_bound: float = 0.03,
        cov_bound: float = 1e-3,
        trust_region_coeff: float = 0.0,
        scale_prec: bool = True,
        entropy_schedule: Union[None, str] = None,
        action_dim: Union[None, int] = None,
        total_train_steps: Union[None, int] = None,
        target_entropy: float = 0.0,
        temperature: float = 0.5,
        entropy_eq: bool = False,
        entropy_first: bool = False,
        do_regression: bool = False,
        regression_iters: int = 1000,
        regression_lr: float = 3e-4,
        optimizer_type_reg: str = "adam",
        cpu: bool = True,
        dtype: ch.dtype = ch.float32,
    ):
        """
        Base projection layer, which can be used to compute metrics for non-projection approaches.
        Args:
           proj_type: Which type of projection to use. None specifies no projection and uses the TRPO objective.
           mean_bound: projection bound for the step size w.r.t. mean
           cov_bound: projection bound for the step size w.r.t. covariance matrix
           trust_region_coeff: Coefficient for projection regularization loss term.
           scale_prec: If True, use the Mahalanobis distance (based on Sigma_old^-1) for projections instead of the Euclidean distance.
           entropy_schedule: Schedule type for entropy projection, one of 'linear', 'exp', None.
           action_dim: number of action dimensions to scale exp decay correctly.
           total_train_steps: total number of training steps to compute appropriate decay over time.
           target_entropy: projection bound for the entropy of the covariance matrix
           temperature: temperature decay for exponential entropy bound
           entropy_eq: Use entropy equality constraints.
           entropy_first: Project entropy before trust region.
           do_regression: Conduct additional regression steps after the policy steps to match projection and policy.
           regression_iters: Number of regression steps.
           regression_lr: Regression learning rate.
           optimizer_type_reg: Optimizer for regression.
           cpu: Compute on CPU only.
           dtype: Data type to use, either float32 or float64. The latter may be necessary for higher
                   dimensions in order to learn the full covariance.
        """

        # projection and bounds
        self.proj_type = proj_type
        self.mean_bound = tensorize(mean_bound, cpu=cpu, dtype=dtype)
        self.cov_bound = tensorize(cov_bound, cpu=cpu, dtype=dtype)
        self.trust_region_coeff = trust_region_coeff
        self.scale_prec = scale_prec

        # projection utils
        assert (action_dim and total_train_steps) if entropy_schedule else True
        self.entropy_proj = entropy_equality_projection if entropy_eq else entropy_inequality_projection
        self.entropy_schedule = get_entropy_schedule(entropy_schedule,
                                                     total_train_steps,
                                                     dim=action_dim)
        self.target_entropy = tensorize(target_entropy, cpu=cpu, dtype=dtype)
        self.entropy_first = entropy_first
        self.entropy_eq = entropy_eq
        self.temperature = temperature
        self._initial_entropy = None

        # regression
        self.do_regression = do_regression
        self.regression_iters = regression_iters
        self.lr_reg = regression_lr
        self.optimizer_type_reg = optimizer_type_reg
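
A hedged construction example for the layer above (the class name BaseProjectionLayer is inferred from the docstring; the action dimensionality and training budget are illustrative):

# Illustrative only: a base layer without projection (proj_type="") that still
# tracks a linearly decaying entropy bound.
projection = BaseProjectionLayer(
    proj_type="",
    mean_bound=0.03,
    cov_bound=1e-3,
    trust_region_coeff=10.0,
    entropy_schedule="linear",
    action_dim=6,                 # assumed action dimensionality
    total_train_steps=1_000_000,  # assumed training budget
    target_entropy=-6.0,
)
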
Example no. 6
    def policy_step(self, dataset: TrajectoryOnPolicy):
        """
        Policy optimization step
        Args:
            dataset: NamedTuple with obs, actions, logpacs, returns, advantages, values, and q
        Returns:
            Dict with total loss, policy loss, entropy loss, and trust region loss
        """

        obs, actions, old_logpacs, returns, advantages, q = \
            dataset.obs, dataset.actions, dataset.logpacs, dataset.returns, dataset.advantages, dataset.q

        losses, vf_losses, surrogates, entropy_losses, trust_region_losses = \
            [tensorize(0., self.cpu, self.dtype) for _ in range(5)]

        # set initial entropy value in first step to calculate appropriate entropy decay
        if self.projection.initial_entropy is None:
            self.projection.initial_entropy = self.policy.entropy(q).mean()

        for _ in range(self.epochs):
            batch_indices = generate_minibatches(obs.shape[0],
                                                 self.n_minibatches)

            # Minibatches SGD
            for indices in batch_indices:
                batch = select_batch(indices, obs, actions, old_logpacs,
                                     advantages, q[0], q[1])
                b_obs, b_actions, b_old_logpacs, b_advantages, b_old_mean, b_old_std = batch
                b_q = (b_old_mean, b_old_std)

                p = self.policy(b_obs)
                proj_p = self.projection(self.policy, p, b_q,
                                         self._global_steps)

                new_logpacs = self.policy.log_probability(proj_p, b_actions)

                # Calculate policy rewards
                surrogate_loss = self.surrogate_loss(b_advantages, new_logpacs,
                                                     b_old_logpacs)

                # Calculate entropy bonus
                entropy_loss = -self.entropy_coeff * self.policy.entropy(
                    proj_p).mean()

                # Trust region loss
                trust_region_loss = self.projection.get_trust_region_loss(
                    self.policy, p, proj_p)

                # Total loss
                loss = surrogate_loss + entropy_loss + trust_region_loss

                # If we are sharing weights, take the value step simultaneously
                if self.vf_coeff > 0 and not self.vf_model:
                    # if no vf model is present, the model is part of the policy, therefore has to be trained jointly
                    batch_vf = select_batch(indices, returns, dataset.values)
                    vs = self.policy.get_value(b_obs)
                    vf_loss = self.value_loss(
                        vs, *batch_vf)  # (b_returns, b_old_values)
                    loss += self.vf_coeff * vf_loss
                    vf_losses += vf_loss.detach()

                self.optimizer.zero_grad()
                loss.backward()
                if self.max_grad_norm > 0:
                    ch.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                                self.max_grad_norm)
                self.optimizer.step()

                surrogates += surrogate_loss.detach()
                trust_region_losses += trust_region_loss.detach()
                entropy_losses += entropy_loss.detach()
                losses += loss.detach()

        steps = self.epochs * (math.ceil(obs.shape[0] / self.n_minibatches))
        loss_dict = {
            "loss": (losses / steps).detach(),
            "policy_loss": (surrogates / steps).detach(),
            "entropy_loss": (entropy_losses / steps).detach(),
            "trust_region_loss": (trust_region_losses / steps).detach()
        }

        if not self.vf_model:
            loss_dict.update({"vf_loss": (vf_losses / steps).detach()})

        if not self.policy.contextual_std and self.projection.proj_type not in [
                "ppo", "papi"
        ]:
            # set policy with projection value without doing regression.
            # In non-contextual cases we have only one cov, so the projection is the same for all samples.
            self.policy.set_std(proj_p[1][0].detach())

        return loss_dict
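
For context, self.surrogate_loss above is assumed to be the standard importance-sampling surrogate; a minimal sketch (whether a PPO-style clip is applied depends on the configured projection, so only the plain ratio form is shown; the function name is illustrative):

def surrogate_loss_sketch(advantages, new_logpacs, old_logpacs):
    # Importance-weighted policy objective on torch tensors: maximize
    # E[ratio * advantage], returned negated so it can be minimized.
    ratio = (new_logpacs - old_logpacs).exp()
    return -(ratio * advantages).mean()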