Example #1
    def _forward_policy(self, episodes, ratio=False):
        T = episodes.observations.size(0)
        values, log_probs, entropy = [], [], []
        if not self.use_clstm:
            hx = torch.zeros(self.num_workers,
                             self.lstm_size).to(device=self.device)
        else:
            hx = torch.zeros(self.num_workers, self.lstm_size, 7,
                             7).to(device=self.device)

        for t in range(T):
            pi, v, hx = self.policy(episodes.observations[t], hx,
                                    episodes.embeds[t])
            #pi, v = self.policy(episodes.observations[t])
            values.append(v)
            entropy.append(pi.entropy())
            if ratio:
                log_probs.append(
                    pi.log_prob(episodes.actions[t]) - episodes.logprobs[t])
            else:
                log_probs.append(pi.log_prob(episodes.actions[t]))

        log_probs = torch.stack(log_probs)
        values = torch.stack(values)
        entropy = torch.stack(entropy)
        advantages = episodes.gae(values, tau=self.tau)
        advantages = weighted_normalize(advantages, weights=episodes.mask)
        if log_probs.dim() > 2:
            log_probs = torch.sum(log_probs, dim=2)

        return log_probs, advantages, values, entropy
Example #2
    def compute_advantages(self, baseline, gae_lambda=1.0, normalize=True):
        # Compute the values based on the baseline
        values = baseline(self).detach()
        # Add an additional 0 at the end of values for
        # the estimation at the end of the episode
        values = F.pad(values * self.mask, (0, 0, 0, 1))

        # Compute the advantages based on the values
        deltas = self.rewards + self.gamma * values[1:] - values[:-1]
        self._advantages = torch.zeros_like(self.rewards)
        gae = torch.zeros((self.batch_size, ), dtype=torch.float32)
        for i in range(len(self) - 1, -1, -1):
            gae = gae * self.gamma * gae_lambda + deltas[i]
            self._advantages[i] = gae

        # Normalize the advantages
        if normalize:
            self._advantages = weighted_normalize(self._advantages,
                                                  lengths=self.lengths)
        # Once the advantages are computed, the returns are not necessary
        # anymore (only to compute the parameters of the baseline)
        del self._returns
        del self._mask

        return self.advantages
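For reference, the backward loop above is the standard GAE(lambda) recursion: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), followed by A_t = delta_t + gamma * lambda * A_{t+1}. A minimal single-trajectory sketch of that recursion (the function and argument names below are illustrative, not part of the original code):

import torch

def gae_single_trajectory(rewards, values, gamma=0.95, lam=1.0):
    # rewards: shape (T,); values: shape (T + 1,), where values[T] is the
    # bootstrap value after the last step (the zero padding above).
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = torch.zeros_like(rewards)
    gae = torch.tensor(0.0)
    for t in reversed(range(rewards.size(0))):
        gae = deltas[t] + gamma * lam * gae
        advantages[t] = gae
    return advantages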
Example #3
def test_weighted_normalize():
    lengths = [2, 3, 7, 5, 11]
    # Inputs
    inputs_np = np.random.rand(13, 5).astype(np.float32)

    # Pytorch
    inputs_th = torch.as_tensor(inputs_np)
    normalized_th = weighted_normalize(inputs_th, lengths=lengths)

    for i, length in enumerate(lengths):
        assert (normalized_th[length:, i] == 0.).all()
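The assertion above only pins down that weighted_normalize zeroes out entries beyond each episode's length. A plausible masked-normalization sketch consistent with that behaviour (this is an assumption about the helper, not its actual implementation):

import torch

def weighted_normalize_sketch(tensor, lengths, epsilon=1e-8):
    # tensor: shape (T, batch); lengths: number of valid timesteps per column.
    T = tensor.size(0)
    mask = (torch.arange(T).unsqueeze(1) < torch.as_tensor(lengths)).float()
    num = mask.sum()
    mean = (tensor * mask).sum() / num
    var = (((tensor - mean) * mask) ** 2).sum() / num
    # Standardize over the valid entries only, then re-apply the mask so
    # padded timesteps stay exactly zero (as the test checks).
    return mask * (tensor - mean) / (torch.sqrt(var) + epsilon)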
Example #4
    def surrogate_loss(self, episodes, old_pis=None):
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            if self.usePPO:
                params, grad_norm = self.adapt_ppo(train_episodes)
            else:
                params = self.adapt(train_episodes)
            self.logger.info("in surrogate_loss")
            with torch.set_grad_enabled(old_pi is None):
                if self.baseline_type == 'critic shared':
                    pi, _ = self.policy(valid_episodes.observations,
                                        params=params)
                else:
                    pi = self.policy(valid_episodes.observations,
                                     params=params)
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                if self.baseline_type == 'linear':
                    values = self.baseline(valid_episodes)
                elif self.baseline_type == 'critic separate':
                    values = self.baseline(valid_episodes.observations)
                elif self.baseline_type == 'critic shared':
                    _, values = self.policy(valid_episodes.observations,
                                            params=params)

                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss = -weighted_mean(
                    ratio * advantages, dim=0, weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), pis)
Example #5
def val(args, sampler_val, policy, baseline, batch):
    start_time = time.time()

    from maml_rl.utils.torch_utils import weighted_normalize, weighted_mean
    tasks_val = sampler_val.sample_tasks()
    task_to_episodes = dict()
    for task in tasks_val:
        task_episodes = []
        sampler_val.reset_task(task)
        for i_episode in range(args.num_adapt_val + 1):
            if i_episode == 0:
                params = None
            episodes = sampler_val.sample(policy,
                                          params=params,
                                          gamma=args.gamma,
                                          device=args.device)

            # compute inner loss
            baseline.fit(episodes)
            values = baseline(episodes)
            advantages = episodes.gae(values, tau=args.tau)
            advantages = weighted_normalize(advantages, weights=episodes.mask)

            pi = policy(episodes.observations, params=params)
            log_probs = pi.log_prob(episodes.actions)
            if log_probs.dim() > 2:
                log_probs = torch.sum(log_probs, dim=2)
            entropy = pi.entropy().mean()
            loss = -weighted_mean(
                log_probs * advantages, dim=0,
                weights=episodes.mask) - args.entropy_coef_val * entropy
            fast_lr = args.fast_lr if i_episode == 0 else args.fast_lr_val_after_one
            if i_episode <= args.num_adapt_val:
                params = policy.update_params(loss,
                                              step_size=fast_lr,
                                              first_order=True)
            task_episodes.append(episodes)
        task_to_episodes[str(task)] = task_episodes

    for i_episode in range(args.num_adapt_val + 1):
        returns = calculate_returns([
            task_episodes[i_episode].rewards
            for task_episodes in task_to_episodes.values()
        ])
        logger.logkv(f'val_return_avg_adapt{i_episode}', returns.mean().item())
        logger.logkv(f'val_return_std_adapt{i_episode}', returns.std().item())

    logger.logkv('val_time', time.time() - start_time)

    save_dir = os.path.join(args.log_dir, 'val')
    os.makedirs(save_dir, exist_ok=True)
    pickle.dump(task_to_episodes,
                open(os.path.join(save_dir, f'val_{batch}.pkl'), 'wb'))
Example #6
    def surrogate_loss(self, episodes, old_pis=None):
        losses, kls, action_dists, critic_losses = [], [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            policy_params, critic_params = self.adapt(train_episodes)
            with torch.set_grad_enabled(old_pi is None):
                action_dist = self.policy(valid_episodes.observations,
                                          params=policy_params)
                action_dists.append(detach_distribution(action_dist))

                if old_pi is None:
                    old_pi = detach_distribution(action_dist)

                values = self.critic(valid_episodes.observations,
                                     params=critic_params)
                advantages = valid_episodes.gae(values, tau=self.tau)
                value_loss = weighted_mean(advantages.pow(2),
                                           dim=0,
                                           weights=valid_episodes.mask)
                critic_losses.append(value_loss)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask,
                                                epsilon=1e-5)

                log_ratio = (action_dist.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss = -weighted_mean(ratio * advantages.detach(),
                                      dim=0,
                                      weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(action_dist, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), action_dists,
                torch.mean(torch.stack(critic_losses, dim=0)))
Example #7
    def surrogate_loss(self, episodes, old_pis=None):
        """
        Using TRPO.

        old_pis are not None only when doing line search?
        How are old_pis used? Like the behavior policy in TRPO? How?
        """
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            # adapt our policy network to a new task
            params = self.adapt(train_episodes)
            # doing learning only when old_pi is None?
            with torch.set_grad_enabled(old_pi is None):
                pi = self.policy(valid_episodes.observations, params=params)
                # the set of policies adapted to each task
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss = -weighted_mean(
                    ratio * advantages, dim=0, weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), pis)
Example #8
    def inner_loss(self, episodes, params=None):
        """Compute the inner loss for the one-step gradient update. The inner 
        loss is REINFORCE with baseline [2], computed on advantages estimated 
        with Generalized Advantage Estimation (GAE, [3]).
        """
        values = self.baseline(episodes)
        advantages = episodes.gae(values, tau=self.tau)
        advantages = weighted_normalize(advantages, weights=episodes.mask)

        pi = self.policy(episodes.observations, params=params)
        log_probs = pi.log_prob(episodes.actions)
        if log_probs.dim() > 2:
            log_probs = torch.sum(log_probs, dim=2)
        loss = -weighted_mean(log_probs * advantages, dim=0)

        return loss
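In equation form, the inner loss above is REINFORCE with a baseline on GAE advantages (a restatement of the code, with tau playing the role of the GAE lambda):

\mathcal{L}_{\text{inner}}(\theta) = -\,\mathbb{E}_t\big[\log \pi_\theta(a_t \mid s_t)\,\hat{A}_t\big],
\qquad
\hat{A}_t = \sum_{l \ge 0} (\gamma\tau)^l\,\delta_{t+l},
\qquad
\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t).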
Example #9
    def surrogate_loss(self, episodes, old_pis=None):
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)
            with torch.set_grad_enabled(old_pi is None):
                pi = self.policy(valid_episodes.observations, params=params)
                # detach the mu and scale parameters of distribution pi so that no gradients flow to them
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)
                # initially 0; during the line search pi changes while old_pi stays fixed
                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                # print('log_ratio: ',log_ratio)
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)
                # print('ratio: ', ratio)
                # print('advantages: ', advantages)
                loss = -weighted_mean(
                    ratio * advantages, dim=0, weights=valid_episodes.mask)
                # the weighted_mean loss is very small, on the order of 1e-8
                print('loss: ', loss)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), pis)
Example #10
    def surrogate_loss(self, episodes, old_pis=None):
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)
            with torch.set_grad_enabled(True):
                pi = self.policy(valid_episodes.observations, params=params)
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss_clipped = ratio.clamp(1.0 - self.ppo_ratio,
                                           1.0 + self.ppo_ratio) * advantages
                loss = ratio * advantages

                loss = -torch.min(loss, loss_clipped)

                loss = weighted_mean(loss, dim=0, weights=valid_episodes.mask)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(old_pi, pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)
                losses.append(loss + kl * 0.0005)

        return (torch.mean(torch.stack(losses, dim=0)),
                torch.mean(torch.stack(kls, dim=0)), pis)
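A tiny numeric sketch of the clipped-ratio objective used above, on toy tensors (the values are illustrative only, and clip plays the role of self.ppo_ratio):

import torch

ratio = torch.tensor([0.5, 1.0, 1.6])        # per-step importance ratios
advantages = torch.tensor([1.0, -1.0, 2.0])  # normalized advantages
clip = 0.2

unclipped = ratio * advantages
clipped = ratio.clamp(1.0 - clip, 1.0 + clip) * advantages
# Taking the elementwise minimum before negating makes the surrogate
# pessimistic: moving the ratio far from 1 cannot increase the objective.
loss = -torch.min(unclipped, clipped).mean()
print(loss)  # tensor(-0.6333)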
Example #11
    def inner_loss(self, episodes, params=None):
        """Compute the inner loss for the one-step gradient update. The inner 
        loss is REINFORCE with baseline [2], computed on advantages estimated 
        with Generalized Advantage Estimation (GAE, [3]).
        The baseline is subtracted from the empirical return to reduce
        variance of the optimization. Here, a linear function of a
        time-varying feature vector is used as the baseline.
        """
        values = self.baseline(episodes)
        advantages = episodes.gae(values, tau=self.tau)
        advantages = weighted_normalize(advantages, weights=episodes.mask)

        pi = self.policy(episodes.observations, params=params)
        log_probs = pi.log_prob(episodes.actions)
        if log_probs.dim() > 2:
            log_probs = torch.sum(log_probs, dim=2)
        loss = -weighted_mean(log_probs * advantages, dim=0, weights=episodes.mask)

        return loss
Example #12
    def inner_loss(self, episodes, l_params=None, h_params=None):
        """Compute the inner loss for the one-step gradient update. The inner
        loss is REINFORCE with baseline [2], computed on advantages estimated
        with Generalized Advantage Estimation (GAE, [3]).
        """
        values = self.baseline(episodes)
        advantages = episodes.gae(values, tau=self.tau)
        advantages = weighted_normalize(advantages, weights=episodes.mask)

        # First we calculate the latent space actions from the higher level policy (stored in episodes).
        # Then we calculate the lower level actions using the higher level actions
        pi_higher = self.h_policy(episodes.observations, params=h_params)
        # Calculate the log probability
        log_probs = pi_higher.log_prob(episodes.higher_level_actions)
        if log_probs.dim() > 2:
            log_probs = torch.sum(log_probs, dim=2)
        loss = -weighted_mean(log_probs * advantages, dim=0)

        return loss
Example #13
    def surrogate_loss(self, episodes, old_pis=None):
        """Computes the surrogate loss in TRPO:
        (pi(a|s) / q(a|s)) * Q(s,a) in Eqn 14
        Because the meta-loss tries to find theta that minimizes the
        loss under the adapted parameters phi, the loss is computed with the valid episodes.
        """
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
            params = self.adapt(train_episodes)

            with torch.set_grad_enabled(old_pi is None):
                pi = self.policy(valid_episodes.observations, params=params)
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages, weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) - old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)  # Convert back to ratio from log

                loss = -weighted_mean(ratio * advantages, dim=0, weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
                kls.append(kl)

        return (
            torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)), 
            pis)
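As an equation, the per-task surrogate built in this loop (and in the other surrogate_loss variants above) is the importance-weighted objective, with the KL term serving as the trust-region constraint:

L(\theta) = -\,\mathbb{E}\!\left[\frac{\pi_\theta(a \mid s)}{\pi_{\text{old}}(a \mid s)}\,\hat{A}(s, a)\right],
\qquad
\text{subject to } \mathbb{E}\big[D_{\mathrm{KL}}(\pi_\theta \,\|\, \pi_{\text{old}})\big] \le \text{max\_kl}.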
Example #14
    def inner_loss(self, episodes, params=None):
        """Compute the inner loss for the one-step gradient update. The inner
        loss is REINFORCE with baseline [2], computed on advantages estimated
        with Generalized Advantage Estimation (GAE, [3]).
        https://pytorch.org/docs/0.3.1/distributions.html (except using advantages
        instead of rewards.) Implements Eq. 4.
        """

        vf_loss = -1
        loss = 0
        if self.baseline_type == 'linear':
            values = self.baseline(episodes)
        elif self.baseline_type == 'critic separate':
            values = self.baseline(episodes.observations)
            # find value loss sum [(R-V(s))^2]
            R = episodes.returns.view([200, 20, 1])
            vf_loss = (((values - R)**2).mean())**(1 / 2)
        #else:
        #    pi,values = self.policy(episodes.observations)
        #    pi,vi = self.policy(episodes.observations,params=params)
        #    log_probs = pi.log_prob(values.size())
        #    loss = (((values - R) ** 2).mean()) ** (1 / 2)

        advantages = episodes.gae(values, tau=self.tau)
        advantages_unnorm = advantages
        sum_adv = torch.sum(advantages_unnorm).numpy()
        logging.info("unnormalized advantages: " + str(sum_adv))
        logging.info("sum of returns:" + str(torch.sum(episodes.returns)))

        advantages = weighted_normalize(advantages, weights=episodes.mask)

        pi = self.policy(episodes.observations, params=params)
        log_probs = pi.log_prob(episodes.actions)
        if log_probs.dim() > 2:
            # sum over all the workers.
            log_probs = torch.sum(log_probs, dim=2)
        loss = loss - weighted_mean(
            log_probs * advantages, dim=0, weights=episodes.mask)
        logging.info("inner loss: " + str(loss))

        return loss, vf_loss
Example #15
    def surrogate_loss(self, episodes, old_pis=None):
        """
        Surrogate objective:
        E_r SmoothReLU( V_r^{adapted self.policy} - \max_{\pi \in self.policies[0:policy_idx - 1]} V_r^\pi)

        V_r^{adapted self.policy} can be evaluated by valid_episodes in episodes
        \max_{\pi \in self.policies[0:policy_idx - 1]} V_r^\pi is computed in self.values_of_optimized_policies

        :param episodes: [(episodes before adapting, episodes after adapting) for task in sampled tasks]
        :param old_pis: dummy parameter derived from super
        :return: mean of losses, mean of kls, pis
        """
        losses, kls, pis = [], [], []
        if old_pis is None:
            old_pis = [None] * len(episodes)

        for episode_index in range(len(episodes)):
            (train_episodes, valid_episodes) = episodes[episode_index]
            old_pi = old_pis[episode_index]

            if self.current_policy_idx == 0:
                dominance_correction = 1
            else:
                difference_from_best_value = total_rewards(
                    valid_episodes.rewards
                ) - self.values_of_optimized_policies[episode_index]
                dominance_correction = 1 - 1 / (
                    1 + math.exp(difference_from_best_value))

            params = self.adapt(train_episodes)
            with torch.set_grad_enabled(old_pi is None):
                pi = self.policy(valid_episodes.observations, params=params)
                pis.append(detach_distribution(pi))

                if old_pi is None:
                    old_pi = detach_distribution(pi)

                values = self.baseline(valid_episodes)
                advantages = valid_episodes.gae(values, tau=self.tau)
                advantages = weighted_normalize(advantages,
                                                weights=valid_episodes.mask)

                log_ratio = (pi.log_prob(valid_episodes.actions) -
                             old_pi.log_prob(valid_episodes.actions))
                if log_ratio.dim() > 2:
                    log_ratio = torch.sum(log_ratio, dim=2)
                ratio = torch.exp(log_ratio)

                loss = -dominance_correction * weighted_mean(
                    ratio * advantages, dim=0, weights=valid_episodes.mask)
                losses.append(loss)

                mask = valid_episodes.mask
                if valid_episodes.actions.dim() > 2:
                    mask = mask.unsqueeze(2)
                kl = weighted_mean(kl_divergence(pi, old_pi),
                                   dim=0,
                                   weights=mask)
                kls.append(kl)

        if len(losses) == 0 or len(kls) == 0:
            # signal to the caller that there are no losses, avoiding taking the mean of empty tensors
            return (None, None, pis)
        else:
            return (torch.mean(torch.stack(losses, dim=0)),
                    torch.mean(torch.stack(kls, dim=0)), pis)
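The dominance_correction above is a logistic (sigmoid) weight on the gap between the adapted policy's return and the best value among previously optimized policies; a minimal restatement:

import math

def dominance_correction(value_gap):
    # 1 - 1 / (1 + exp(gap)) == sigmoid(gap): near 0 when the adapted policy
    # is much worse than the best earlier policy, near 1 when it is much
    # better, and exactly 0.5 when they tie.
    return 1 - 1 / (1 + math.exp(value_gap))

print(dominance_correction(-5.0))  # ~0.0067
print(dominance_correction(0.0))   # 0.5
print(dominance_correction(5.0))   # ~0.9933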
Example #16
def main(args):
    np.random.seed(RANDOM_SEED)

    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    sampler = MrcBatchSampler(args.env_name,
                              batch_size=args.fast_batch_size,
                              train_folder=TRAIN_TRACES)

    policy = ActorNet(input_size=[S_INFO, S_LEN],
                      output_size=A_DIM,
                      learning_rate=ACTOR_LR_RATE)
    baseline = CriticNet(input_size=[S_INFO, S_LEN],
                         output_size=A_DIM,
                         learning_rate=CRITIC_LR_RATE)

    # metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
    #     fast_lr=args.fast_lr, tau=args.tau, device=args.device)

    for batch in range(args.num_batches):
        print(
            "==================================================================="
        )
        print("=====================Now epoch: ", batch,
              "========================")

        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        sampler.reset_task(0.5)
        episodes = sampler.sample(policy, gamma=args.gamma, device=args.device)

        rewards = np.array(episodes.rewards)
        rewards = rewards.sum(0)
        mean_reward = rewards.mean()

        entropys = np.array(episodes.entropys).sum(0)
        mean_entropy = entropys.mean()

        values = baseline(episodes.observations)
        advantages = episodes.gae(values, tau=1)
        advantages = weighted_normalize(advantages, weights=episodes.mask)
        advantages = np.array(advantages).sum(0)
        mean_ad = advantages.mean()
        print("  mean Ad: ", mean_ad, "  mean reward:", mean_reward,
              "  mean_entropy: ", mean_entropy)

        # episodes = metalearner.sample(tasks, first_order=args.first_order)
        # metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
        #     cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps,
        #     ls_backtrack_ratio=args.ls_backtrack_ratio)

        baseline.fit(episodes, CRITIC_LR_RATE)
        policy.fit(episodes, baseline, ACTOR_LR_RATE)

        if not batch % 4:
            noMetaTest(policy, baseline, batch)
            # print("total_rewards", total_rewards([ep.rewards for ep in episodes]))
            # print('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]))
            # for taskid in range(args.meta_bath_size):
            #     before = episodes[0][0].rewards
            #     print()

            # metaTest(policy, baseline, batch)

            # Save policy network
            with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                      'wb') as f:
                torch.save(policy.state_dict(), f)

            with open(
                    os.path.join(save_folder, 'baseline-{0}.pt'.format(batch)),
                    'wb') as f:
                torch.save(baseline.state_dict(), f)
Example #17
    def compute_ng_gradient(self,
                            episodes,
                            max_kl=1e-3,
                            cg_iters=20,
                            cg_damping=1e-2,
                            ls_max_steps=10,
                            ls_backtrack_ratio=0.5):
        ng_grads = []
        for train_episodes, valid_episodes in episodes:
            params, step_size, step = self.adapt(train_episodes)

            # compute $grad = \nabla_x J^{lvc}(x) at x = \theta - \eta U(\theta)
            pi = self.policy(valid_episodes.observations, params=params)
            pi_detach = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = pi.log_prob(
                valid_episodes.actions) - pi_detach.log_prob(
                    valid_episodes.actions)
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(
                ratio * advantages, dim=0, weights=valid_episodes.mask)

            ng_grad_0 = torch.autograd.grad(
                loss, self.policy.parameters())  # no create graph
            ng_grad_0 = parameters_to_vector(ng_grad_0)
            # compute the inverse of the Fisher matrix at x = \theta times $grad with Conjugate Gradient
            hessian_vector_product = self.hessian_vector_product_ng(
                train_episodes, damping=cg_damping)
            F_inv_grad = conjugate_gradient(hessian_vector_product,
                                            ng_grad_0,
                                            cg_iters=cg_iters)

            # compute $ng_grad_1 = \nabla^2 J^{lvc}(x) at x = \theta times $F_inv_grad
            # create graph for higher differential
            # self.baseline.fit(train_episodes)
            loss = self.inner_loss(train_episodes)
            grad = torch.autograd.grad(loss,
                                       self.policy.parameters(),
                                       create_graph=True)
            grad = parameters_to_vector(grad)
            grad_F_inv_grad = torch.dot(grad, F_inv_grad.detach())
            ng_grad_1 = torch.autograd.grad(grad_F_inv_grad,
                                            self.policy.parameters())
            ng_grad_1 = parameters_to_vector(ng_grad_1)
            # compute $ng_grad_2 = the Jacobian of {F(x) U(\theta)} at x = \theta times $F_inv_grad
            hessian_vector_product = self.hessian_vector_product_ng(
                train_episodes, damping=cg_damping)
            F_U = hessian_vector_product(step)
            ng_grad_2 = torch.autograd.grad(
                torch.dot(F_U, F_inv_grad.detach()), self.policy.parameters())
            ng_grad_2 = parameters_to_vector(ng_grad_2)
            ng_grad = ng_grad_0 - step_size * (ng_grad_1 + ng_grad_2)

            ng_grad = parameters_to_vector(ng_grad)
            ng_grads.append(ng_grad.view(len(ng_grad), 1))

        return torch.mean(torch.stack(ng_grads, dim=1), dim=[1, 2])
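compute_ng_gradient relies on conjugate_gradient to approximate F^{-1} g without ever forming the Fisher matrix, using only Fisher-vector products. A minimal sketch of that standard routine (the function name and signature below are illustrative; it assumes f_Av maps a vector v to (F + damping * I) v, as hessian_vector_product does above):

import torch

def conjugate_gradient_sketch(f_Av, b, cg_iters=10, residual_tol=1e-10):
    # Iteratively solve A x = b for symmetric positive-definite A, given only
    # the matrix-vector product f_Av(v) = A @ v.
    x = torch.zeros_like(b)
    r = b.clone()
    p = b.clone()
    rdotr = torch.dot(r, r)
    for _ in range(cg_iters):
        Ap = f_Av(p)
        alpha = rdotr / torch.dot(p, Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x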
Example #18
    def inner_loss_ppo_noUpdate(self,
                                episodes,
                                first_order,
                                params=None,
                                ent_coef=0,
                                vf_coef=0,
                                nenvs=1):
        """Compute the inner loss for the one-step gradient update. The inner
        loss is PPO with clipped ratio = new_pi/old_pi.
        Can make cliprange adaptable.
        nenvs = number of workers. nsteps defined in env
        """
        #episodes = [num of steps, num of episodes, obs_space]
        #NEED TO CHANGE ADVANTAGE CALCULATION TO CRITIC.
        losses = []

        self.logger.info("cliprange: " + str(self.cliprange) +
                         "; noptepochs: " + str(self.noptepochs) +
                         "; nminibatches: " + str(self.nminibatches) +
                         "; ppo_lr: " + str(self.ppo_lr))
        # Save the old parameters
        old_policy = copy.deepcopy(self.policy)
        old_params = parameters_to_vector(old_policy.parameters())

        # Take mini-batches of sampled examples and do the gradient update a few times.
        nepisodes = episodes.observations.shape[1]
        nsteps = episodes.observations.shape[0]
        nbatch = nenvs * nsteps * nepisodes
        nbatch_train = nbatch // self.nminibatches
        mblossvals = []

        # Flatten the episodes into [steps, observations]
        episodes_flat = BatchEpisodes(batch_size=nbatch)
        i = 0
        for ep in range(nepisodes):
            for step in range(nsteps):
                episodes_flat.append([episodes.observations[step][ep].numpy()],
                                     [episodes.actions[step][ep].numpy()],
                                     [episodes.returns[step][ep].numpy()],
                                     (i, ))
                i += 1

        inds = np.arange(nbatch)

        # For the case with linear baseline.
        vf_loss = -1

        for epoch in range(self.noptepochs):

            # Randomize the indexes
            #np.random.shuffle(inds)
            mb_vf_loss = torch.zeros(1)
            grad_norm = []
            # 0 to batch_size with batch_train_size step
            for start in range(0, nbatch, nbatch_train):

                mb_obs, mb_returns, mb_masks, mb_actions = [], [], [], []
                mb_episodes = BatchEpisodes(batch_size=nbatch_train)

                end = start + nbatch_train
                mbinds = inds[start:end]

                for i in range(len(mbinds)):
                    mb_obs.append(
                        episodes_flat.observations[0][mbinds[i]].numpy())
                    mb_returns.append(
                        episodes_flat.returns[0][mbinds[i]].numpy())
                    mb_masks.append(episodes_flat.mask[0][mbinds[i]].numpy())
                    mb_actions.append(
                        episodes_flat.actions[0][mbinds[i]].numpy())
                    mb_episodes.append([mb_obs[i]], [mb_actions[i]],
                                       [mb_returns[i]], (i, ))

                if self.baseline_type == 'linear':
                    values = self.baseline(mb_episodes)
                elif self.baseline_type == 'critic separate':
                    values = self.baseline(mb_episodes.observations)
                    # find value loss sum [(R-V(s))^2]
                    R = torch.FloatTensor(np.array(mb_returns))
                    mb_vf_loss = (((values - R)**2).mean()) + mb_vf_loss

                #values = self.baseline(mb_episodes)

                advantages = mb_episodes.gae(values, tau=self.tau)
                advantages_unnorm = advantages
                advantages = weighted_normalize(advantages.type(torch.float32),
                                                weights=torch.ones(
                                                    1, advantages.shape[1]))

                mb_returns_sum = np.sum(mb_returns)
                self.logger.info("iter: " + "epoch:" + str(epoch) + "; mb:" +
                                 str(start / nbatch_train))
                self.logger.info("mb returns: " + str(mb_returns_sum))

                pi = self.policy(mb_episodes.observations)
                log_probs = pi.log_prob(mb_episodes.actions)

                #reload old policy.
                vector_to_parameters(old_params, old_policy.parameters())
                pi_old = old_policy(mb_episodes.observations)

                log_probs_old = pi_old.log_prob(mb_episodes.actions)

                if log_probs.dim() > 2:
                    log_probs_old = torch.sum(log_probs_old, dim=2)
                    log_probs = torch.sum(log_probs, dim=2)

                ratio = torch.exp(log_probs - log_probs_old)

                self.logger.info("max pi: ")
                self.logger.info(torch.max(pi.mean))

                for x in ratio[0][:10]:
                    if x > 1E5 or x < 1E-5:
                        #pdb.set_trace()
                        self.logger.info("ratio too large or too small.")
                        self.logger.info(ratio[0][:10])

                self.logger.info("policy ratio: ")
                self.logger.info(ratio[0][:10])

                #loss function
                pg_losses = -advantages * ratio
                pg_losses2 = -advantages * torch.clamp(
                    ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)

                # Final PG loss
                pg_loss = weighted_mean(torch.max(pg_losses, pg_losses2),
                                        weights=torch.ones(
                                            1, advantages.shape[1]))

                self.logger.debug("policy mu weights: ")
                self.logger.debug(self.policy.mu.weight)

                sum_adv = torch.sum(advantages_unnorm).numpy()
                self.logger.info("unnormalized advantages: " + str(sum_adv))

                # Total loss
                loss = pg_loss

                self.logger.info("max_action: " + str(np.max(mb_actions)))
                self.logger.info("max_action index: " +
                                 str(np.argmax(mb_actions)))

                # Save the old parameters
                old_params = parameters_to_vector(self.policy.parameters())
                losses.append(loss)

        self.logger.info("inner loss for each mb and epoch: ")
        self.logger.info(mblossvals)
        return torch.mean(torch.stack(losses, dim=0))