Example 1
    def learn_on_batch(self, postprocessed_batch):
        # Set Model to train mode.
        if self.model:
            self.model.train()

        # Tile the RNN input states so there is one state row per timestep
        # (assumes all trajectories have the same length).
        for k, v in postprocessed_batch.items():
            if k.startswith('state_in'):
                postprocessed_batch[k] = np.tile(
                    v, (postprocessed_batch.count // v.shape[0], 1))
        postprocessed_batch.seq_lens = None  # must be removed to use .copy()

        c = 0
        for ep in range(self.ppo_epochs):
            for mb in minibatches(postprocessed_batch, self.minibatch_size):
                c += 1
                # pad batch for rnn
                pad_batch_to_sequences_of_same_size(
                    mb,
                    max_seq_len=self.max_seq_len,
                    shuffle=False,
                    batch_divisibility_req=self.batch_divisibility_req,
                    view_requirements=self.view_requirements,
                )
                mb["is_training"] = True
                # minibatch = mb.copy()
                mb['advantages'] = standardize(mb['advantages'])
                minibatch = self._lazy_tensor_dict(mb)
                # compute the loss
                loss = ppo_surrogate_loss(self, self.model, self.dist_class,
                                          minibatch)
                # compute gradient
                self.optimizer.zero_grad()
                loss.backward()
                # Optionally clip the gradient by global norm.
                if self.config['grad_clip']:
                    grad_norm = nn.utils.clip_grad_norm_(
                        self.model.parameters(), self.config['grad_clip'])
                self.optimizer.step()

        # TODO: move stats collection into the inner loop and average over
        # minibatches instead of computing it once on the full batch.
        stats = kl_and_loss_stats(self, postprocessed_batch)
        return {LEARNER_STATS_KEY: stats}
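
The loop above relies on RLlib's `minibatches` and `standardize` helpers. As a point of reference, here is a minimal NumPy sketch of what such helpers do; the name `simple_minibatches` and the contiguous-slicing scheme are illustrative assumptions, not RLlib's actual implementation.

import numpy as np

def standardize(values):
    # Zero-mean, unit-variance advantages stabilize the PPO ratio objective.
    return (values - values.mean()) / max(1e-4, values.std())

def simple_minibatches(batch, size):
    # Yield contiguous slices of `size` timesteps from a dict of equal-length arrays.
    n = len(next(iter(batch.values())))
    for start in range(0, n, size):
        yield {k: v[start:start + size] for k, v in batch.items()}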
Example 2
def loss(policy, model, dist_class, train_batch):

    # Wrap the standard PPO surrogate loss and let the model add its own
    # auxiliary term via the ModelV2 `custom_loss` hook.
    surrogate_loss = ppo_surrogate_loss(policy, model, dist_class, train_batch)
    return model.custom_loss(surrogate_loss, train_batch)
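
The wrapper above assumes the model implements the ModelV2 `custom_loss(policy_loss, loss_inputs)` hook. Below is a minimal sketch of such a hook; the mixin name and the auxiliary L2 penalty with its 1e-4 weight are illustrative assumptions, not part of the original code.

class AuxLossMixin:
    # Intended to be mixed into a TorchModelV2 subclass, which provides parameters().
    def custom_loss(self, policy_loss, loss_inputs):
        # Add a small L2 penalty on the model parameters to the PPO surrogate loss.
        l2 = sum(p.pow(2).sum() for p in self.parameters())
        return policy_loss + 1e-4 * l2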
Example 3
    def learn_on_batch(self, postprocessed_batch):
        # Set Model to train mode.
        if self.model:
            self.model.train()

        # Per-partner episode returns and concatenated trajectories.
        rew = defaultdict(float)
        traj = {}

        # Tile the RNN input states so there is one state row per timestep
        # (assumes all trajectories have the same length).
        for k, v in postprocessed_batch.items():
            if k.startswith('state_in'):
                postprocessed_batch[k] = np.tile(
                    v, (postprocessed_batch.count // v.shape[0], 1))
        postprocessed_batch.seq_lens = None  # must be removed to use split_by_episode
        # Iterating row-by-row to collect per-partner trajectories is very
        # slow, so group by episode via split_by_episode() instead.
        for i, ep in enumerate(postprocessed_batch.split_by_episode()):
            # Assume a fixed set of partners within one episode.
            partner_id = tuple(ep['partner_id'][0])
            rew[partner_id] += sum(ep['rewards'])
            if partner_id not in traj:
                traj[partner_id] = ep
            else:
                traj[partner_id] = traj[partner_id].concat(ep)

        # Train only on the trajectories collected with the lowest-return partner.
        rew_list = sorted(rew.items(), key=lambda x: x[1])
        lowest_rew_partner = rew_list[0][0]
        train_traj = traj[lowest_rew_partner]
        stats = {'timesteps_used': train_traj.count}
        c = 0
        for ep in range(self.ppo_epochs):
            for mb in minibatches(train_traj, self.minibatch_size):
                c += 1
                # Pad the minibatch into same-length sequences for the RNN.
                pad_batch_to_sequences_of_same_size(
                    mb,
                    max_seq_len=self.max_seq_len,
                    shuffle=False,
                    batch_divisibility_req=self.batch_divisibility_req,
                    view_requirements=self.view_requirements,
                )
                mb["is_training"] = True
                mb['advantages'] = standardize(mb['advantages'])
                minibatch = self._lazy_tensor_dict(mb)
                # compute the loss
                loss = ppo_surrogate_loss(self, self.model, self.dist_class,
                                          minibatch)
                # compute gradient
                self.optimizer.zero_grad()
                loss.backward()
                # Optionally clip the gradient by global norm.
                if self.config['grad_clip']:
                    grad_norm = nn.utils.clip_grad_norm_(
                        self.model.parameters(), self.config['grad_clip'])
                self.optimizer.step()

        # TODO: move stats collection into the inner loop and average over
        # minibatches instead of computing it once on the full batch.
        stats.update(kl_and_loss_stats(self, postprocessed_batch))
        return {LEARNER_STATS_KEY: stats}
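
The core idea of this variant is to group episodes by partner and run the PPO update only on the trajectories collected with the lowest-return partner. Here is a standalone sketch of that selection logic using plain dicts instead of SampleBatch; the episode format and the function name are assumptions for illustration.

from collections import defaultdict

def select_lowest_return_partner(episodes):
    # episodes: iterable of dicts with 'partner_id' and 'rewards' keys.
    returns = defaultdict(float)
    trajs = defaultdict(list)
    for ep in episodes:
        partner = tuple(ep['partner_id'])
        returns[partner] += sum(ep['rewards'])
        trajs[partner].append(ep)
    worst = min(returns, key=returns.get)
    return worst, trajs[worst]

episodes = [
    {'partner_id': [0], 'rewards': [1.0, 2.0]},
    {'partner_id': [1], 'rewards': [0.1, 0.2]},
]
print(select_lowest_return_partner(episodes))  # partner (1,) has the lowest return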
Example 4
    def learn_on_batch(self, train_batch):
        # Set Model to train mode.
        if self.model:
            self.model.train()
        if self.dynamics:
            self.dynamics.train()

        stats = defaultdict(int)
        # First update the skill-dynamics model q(delta_obs | obs, z).
        if self.use_dynamics:
            c = 0
            for ep in range(self.dynamics_epochs):
                for mb in minibatches(train_batch, self.minibatch_size):
                    c += 1
                    mb["is_training"] = True
                    minibatch = self._lazy_tensor_dict(mb)

                    obs = _unpack_obs(minibatch['obs'],
                                      self.model.options['orig_obs_space'],
                                      torch)
                    next_obs = _unpack_obs(
                        minibatch['new_obs'],
                        self.model.options['orig_obs_space'], torch)
                    dynamics_obs = obs['dynamics_obs']
                    next_dynamics_obs = next_obs['dynamics_obs'] - obs[
                        'dynamics_obs']
                    z = obs['z']

                    log_prob = self.dynamics.get_log_prob(dynamics_obs,
                                                          z,
                                                          next_dynamics_obs,
                                                          training=True)
                    dynamics_loss = -torch.mean(log_prob)
                    orth_loss = self.dynamics.orthogonal_regularization()
                    l2_loss = self.dynamics.l2_regularization()
                    if self.config['dynamics_orth_reg']:
                        dynamics_loss += orth_loss
                    if self.config['dynamics_l2_reg'] and not self.config[
                            'dynamics_spectral_norm']:
                        dynamics_loss += l2_loss
                    self.dynamics_opt.zero_grad()
                    dynamics_loss.backward()
                    if self.config['grad_clip']:
                        grad_norm = nn.utils.clip_grad_norm_(
                            self.dynamics.parameters(),
                            self.config['grad_clip'])
                    self.dynamics_opt.step()
                    stats['dynamics_loss'] += dynamics_loss.item()
                    stats['orth_loss'] += orth_loss.item()
                    stats['l2_loss'] += l2_loss.item()
            stats['dynamics_loss'] /= c
            stats['orth_loss'] /= c
            stats['l2_loss'] /= c

            self.dynamics.eval()
            # compute intrinsic reward
            with torch.no_grad():
                batch = self._lazy_tensor_dict(train_batch)
                obs = _unpack_obs(batch['obs'],
                                  self.model.options['orig_obs_space'], torch)
                next_obs = _unpack_obs(batch['new_obs'],
                                       self.model.options['orig_obs_space'],
                                       torch)
                z = obs['z']
                dynamics_obs = obs['dynamics_obs']
                next_dynamics_obs = next_obs['dynamics_obs'] - obs[
                    'dynamics_obs']

                dads_reward, info = self.dynamics.compute_reward(
                    dynamics_obs, z, next_dynamics_obs)
                dads_reward = self.config[
                    'dads_reward_scale'] * dads_reward.numpy()
                # Replace the environment rewards with the intrinsic reward.
                train_batch['rewards'] = dads_reward
                stats['avg_dads_reward'] = dads_reward.mean()
                stats['num_skills_higher_prob'] = info['num_higher_prob']

        # Recompute advantages (GAE) per episode using the updated rewards.
        trajs = train_batch.split_by_episode()
        processed_trajs = []
        for traj in trajs:
            processed_trajs.append(compute_gae_for_sample_batch(self, traj))
        batch = SampleBatch.concat_samples(processed_trajs)

        # Update the agent with PPO over minibatches.
        c = 0
        for ep in range(self.ppo_epochs):
            for mb in minibatches(batch, self.minibatch_size):
                c += 1
                mb["is_training"] = True
                mb['advantages'] = standardize(mb['advantages'])
                minibatch = self._lazy_tensor_dict(mb)
                # compute the loss
                loss_out = ppo_surrogate_loss(self, self.model,
                                              self.dist_class, minibatch)
                # compute gradient
                self.ppo_opt.zero_grad()
                # the learning_rate is already used in ppo_surrogate_loss
                loss_out.backward()
                # Optionally clip the gradient by global norm.
                if self.config['grad_clip']:
                    grad_norm = nn.utils.clip_grad_norm_(
                        self.model.parameters(), self.config['grad_clip'])
                self.ppo_opt.step()
                # log stats
                stats['ppo_loss'] += loss_out.item()
        stats['ppo_loss'] /= c
        # add more info about the loss
        stats.update(kl_and_loss_stats(self, train_batch))

        return {LEARNER_STATS_KEY: stats}
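
The call to `self.dynamics.compute_reward` above is assumed to implement a DADS-style intrinsic reward: roughly the log-density of the observed transition under the current skill minus the log of its average density under a set of alternative skills. A hedged sketch of that computation follows; the function name, signature, and skill-sampling scheme are assumptions.

import math
import torch

def dads_intrinsic_reward(log_prob_fn, obs, skill, delta_obs, alt_skills):
    # log_prob_fn(obs, z, delta_obs) -> log q(delta_obs | obs, z), shape [B]
    log_p = log_prob_fn(obs, skill, delta_obs)
    alt_log_p = torch.stack(
        [log_prob_fn(obs, z, delta_obs) for z in alt_skills])  # [L, B]
    # log of the average density over the L alternative skills: logsumexp - log(L)
    log_avg = torch.logsumexp(alt_log_p, dim=0) - math.log(len(alt_skills))
    return log_p - log_avg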
Example 5
def ppo_loss(policy: Policy, model: ModelV2,
             dist_class: Type[TorchDistributionWrapper],
             train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]:
    """ TODO: Write documentation.
    """
    # Compute original ppo loss
    total_loss = ppo_surrogate_loss(policy, model, dist_class, train_batch)

    # Shallow copy the input batch. Fields should be accessed through the
    # original batch so that accessed keys are properly tracked; this
    # information is used to discard unused components of the policy's
    # view requirements.
    train_batch_copy = train_batch.copy(shallow=True)

    # Extract the mean of the predicted action from the logits. There is no
    # need to perform another model forward pass since the original PPO loss
    # already did one; just retrieve the last output.
    action_logits = model._last_output
    if issubclass(dist_class, TorchDiagGaussian):
        action_mean_true, _ = torch.chunk(action_logits, 2, dim=1)
    else:
        action_dist = dist_class(action_logits, model)
        action_mean_true = action_dist.deterministic_sample()

    if policy.config["caps_temporal_reg"] > 0.0:
        # Compute the mean action corresponding to the previous observation
        observation_prev = train_batch["_prev_obs"]
        train_batch_copy["obs"] = observation_prev
        action_logits_prev, _ = model(train_batch_copy)
        if issubclass(dist_class, TorchDiagGaussian):
            action_mean_prev, _ = torch.chunk(action_logits_prev, 2, dim=1)
        else:
            action_dist_prev = dist_class(action_logits_prev, model)
            action_mean_prev = action_dist_prev.deterministic_sample()

        # Minimize the difference between successive action means
        policy._mean_temporal_caps_loss = torch.mean(
            (action_mean_prev - action_mean_true)**2)

        # Add temporal smoothness loss to total loss
        total_loss += policy.config["caps_temporal_reg"] * \
            policy._mean_temporal_caps_loss

    if policy.config["caps_spatial_reg"] > 0.0 or \
            policy.config["symmetric_policy_reg"] > 0.0:
        # Generate a noisy observation based on the specified sensitivity
        offset = 0
        observation_true = train_batch["obs"]
        observation_noisy = observation_true.clone()
        batch_dim = observation_true.shape[:-1]
        observation_space = policy.observation_space.original_space
        for scale in observation_space.sensitivity.values():
            scale = torch.from_numpy(scale.copy()).to(
                dtype=torch.float32, device=observation_true.device)
            unit_noise = torch.randn((*batch_dim, len(scale)),
                                     device=observation_true.device)
            slice_idx = slice(offset, offset + len(scale))
            observation_noisy[..., slice_idx].addcmul_(scale, unit_noise)
            offset += len(scale)

        # Compute the mean action corresponding to the noisy observation
        train_batch_copy["obs"] = observation_noisy
        action_logits_noisy, _ = model(train_batch_copy)
        if issubclass(dist_class, TorchDiagGaussian):
            action_mean_noisy, _ = torch.chunk(action_logits_noisy, 2, dim=1)
        else:
            action_dist_noisy = dist_class(action_logits_noisy, model)
            action_mean_noisy = action_dist_noisy.deterministic_sample()

    if policy.config["caps_spatial_reg"] > 0.0:
        # Minimize the difference between the original action mean and the
        # one corresponding to the noisy observation.
        policy._mean_spatial_caps_loss = torch.mean(
            (action_mean_noisy - action_mean_true)**2)

        # Add spatial smoothness loss to total loss
        total_loss += policy.config["caps_spatial_reg"] * \
            policy._mean_spatial_caps_loss

    if policy.config["caps_global_reg"] > 0.0:
        # Minimize the magnitude of action mean
        policy._mean_global_caps_loss = torch.mean(action_mean_true**2)

        # Add global smoothness loss to total loss
        total_loss += policy.config["caps_global_reg"] * \
            policy._mean_global_caps_loss

    if policy.config["symmetric_policy_reg"] > 0.0:
        # Compute the mirrored observation
        offset = 0
        observation_mirror = torch.empty_like(observation_true)
        observation_space = policy.observation_space.original_space
        for mirror_mat in observation_space.mirror_mat.values():
            mirror_mat = torch.from_numpy(mirror_mat.T.copy()).to(
                dtype=torch.float32, device=observation_true.device)
            slice_idx = slice(offset, offset + len(mirror_mat))
            torch.mm(observation_true[..., slice_idx],
                     mirror_mat,
                     out=observation_mirror[..., slice_idx])
            offset += len(mirror_mat)

        # Compute the mean action corresponding to the mirrored observation
        train_batch_copy["obs"] = observation_mirror
        action_logits_mirror, _ = model(train_batch_copy)
        if issubclass(dist_class, TorchDiagGaussian):
            action_mean_mirror, _ = torch.chunk(action_logits_mirror, 2, dim=1)
        else:
            action_dist_mirror = dist_class(action_logits_mirror, model)
            action_mean_mirror = action_dist_mirror.deterministic_sample()
        action_mirror_mat = policy.action_space.mirror_mat
        action_mirror_mat = torch.from_numpy(action_mirror_mat.T.copy()).to(
            dtype=torch.float32, device=observation_true.device)
        action_mean_mirror = action_mean_mirror @ action_mirror_mat

        # Minimize the asymmetry of the policy output
        policy._mean_symmetric_policy_loss = torch.mean(
            (action_mean_mirror - action_mean_true)**2)

        # Add policy symmetry loss to total loss
        total_loss += policy.config["symmetric_policy_reg"] * \
            policy._mean_symmetric_policy_loss

    return total_loss
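
For reference, these are the config keys read by this loss together with the quantity each coefficient weights; the numeric values below are illustrative assumptions, not defaults prescribed by the code above.

caps_config = {
    "caps_temporal_reg": 0.01,    # weights E[(mu(s_prev) - mu(s))^2]
    "caps_spatial_reg": 0.05,     # weights E[(mu(s + eps) - mu(s))^2], eps scaled by sensitivity
    "caps_global_reg": 0.001,     # weights E[mu(s)^2]
    "symmetric_policy_reg": 0.1,  # weights E[(M_a mu(M_o s) - mu(s))^2]
}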