Example #1
    def predict_log_prob_batch(self, state, action):
        # evaluate log-probabilities over the whole batch in mini-batches of 32
        data_loader = create_data_loader((state, action),
                                         batch_size=32,
                                         shuffle=False,
                                         drop_last=False)
        log_probs = []
        for obs, ac in data_loader:
            obs = move_tensor_to_gpu(obs)
            ac = move_tensor_to_gpu(ac)
            action_distribution = self.policy_net.forward_action(obs)
            log_probs.append(action_distribution.log_prob(ac))
        log_probs = torch.cat(log_probs, dim=0).cpu().numpy()
        return log_probs
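The helpers create_data_loader and move_tensor_to_gpu appear throughout these examples but are never defined here. A minimal sketch of how they might be implemented, assuming the inputs are numpy arrays or CPU tensors (the implementations below are assumptions, not the original code):

import torch
from torch.utils.data import DataLoader, TensorDataset


def create_data_loader(arrays, batch_size=32, shuffle=False, drop_last=False):
    # Wrap a tuple of arrays/tensors into a DataLoader that yields
    # mini-batches with the same ordering and dtypes as the inputs.
    tensors = [torch.as_tensor(a) for a in arrays]
    dataset = TensorDataset(*tensors)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                      drop_last=drop_last)


def move_tensor_to_gpu(tensor):
    # Move a tensor (or a list/tuple of tensors) to the GPU if one is available.
    if isinstance(tensor, (list, tuple)):
        return type(tensor)(move_tensor_to_gpu(t) for t in tensor)
    return tensor.cuda() if torch.cuda.is_available() else tensor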
Example #2
    def fit(self,
            dataset: StateActionPairDataset,
            epoch=10,
            batch_size=128,
            verbose=False):
        t = range(epoch)
        if verbose:
            t = tqdm(t)

        train_data_loader, val_data_loader = dataset.random_iterator(
            batch_size=batch_size)

        for i in t:
            losses = []
            for history_states, history_actions, states, actions in train_data_loader:
                self.optimizer.zero_grad()
                history_states = move_tensor_to_gpu(history_states)
                history_actions = move_tensor_to_gpu(history_actions)
                states = move_tensor_to_gpu(states)
                actions = move_tensor_to_gpu(actions)

                history_states = (history_states -
                                  self.state_mean) / self.state_std
                states = (states - self.state_mean.squeeze(dim=1)
                          ) / self.state_std.squeeze(dim=1)

                output = self.model.forward(history_states, history_actions,
                                            states)
                loss = self.loss_fn(output, actions)
                loss.backward()
                self.optimizer.step()

                losses.append(loss.item())

            self.eval()
            val_losses = []
            with torch.no_grad():
                for history_states, history_actions, states, actions in val_data_loader:
                    history_states = move_tensor_to_gpu(history_states)
                    history_actions = move_tensor_to_gpu(history_actions)
                    states = move_tensor_to_gpu(states)
                    actions = move_tensor_to_gpu(actions)
                    history_states = (history_states -
                                      self.state_mean) / self.state_std
                    states = (states - self.state_mean.squeeze(dim=1)
                              ) / self.state_std.squeeze(dim=1)
                    output = self.model.forward(history_states,
                                                history_actions, states)
                    loss = self.loss_fn(output, actions)
                    val_losses.append(loss.item())

            self.train()

            if verbose:
                t.set_description(
                    'Epoch {}/{} - Avg policy train loss: {:.4f} - Avg policy val loss: {:.4f}'
                    .format(i + 1, epoch, np.mean(losses),
                            np.mean(val_losses)))
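fit normalizes its inputs with self.state_mean and self.state_std, which are assumed to be precomputed from the dataset. One plausible way to compute them so that the broadcasting above works (the helper name compute_state_statistics and the (num_samples, history_length, state_dim) layout are assumptions):

import torch


def compute_state_statistics(states, eps=1e-8):
    # states: (num_samples, history_length, state_dim). Reducing over the
    # first two axes with keepdim=True gives statistics of shape
    # (1, 1, state_dim), so they broadcast against history_states directly
    # and against (batch_size, state_dim) states after squeeze(dim=1).
    state_mean = states.mean(dim=(0, 1), keepdim=True)
    state_std = states.std(dim=(0, 1), keepdim=True) + eps
    return state_mean, state_std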
Example #3
    def compute_old_log_prob(self, observation, hidden, actions):
        with torch.no_grad():
            data_loader = create_data_loader((observation, hidden, actions),
                                             batch_size=32,
                                             shuffle=False,
                                             drop_last=False)
            old_log_prob = []
            for obs, hid, ac in data_loader:
                obs = move_tensor_to_gpu(obs)
                hid = move_tensor_to_gpu(hid)
                ac = move_tensor_to_gpu(ac)
                old_distribution, _, _ = self.policy_net.forward(obs, hid)
                old_log_prob.append(old_distribution.log_prob(ac))

            old_log_prob = torch.cat(old_log_prob, dim=0).cpu()
        return old_log_prob
Example #4
    def fit_dynamic_model(self,
                          dataset: Dataset,
                          epoch=10,
                          batch_size=128,
                          verbose=False):
        t = range(epoch)
        if verbose:
            t = tqdm(t)

        train_data_loader, val_data_loader = dataset.random_iterator(
            batch_size=batch_size)

        for i in t:
            losses = []
            for states, actions, next_states, _, _ in train_data_loader:
                # convert to tensor
                states = move_tensor_to_gpu(states)
                actions = move_tensor_to_gpu(actions)
                next_states = move_tensor_to_gpu(next_states)
                delta_states = next_states - states
                # calculate loss
                self.optimizer.zero_grad()
                predicted_delta_state_normalized = self.predict_normalized_delta_next_state(
                    states, actions)
                delta_states_normalized = normalize(delta_states,
                                                    self.delta_state_mean,
                                                    self.delta_state_std)
                loss = F.mse_loss(predicted_delta_state_normalized,
                                  delta_states_normalized)
                loss.backward()
                self.optimizer.step()
                losses.append(loss.item())

            self.eval()
            val_losses = []
            with torch.no_grad():
                for states, actions, next_states, _, _ in val_data_loader:
                    # convert to tensor
                    states = move_tensor_to_gpu(states)
                    actions = move_tensor_to_gpu(actions)
                    next_states = move_tensor_to_gpu(next_states)
                    delta_states = next_states - states
                    predicted_delta_state_normalized = self.predict_normalized_delta_next_state(
                        states, actions)
                    delta_states_normalized = normalize(
                        delta_states, self.delta_state_mean,
                        self.delta_state_std)
                    loss = F.mse_loss(predicted_delta_state_normalized,
                                      delta_states_normalized)
                    val_losses.append(loss.item())
            self.train()

            if verbose:
                t.set_description(
                    'Epoch {}/{} - Avg model train loss: {:.4f} - Avg model val loss: {:.4f}'
                    .format(i + 1, epoch, np.mean(losses),
                            np.mean(val_losses)))
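Examples #4 and #8 depend on a normalize helper for the state deltas and rewards. A minimal sketch of the assumed behavior, together with the inverse one would need to map predictions back to the original scale (unnormalize is likewise not shown in the originals):

def normalize(x, mean, std, eps=1e-8):
    # Standardize x with precomputed statistics; eps guards against
    # division by zero on constant dimensions.
    return (x - mean) / (std + eps)


def unnormalize(x, mean, std):
    # Invert normalize() to recover values in the original scale.
    return x * std + mean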
Example #5
    def fit_dynamic_model(self,
                          dataset,
                          epoch=10,
                          batch_size=128,
                          verbose=False):
        t = range(epoch)
        if verbose:
            t = tqdm(t)

        train_data_loader, val_data_loader = dataset.random_iterator(
            batch_size=batch_size)

        for i in t:
            losses = []
            for states, actions, next_states, _, _ in train_data_loader:
                # convert to tensor
                states = move_tensor_to_gpu(states)
                actions = move_tensor_to_gpu(actions)
                next_states = move_tensor_to_gpu(next_states)

                latent_distribution = self.inference_network.forward(
                    next_states)

                z = latent_distribution.sample()
Example #6
    def update_policy(self, data_loader, epoch, logger):
        for epoch_index in range(epoch):
            for batch_sample in data_loader:

                # with torch.autograd.detect_anomaly():

                observation, action, discount_rewards, advantage, old_log_prob = move_tensor_to_gpu(
                    batch_sample)
                self.policy_optimizer.zero_grad()
                # update policy
                distribution, raw_baselines = self.policy_net.forward(
                    observation)
                entropy_loss = distribution.entropy().mean()
                log_prob = distribution.log_prob(action)

                assert log_prob.shape == advantage.shape, 'log_prob length {}, advantage length {}'.format(
                    log_prob.shape, advantage.shape)

                # if approximated kl is larger than 1.5 target_kl, we early stop training of this batch
                negative_approx_kl = log_prob - old_log_prob
                negative_approx_kl_mean = torch.mean(-negative_approx_kl)

                ratio = torch.exp(negative_approx_kl)
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                    1.0 + self.clip_param) * advantage
                policy_loss = -torch.min(surr1, surr2).mean()
                value_loss = self.baseline_loss(raw_baselines,
                                                discount_rewards)
                loss = policy_loss - entropy_loss * self.entropy_coef + value_loss * self.value_coef

                if negative_approx_kl_mean <= 1.5 * self.target_kl:
                    # print('Early stopping this iteration. Current kl {:.4f}. Current epoch index {}'.format(
                    #     negative_approx_kl_mean, epoch_index))
                    loss.backward()
                    # clip gradients after backward() and before the optimizer step
                    nn.utils.clip_grad_norm_(self.policy_net.parameters(),
                                             self.max_grad_norm)
                    self.policy_optimizer.step()

                logger.store(PolicyLoss=policy_loss.item())
                logger.store(ValueLoss=value_loss.item())
                logger.store(EntropyLoss=entropy_loss.item())
                logger.store(NegativeAvgKL=negative_approx_kl_mean.item())
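The policy update in Examples #6 and #13 is the PPO clipped surrogate objective. A standalone sketch of just that computation, with illustrative argument names:

import torch


def ppo_clipped_policy_loss(log_prob, old_log_prob, advantage, clip_param=0.2):
    # ratio = pi_theta(a|s) / pi_theta_old(a|s), computed in log space for stability
    ratio = torch.exp(log_prob - old_log_prob)
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
    # pessimistic (clipped) objective, negated because the optimizer minimizes
    return -torch.min(surr1, surr2).mean()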
Example #7
    def predict_state_value_batch(self, state):
        """ compute the state value using nn baseline

        Args:
            state: (batch_size, ob_dim)

        Returns: (batch_size,)

        """
        data_loader = create_data_loader((state, ),
                                         batch_size=32,
                                         shuffle=False,
                                         drop_last=False)
        values = []
        for obs in data_loader:
            obs = move_tensor_to_gpu(obs[0])
            values.append(self.policy_net.forward_value(obs))
        values = torch.cat(values, dim=0).cpu().numpy()
        return values
Example #8
    def fit_dynamic_model(self,
                          dataset,
                          epoch=10,
                          batch_size=128,
                          logger=None):
        t = tqdm(range(epoch))

        train_data_loader, val_data_loader = dataset.random_iterator(
            batch_size=batch_size)

        for i in t:
            losses = []
            for states, actions, next_states, rewards, _ in train_data_loader:

                # in training, skip any batch of size 1, since it would crash the batch_norm layers
                if states.shape[0] == 1:
                    continue

                # convert to tensor
                states = move_tensor_to_gpu(states)
                actions = move_tensor_to_gpu(actions)
                next_states = move_tensor_to_gpu(next_states)
                rewards = move_tensor_to_gpu(rewards)
                delta_states = next_states - states
                # calculate loss
                self.optimizer.zero_grad()
                predicted_delta_state_normalized, predicted_reward_normalized = \
                    self.predict_normalized_delta_next_state_reward(states, actions)
                delta_states_normalized = normalize(delta_states,
                                                    self.delta_state_mean,
                                                    self.delta_state_std)
                loss = F.mse_loss(predicted_delta_state_normalized,
                                  delta_states_normalized)
                if self.cost_fn_batch is None:
                    rewards_normalized = normalize(rewards, self.reward_mean,
                                                   self.reward_std)
                    loss += F.mse_loss(predicted_reward_normalized,
                                       rewards_normalized)
                loss.backward()
                self.optimizer.step()
                losses.append(loss.item())

            self.eval()
            val_losses = []
            with torch.no_grad():
                for states, actions, next_states, rewards, _ in val_data_loader:
                    # convert to tensor
                    states = move_tensor_to_gpu(states)
                    actions = move_tensor_to_gpu(actions)
                    next_states = move_tensor_to_gpu(next_states)
                    rewards = move_tensor_to_gpu(rewards)
                    delta_states = next_states - states
                    predicted_delta_state_normalized, predicted_reward_normalized = \
                        self.predict_normalized_delta_next_state_reward(states, actions)
                    delta_states_normalized = normalize(
                        delta_states, self.delta_state_mean,
                        self.delta_state_std)
                    loss = F.mse_loss(predicted_delta_state_normalized,
                                      delta_states_normalized)
                    if self.cost_fn_batch is None:
                        rewards_normalized = normalize(rewards,
                                                       self.reward_mean,
                                                       self.reward_std)
                        loss += F.mse_loss(predicted_reward_normalized,
                                           rewards_normalized)
                    val_losses.append(loss.item())
            self.train()

            if logger:
                logger.store(ModelTrainLoss=np.mean(losses))
                logger.store(ModelValLoss=np.mean(val_losses))

            t.set_description(
                'Epoch {}/{} - Avg model train loss: {:.4f} - Avg model val loss: {:.4f}'
                .format(i + 1, epoch, np.mean(losses), np.mean(val_losses)))
Example #9
    def train(self,
              num_epoch,
              train_data_loader,
              checkpoint_path=None,
              epoch_per_save=5,
              callbacks=(),
              summary_writer: SummaryWriter = None,
              verbose=True):
        n_iter = 0
        for epoch in range(num_epoch):
            self._set_to_train()
            negative_log_likelihood_train = 0.
            kl_divergence_train = 0.

            if verbose:
                t = tqdm(train_data_loader,
                         desc='Epoch {}/{}'.format(epoch + 1, num_epoch))
            else:
                t = train_data_loader

            for data_batch in t:
                input = data_batch[0]
                self.optimizer.zero_grad()
                input = move_tensor_to_gpu(input)
                latent_distribution = self.encode(input)
                z = latent_distribution.rsample()
                out = self.decode_distribution(z)

                negative_log_likelihood = -out.log_prob(input).sum()
                kl_divergence = torch.distributions.kl_divergence(
                    latent_distribution, self.prior).sum()

                loss = negative_log_likelihood + kl_divergence
                loss.backward()
                self.optimizer.step()

                negative_log_likelihood_train += negative_log_likelihood.item()
                kl_divergence_train += kl_divergence.item()

                if summary_writer:
                    summary_writer.add_scalar('data/nll',
                                              negative_log_likelihood.item(),
                                              n_iter)
                    summary_writer.add_scalar('data/kld', kl_divergence.item(),
                                              n_iter)

                n_iter += 1

            if verbose:
                num_dimensions = np.prod(
                    list(train_data_loader.dataset[0][0].shape))
                negative_log_likelihood_train /= len(train_data_loader.dataset)
                negative_log_likelihood_train_bits_per_dim = log_to_log2(
                    negative_log_likelihood_train / num_dimensions)
                kl_divergence_train /= len(train_data_loader.dataset)
                kl_divergence_train_bits_per_dim = log_to_log2(
                    kl_divergence_train / num_dimensions)
                total_loss = negative_log_likelihood_train + kl_divergence_train
                total_loss_bits_per_dim = log_to_log2(total_loss /
                                                      num_dimensions)

                total_loss_message = 'Total loss {:.4f}/{:.4f} (bits/dim)'.format(
                    total_loss, total_loss_bits_per_dim)
                nll_message = 'Negative log likelihood {:.4f}/{:.4f} (bits/dim)'.format(
                    negative_log_likelihood_train,
                    negative_log_likelihood_train_bits_per_dim)
                kl_message = 'KL divergence {:.4f}/{:.4f} (bits/dim)'.format(
                    kl_divergence_train, kl_divergence_train_bits_per_dim)

                print(' - '.join([total_loss_message, nll_message,
                                  kl_message]))

            if checkpoint_path is not None and (epoch +
                                                1) % epoch_per_save == 0:
                self.save_checkpoint(checkpoint_path)

            if summary_writer:
                for callback in callbacks:
                    callback(epoch, self, summary_writer)
        if checkpoint_path is not None:
            self.save_checkpoint(checkpoint_path)
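The bits/dim figures reported above rely on log_to_log2 to convert per-dimension losses from nats to bits; a one-line sketch of the assumed conversion:

import numpy as np


def log_to_log2(x):
    # values in nats divided by ln(2) give values in bits
    return x / np.log(2)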
Example #10
    def train_on_data_loader(self, train_data_loader, verbose=True):
        """ Train on data_loader for one epoch

        Args:
            train_data_loader: training data loader
            verbose: whether to show a progress bar with per-batch statistics

        Returns: training loss list at each step

        """
        self.model.train()
        if verbose:
            t = tqdm(train_data_loader)
        else:
            t = train_data_loader
        train_loss = []
        for data_label in t:
            data, labels = data_label
            data = move_tensor_to_gpu(data)
            labels = move_tensor_to_gpu(labels)

            if not isinstance(labels, list):
                labels = [labels]

            self.optimizer.zero_grad()

            # for compatibility with singular data and labels
            if isinstance(data, list):
                outputs = self.model(*data)
            else:
                outputs = self.model(data)

            if not isinstance(outputs, tuple):
                outputs = [outputs]

            current_loss = []

            for j in range(len(outputs)):
                loss = self.loss[j](outputs[j], labels[j])
                if self.loss_weights is not None:
                    loss = loss * self.loss_weights[j]
                current_loss.append(loss)

            loss = sum(current_loss)

            loss.backward()
            self.optimizer.step()
            # gather training statistics
            if verbose:
                stats_str = []
                stats_str.append('Train loss: {:.4f}'.format(loss.item()))

                stats = self._compute_metrics(outputs, labels)
                for i, stat in enumerate(stats):
                    for metric, result in stat.items():
                        stats_str.append('Output {} {}: {:.4f}'.format(i, metric, result))

                training_description = " - ".join(stats_str)
                # set log for each batch
                t.set_description(training_description)

            train_loss.append(loss.item())
        return train_loss
Example #11
    def evaluate(self, data_loader, desc=None):
        self.model.eval()
        with torch.no_grad():
            total_loss = 0.0
            total = 0

            all_outputs = []
            all_labels = []

            for data_label in tqdm(data_loader, desc=desc):
                data, labels = data_label
                data = move_tensor_to_gpu(data)
                labels = move_tensor_to_gpu(labels)

                if not isinstance(labels, list):
                    labels = [labels]

                if len(all_labels) == 0:
                    for label in labels:
                        all_labels.append([label])
                else:
                    for i, label in enumerate(labels):
                        all_labels[i].append(label)

                if isinstance(data, list):
                    outputs = self.model(*data)
                else:
                    outputs = self.model(data)

                if not isinstance(outputs, tuple):
                    outputs = [outputs]

                if len(all_outputs) == 0:
                    for output in outputs:
                        all_outputs.append([output])
                else:
                    for i, output in enumerate(outputs):
                        all_outputs[i].append(output)

                current_loss = []

                for j in range(len(outputs)):
                    loss = self.loss[j](outputs[j], labels[j])
                    if self.loss_weights is not None:
                        loss = loss * self.loss_weights[j]
                    current_loss.append(loss)

                loss = sum(current_loss)

                # calculate stats
                total_loss += loss.item() * labels[0].size(0)
                total += labels[0].size(0)

            for i, output in enumerate(all_outputs):
                all_outputs[i] = torch.cat(output, dim=0)

            for i, label in enumerate(all_labels):
                all_labels[i] = torch.cat(label, dim=0)

            loss = total_loss / total
            stats = self._compute_metrics(all_outputs, all_labels)

            return loss, stats
Example #12
def compute_reward_to_go_gae(paths, gamma, policy_net, lam, value_mean,
                             value_std):
    rewards = []
    gaes = []
    for path in paths:
        # compute last state value
        if path['mask'][-1] == 1:
            with torch.no_grad():
                last_obs = convert_numpy_to_tensor(
                    np.expand_dims(path['last_obs'], axis=0)).type(FloatTensor)
                last_hidden = convert_numpy_to_tensor(
                    np.expand_dims(path['last_hidden'],
                                   axis=0)).type(FloatTensor)
                last_state_value = policy_net.forward(
                    last_obs, last_hidden)[-1].cpu().numpy()[0]
                last_state_value = last_state_value * value_std + value_mean
        else:
            last_state_value = 0.

        # clip last_state_value to within +/- max_abs_value / (1 - gamma).
        # Otherwise, a large bootstrapped value creates a positive feedback loop and makes the reward-to-go explode.
        max_abs_value = np.max(np.abs(path['reward']))
        last_state_value = np.clip(last_state_value,
                                   a_min=-max_abs_value / (1 - gamma),
                                   a_max=max_abs_value / (1 - gamma))

        # calculate reward-to-go
        path['reward'].append(last_state_value)
        current_rewards = discount(path['reward'], gamma).astype(np.float32)

        rewards.append(current_rewards[:-1])

        # compute gae
        with torch.no_grad():
            observation = path['observation']
            hidden = path['hidden']
            data_loader = create_data_loader((observation, hidden),
                                             batch_size=32,
                                             shuffle=False,
                                             drop_last=False)
            values = []
            for obs, hid in data_loader:
                obs = move_tensor_to_gpu(obs)
                hid = move_tensor_to_gpu(hid)
                values.append(policy_net.forward(obs, hid)[-1])
            values = torch.cat(values, dim=0).cpu().numpy()
            values = values * value_std + value_mean
            values = np.append(values, last_state_value)

        # one-step temporal differences; values already includes the
        # bootstrapped value of the last observation for truncated trajectories
        temporal_difference = path[
            'reward'][:-1] + values[1:] * gamma - values[:-1]
        # GAE is the discounted (gamma * lam) cumulative sum of the temporal differences
        gae = discount(temporal_difference, gamma * lam).astype(np.float32)
        gaes.append(gae)

    rewards = np.concatenate(rewards)
    new_values_mean, new_values_std = np.mean(rewards), np.std(rewards)
    rewards = (rewards - new_values_mean) / (new_values_std + eps)

    gaes = np.concatenate(gaes)
    gaes = (gaes - np.mean(gaes)) / (np.std(gaes) + eps)

    return rewards, gaes, new_values_mean, new_values_std
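Both the reward-to-go and GAE computations above use a discount helper for discounted cumulative sums along a trajectory. A common implementation, assumed here, uses scipy.signal.lfilter:

import numpy as np
import scipy.signal


def discount(x, gamma):
    # out[t] = sum_{k >= t} gamma**(k - t) * x[k]
    x = np.asarray(x, dtype=np.float64)
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1])[::-1]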
Example #13
    def update_policy(self, dataset, epoch=4):
        # construct a dataset using paths containing (action, observation, old_log_prob)
        if self.recurrent:
            data_loader = create_data_loader(dataset,
                                             batch_size=128,
                                             shuffle=False,
                                             drop_last=False)
        else:
            data_loader = create_data_loader(dataset,
                                             batch_size=128,
                                             shuffle=True,
                                             drop_last=False)

        for epoch_index in range(epoch):
            current_hidden = torch.tensor(
                np.expand_dims(self.init_hidden_unit, axis=0),
                requires_grad=False).type(FloatTensor)
            for batch_sample in data_loader:
                action, advantage, observation, discount_rewards, old_log_prob, mask = \
                    move_tensor_to_gpu(batch_sample)

                self.policy_optimizer.zero_grad()
                # update policy
                if not self.recurrent:
                    distribution, _, raw_baselines = self.policy_net.forward(
                        observation, None)
                    entropy_loss = distribution.entropy().mean()
                    log_prob = distribution.log_prob(action)
                else:
                    entropy_loss = []
                    log_prob = []
                    raw_baselines = []
                    zero_index = np.where(mask == 0)[0] + 1
                    zero_index = zero_index.tolist()
                    zero_index.insert(0, 0)

                    for i in range(len(zero_index) - 1):
                        start_index = zero_index[i]
                        end_index = zero_index[i + 1]
                        current_obs = observation[start_index:end_index]
                        current_actions = action[start_index:end_index]
                        current_dist, _, current_baseline = self.policy_net.forward(
                            current_obs, current_hidden)
                        current_hidden = torch.tensor(
                            np.expand_dims(self.init_hidden_unit, axis=0),
                            requires_grad=False).type(FloatTensor)
                        current_log_prob = current_dist.log_prob(
                            current_actions)

                        log_prob.append(current_log_prob)
                        raw_baselines.append(current_baseline)
                        entropy_loss.append(current_dist.entropy())

                    # last iteration
                    start_index = zero_index[-1]
                    if start_index < observation.shape[0]:
                        current_obs = observation[start_index:]
                        current_actions = action[start_index:]
                        current_dist, current_hidden, current_baseline = self.policy_net.forward(
                            current_obs, current_hidden)

                        current_log_prob = current_dist.log_prob(
                            current_actions)

                        log_prob.append(current_log_prob)
                        raw_baselines.append(current_baseline)
                        entropy_loss.append(current_dist.entropy())
                        current_hidden = current_hidden.detach()

                    log_prob = torch.cat(log_prob, dim=0)
                    raw_baselines = torch.cat(raw_baselines, dim=0)
                    entropy_loss = torch.cat(entropy_loss, dim=0).mean()

                assert log_prob.shape == advantage.shape, 'log_prob length {}, advantage length {}'.format(
                    log_prob.shape, advantage.shape)

                # if approximated kl is larger than 1.5 target_kl, we early stop training of this batch
                negative_approx_kl = log_prob - old_log_prob

                negative_approx_kl_mean = torch.mean(
                    -negative_approx_kl).item()

                if negative_approx_kl_mean > 1.5 * self.target_kl:
                    # print('Early stopping this iteration. Current kl {:.4f}. Current epoch index {}'.format(
                    #     negative_approx_kl_mean, epoch_index))
                    continue

                ratio = torch.exp(negative_approx_kl)
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                    1.0 + self.clip_param) * advantage
                policy_loss = -torch.min(surr1, surr2).mean()

                value_loss = self.get_baseline_loss(raw_baselines,
                                                    discount_rewards)

                loss = policy_loss - entropy_loss * self.entropy_coef + self.value_coef * value_loss

                loss.backward()
                # clip gradients after backward() and before the optimizer step
                nn.utils.clip_grad_norm_(self.policy_net.parameters(),
                                         self.max_grad_norm)
                self.policy_optimizer.step()