Example #1
    def __call__(self, pred, device='cpu'):
        # NOTE: the incoming `pred` is immediately overwritten with random
        # values, so the argument is effectively ignored; this looks like a
        # leftover debugging placeholder.
        pred = torch.rand(size=(3, 5, 4))  # (agents, states, actions)
        # Choose a random agent index for every (state, action) cell.
        idx = torch.randint(low=0, high=pred.shape[0],
                            size=pred.shape[1:]).to(pred.device)
        idx_ohe = one_hot_encoding(idx,
                                   n_categories=pred.shape[0],
                                   unsqueeze=True)
        # Broadcast the one-hot mask over the agent dimension and reduce,
        # keeping exactly one agent's prediction per cell.
        return (pred * torch.permute(idx_ohe, [2, 0, 1])).sum(0)
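The one-hot mask built above selects, for every (state, action) cell, the prediction of one randomly chosen ensemble member. A minimal self-contained sketch of the same selection trick, with torch.nn.functional.one_hot standing in for pymatch's one_hot_encoding (an assumption made purely for illustration):

import torch
import torch.nn.functional as F

pred = torch.rand(3, 5, 4)                             # (agents, states, actions)
idx = torch.randint(0, pred.shape[0], pred.shape[1:])  # random agent per (state, action) cell

mask = F.one_hot(idx, num_classes=pred.shape[0])       # (states, actions, agents)
selected = (pred * mask.permute(2, 0, 1)).sum(0)       # (states, actions)

# Same result via gather: pick pred[idx[s, a], s, a] for every cell.
assert torch.allclose(selected, pred.gather(0, idx.unsqueeze(0)).squeeze(0))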
Example #2
    def fit_epoch(self, device, verbose=1):
        self.model.train()
        self.model.to(device)

        losses = []

        for batch, (action, state, reward, new_state,
                    terminal) in tqdm(enumerate(self.train_loader)):
            action, state, reward, new_state = (
                action.to(self.device), state.to(self.device),
                reward.to(self.device), new_state.to(self.device))
            prediction = self.model(state.squeeze(1))
            target = prediction.clone().detach()
            max_next = self.get_max_Q_for_states(new_state)

            mask = one_hot_encoding(action,
                                    n_categories=self.env.action_space.n).type(
                                        torch.BoolTensor).to(self.device)
            target[mask] = (1 - self.alpha) * target[mask] + self.alpha * (
                reward.view(-1) + self.gamma * max_next *
                (1 - terminal.view(-1).type(torch.FloatTensor)).to(self.device)
            )

            loss = self.crit(prediction, target)
            losses += [loss.item()]
            self._backward(loss)

        self.train_dict['train_losses'] += [np.mean(losses).item()]

        # This uses the DQL loss defined in 'Asynchronous Methods for Deep Reinforcement Learning' by Mnih et al.,
        # but it is not working yet:
        # for batch, (action, state, reward, new_state, terminal) in tqdm(enumerate(self.train_loader)):
        #     action, state, reward, new_state = action.to(self.device), state.to(self.device), reward.to(
        #         self.device), new_state.to(self.device)
        #     prediction = self.model(state.squeeze(1))
        #     max_next = self.get_max_Q_for_states(new_state)
        #     mask = one_hot_encoding(action, n_categories=self.env.action_space.n).type(torch.BoolTensor)
        #
        #     loss = self.crit(gamma=self.gamma, pred=prediction[mask], max_next=max_next, reward=reward)
        #     self.train_dict['train_losses'] += [loss.item()]
        #     self._backward(loss)

        if verbose == 1:
            print(
                f'epoch: {self.train_dict["epochs_run"]}\t'
                f'average reward: {np.mean(self.train_dict["rewards"]):.2f}\t'
                f'last loss: {self.train_dict["train_losses"][-1]:.2f}\t'
                f'latest average reward: {self.train_dict.get("avg_reward", [np.nan])[-1]:.2f}'
            )
        return loss
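The target construction in the loop above is a soft Q-learning update: the Q-value of the taken action is moved a step of size alpha toward the one-step bootstrap r + gamma * max_a' Q(s', a'), with the bootstrap suppressed for terminal transitions. A standalone sketch of just that step (the function name, the default alpha/gamma and the toy shapes are assumptions for illustration, not values from the source):

import torch


def soft_q_target(q_pred, action_mask, reward, max_next_q, terminal,
                  alpha=0.1, gamma=0.99):
    # target_a = (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') * (1 - done))
    target = q_pred.clone().detach()
    bootstrap = reward + gamma * max_next_q * (1.0 - terminal.float())
    target[action_mask] = (1 - alpha) * target[action_mask] + alpha * bootstrap
    return target


# Toy usage: batch of 2 transitions, 3 actions.
q_pred = torch.rand(2, 3)
action_mask = torch.tensor([[1, 0, 0], [0, 0, 1]], dtype=torch.bool)
target = soft_q_target(q_pred, action_mask,
                       reward=torch.tensor([1.0, 0.0]),
                       max_next_q=torch.tensor([0.5, 0.2]),
                       terminal=torch.tensor([0, 1]))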
Example #3
    def fit_epoch(self, device, verbose=1):
        self.model.train()
        self.model.to(device)
        self.target_model.to(device)

        for batch, (action, state, reward, next_state, next_action,
                    terminal) in tqdm(enumerate(self.train_loader)):
            action, state, reward, next_state = (
                action.to(self.device), state.to(self.device),
                reward.to(self.device), next_state.to(self.device))
            prediction = self.model(state.squeeze(1))
            next_action = one_hot_encoding(next_action).to(self.device)
            with eval_mode(self):  # @todo this is not working with DDQN so far
                next_Q = (self.target_model(next_state.squeeze(1)) *
                          next_action).sum(1)
            target = prediction.clone().detach()

            mask = one_hot_encoding(action).type(torch.BoolTensor).to(self.device)
            target[mask] = (1 - self.alpha) * target[mask] + self.alpha * (
                reward + self.gamma * next_Q *
                (1. - terminal.type(torch.FloatTensor)).to(self.device))

            loss = self.crit(prediction, target)
            self.train_dict['train_losses'] += [loss.item()]
            self._backward(loss)

        self.update_target_network()

        if verbose == 1:
            print(
                f'epoch: {self.train_dict["epochs_run"]}\t'
                f'average reward: {np.mean(self.train_dict["rewards"]):.2f}\t'
                f'latest average reward: {self.train_dict["avg_reward"][-1]:.2f}'
            )

        return loss
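update_target_network is not shown in this snippet. A hedged sketch of the two variants such a method commonly implements, a hard copy of the online weights and a Polyak (soft) update; the function names and tau below are assumptions for illustration:

import torch


def hard_update(model: torch.nn.Module, target_model: torch.nn.Module) -> None:
    # Copy the online network's weights into the target network wholesale.
    target_model.load_state_dict(model.state_dict())


def soft_update(model: torch.nn.Module, target_model: torch.nn.Module,
                tau: float = 0.01) -> None:
    # Polyak averaging: target <- (1 - tau) * target + tau * online.
    with torch.no_grad():
        for p, tp in zip(model.parameters(), target_model.parameters()):
            tp.mul_(1.0 - tau).add_(tau * p)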
Example #4
    def __call__(self, pred, device='cpu'):
        """

        Args:
            pred:       probability distribution over classes by each model of the ensemble
            device:     not used

        Returns:
            torch tensor of entropy for label distribution over the models
        """
        actions = pred.max(-1)[1]
        ohe_actions = one_hot_encoding(actions, n_categories=4, unsqueeze=True)
        action_dist = ohe_actions.mean(0)
        action_entropy = entropy(torch.transpose(action_dist, 0, 1))

        return (super(EntropyHat, self).__call__(pred),
                torch.tensor(action_entropy).view(-1, 1))
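A small standalone version of the vote-entropy computation above, run on dummy ensemble output; the shapes, the scipy.stats.entropy import and the use of torch.nn.functional.one_hot in place of one_hot_encoding are assumptions for illustration:

import torch
import torch.nn.functional as F
from scipy.stats import entropy

pred = torch.rand(5, 7, 4)                          # (models, samples, actions)
actions = pred.max(-1)[1]                           # (models, samples): each model's argmax action
votes = F.one_hot(actions, num_classes=4).float()   # (models, samples, actions)
action_dist = votes.mean(0)                         # (samples, actions): vote share per sample
# scipy.stats.entropy reduces over axis 0, hence the transpose.
action_entropy = entropy(action_dist.T.numpy())     # (samples,): ensemble disagreement per sample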
Example #5
    def fit_epoch(self, device, verbose=1):
        self.model.train()
        self.model.to(device)

        losses = []

        for batch, (action, state, reward, new_state,
                    terminal) in tqdm(enumerate(self.train_loader)):
            action, state, reward, new_state = (
                action.to(self.device), state.to(self.device),
                reward.to(self.device), new_state.to(self.device))
            values, advantage = self.model(state.squeeze(1))
            prediction = values + advantage
            target = prediction.clone().detach()
            max_next = self.get_max_Q_for_states(new_state)

            mask = one_hot_encoding(action,
                                    n_categories=self.env.action_space.n).type(
                                        torch.BoolTensor).to(self.device)
            target[mask] = (1 - self.alpha) * target[mask] + self.alpha * (
                reward.view(-1) + self.gamma * max_next *
                (1 - terminal.view(-1).type(torch.FloatTensor)).to(self.device)
            )

            loss = self.crit(prediction, target)
            losses += [loss.item()]
            self._backward(loss)

        self.train_dict['train_losses'] += [np.mean(losses).item()]

        if verbose == 1:
            print(
                f'epoch: {self.train_dict["epochs_run"]}\t'
                f'average reward: {np.mean(self.train_dict["rewards"]):.2f}\t'
                f'last loss: {self.train_dict["train_losses"][-1]:.2f}\t'
                f'latest average reward: {self.train_dict.get("avg_reward", [np.nan])[-1]:.2f}'
            )
        return loss
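The dueling head above is combined by plain addition of values and advantage. For comparison, the aggregation proposed by Wang et al. for dueling DQN centres the advantage so that V and A remain identifiable; a short sketch with assumed shapes:

import torch


def dueling_q_values(values: torch.Tensor, advantage: torch.Tensor) -> torch.Tensor:
    # Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')
    return values + advantage - advantage.mean(dim=-1, keepdim=True)


values = torch.rand(2, 1)       # (batch, 1)
advantage = torch.rand(2, 4)    # (batch, actions)
q = dueling_q_values(values, advantage)   # (batch, actions)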
Example #6
                'deter_val_reward_std': np.std
            },
            frequency=1,
            epoch_name='det_val_epoch'),
        rcb.EnvironmentEvaluator(
            env=TorchGym(**params['factory_args']['env_args']),
            n_evaluations=10,
            action_selector=sp.QActionSelection(temperature=params['temp'],
                                                post_pipeline=[EnsembleHat()]),
            metrics={
                'prob_val_reward_mean': np.mean,
                'prob_val_reward_std': np.std
            },
            frequency=1,
            epoch_name='prob_val_epoch'),
        rcb.EnsembleRewardPlotter(
            metrics={
                'det_val_reward_mean': 'det_val_epoch',
                'prob_val_reward_mean': 'prob_val_epoch',
            }),
    ])

learner.fit(**params['fit'])

pred = learner(memory.memory['state'])
pred.shape  # notebook-style inspection of the ensemble output shape
actions = pred.max(-1)[1]
ohe_actions = one_hot_encoding(actions, n_categories=4, unsqueeze=True)
action_probs = ohe_actions.mean(0)
action_entropy = entropy(torch.transpose(action_probs, 0, 1))
pred[:, 0]  # notebook-style inspection: presumably every model's prediction for the first stored state
Example #7
from pymatch.utils.functional import one_hot_encoding
import torch


# Without n_categories the number of classes is inferred from the input.
test_array = torch.tensor([0, 1, 2, 3])
encoding = one_hot_encoding(test_array)
assert (torch.eye(4) == encoding).type(torch.float).mean() == 1.

# Passing n_categories explicitly gives the same encoding.
test_array = torch.tensor([0, 1, 2, 3])
encoding = one_hot_encoding(test_array, n_categories=4)
assert (torch.eye(4) == encoding).type(torch.float).mean() == 1.

# With a repeated index the last row encodes category 2, not 3.
test_array = torch.tensor([0, 1, 2, 2])
encoding = one_hot_encoding(test_array, n_categories=4)
true_values = torch.eye(4)
true_values[-1, -2:] = torch.tensor([1, 0])
assert (true_values == encoding).type(torch.float).mean() == 1.
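These tests pin down the behaviour of one_hot_encoding for 1-D index tensors. A hedged sketch of an implementation consistent with them, not the actual pymatch code:

import torch


def one_hot_encoding_sketch(index: torch.Tensor, n_categories: int = None) -> torch.Tensor:
    # Default the number of categories to max(index) + 1, then pick rows of the
    # identity matrix, giving a float tensor of shape (*index.shape, n_categories).
    if n_categories is None:
        n_categories = int(index.max().item()) + 1
    return torch.eye(n_categories)[index]


assert (one_hot_encoding_sketch(torch.tensor([0, 1, 2, 2]), n_categories=4)
        == torch.tensor([[1., 0., 0., 0.],
                         [0., 1., 0., 0.],
                         [0., 0., 1., 0.],
                         [0., 0., 1., 0.]])).all()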