def __call__(self, pred, device='cpu'):
    # For every (state, action) entry, keep the prediction of one randomly drawn ensemble member.
    # pred = torch.rand(size=(3, 5, 4))  # debug placeholder (agents, states, actions); overrides the input, keep disabled
    idx = torch.randint(low=0, high=pred.shape[0], size=pred.shape[1:]).to(pred.device)
    idx_ohe = one_hot_encoding(idx, n_categories=pred.shape[0], unsqueeze=True)
    return (pred * torch.permute(idx_ohe, [2, 0, 1])).sum(0)
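# A minimal standalone sketch of the same selection on toy shapes, using the standard
# torch.nn.functional.one_hot in place of pymatch's one_hot_encoding (the shapes and the
# helper choice are illustrative assumptions, not the library's implementation):
import torch

agents, states, actions = 3, 5, 4            # illustrative ensemble/rollout sizes
pred = torch.rand(agents, states, actions)   # stacked predictions of the ensemble members

idx = torch.randint(low=0, high=agents, size=(states, actions))   # random member per entry
idx_ohe = torch.nn.functional.one_hot(idx, num_classes=agents)    # (states, actions, agents)
selected = (pred * idx_ohe.permute(2, 0, 1)).sum(0)               # back to (states, actions)

assert selected.shape == (states, actions)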
def fit_epoch(self, device, verbose=1):
    self.model.train()
    self.model.to(device)
    losses = []

    for batch, (action, state, reward, new_state, terminal) in tqdm(enumerate(self.train_loader)):
        action, state, reward, new_state = action.to(self.device), state.to(self.device), \
            reward.to(self.device), new_state.to(self.device)
        prediction = self.model(state.squeeze(1))
        target = prediction.clone().detach()
        max_next = self.get_max_Q_for_states(new_state)

        mask = one_hot_encoding(action, n_categories=self.env.action_space.n).type(
            torch.BoolTensor).to(self.device)
        # Q-learning update on the taken actions only:
        # Q <- (1 - alpha) * Q + alpha * (r + gamma * max_a' Q(s', a') * (1 - terminal))
        target[mask] = (1 - self.alpha) * target[mask] + self.alpha * (
                reward.view(-1) + self.gamma * max_next *
                (1 - terminal.view(-1).type(torch.FloatTensor)).to(self.device)
        )
        loss = self.crit(prediction, target)
        losses += [loss.item()]
        self._backward(loss)

    self.train_dict['train_losses'] += [np.mean(losses).item()]

    # Alternative: the DQL loss defined in 'Asynchronous Methods for Deep Reinforcement Learning'
    # by Mnih et al., though this is not working yet:
    # for batch, (action, state, reward, new_state, terminal) in tqdm(enumerate(self.train_loader)):
    #     action, state, reward, new_state = action.to(self.device), state.to(self.device), reward.to(
    #         self.device), new_state.to(self.device)
    #     prediction = self.model(state.squeeze(1))
    #     max_next = self.get_max_Q_for_states(new_state)
    #     mask = one_hot_encoding(action, n_categories=self.env.action_space.n).type(torch.BoolTensor)
    #
    #     loss = self.crit(gamma=self.gamma, pred=prediction[mask], max_next=max_next, reward=reward)
    #     self.train_dict['train_losses'] += [loss.item()]
    #     self._backward(loss)

    if verbose == 1:
        print(
            f'epoch: {self.train_dict["epochs_run"]}\t'
            f'average reward: {np.mean(self.train_dict["rewards"]):.2f}\t'
            f'last loss: {self.train_dict["train_losses"][-1]:.2f}\t'
            f'latest average reward: {self.train_dict.get("avg_reward", [np.nan])[-1]:.2f}'
        )
    return loss
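# The commented-out variant above calls its criterion as crit(gamma=..., pred=..., max_next=..., reward=...).
# The class below is a hypothetical sketch of a one-step Q-learning loss with that signature
# (mean squared TD error); it is an assumption about what such a criterion could look like,
# not the criterion actually used by this trainer:
import torch


class SketchDQLLoss(torch.nn.Module):
    def forward(self, gamma, pred, max_next, reward):
        # Bootstrapped target r + gamma * max_a' Q(s', a'); no gradient flows through the target.
        td_target = reward.view(-1) + gamma * max_next.detach()
        return ((td_target - pred) ** 2).mean()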
def fit_epoch(self, device, verbose=1):
    self.model.train()
    self.model.to(device)
    self.target_model.to(device)

    for batch, (action, state, reward, next_state, next_action, terminal) in tqdm(enumerate(self.train_loader)):
        action, state, reward, next_state = action.to(self.device), state.to(self.device), \
            reward.to(self.device), next_state.to(self.device)
        prediction = self.model(state.squeeze(1))
        next_action = one_hot_encoding(next_action).to(self.device)

        with eval_mode(self):  # @todo this is not working with DDQN so far
            # bootstrapped value of the action actually taken in the next state, from the target network
            next_Q = (self.target_model(next_state.squeeze(1)) * next_action).sum(1)

        target = prediction.clone().detach()
        mask = one_hot_encoding(action).type(torch.BoolTensor).to(self.device)
        target[mask] = (1 - self.alpha) * target[mask] + self.alpha * (
                reward + self.gamma * next_Q *
                (1. - terminal.type(torch.FloatTensor)).to(self.device))
        loss = self.crit(prediction, target)

        self.train_dict['train_losses'] += [loss.item()]
        self._backward(loss)
        self.update_target_network()

    if verbose == 1:
        print(
            f'epoch: {self.train_dict["epochs_run"]}\t'
            f'average reward: {np.mean(self.train_dict["rewards"]):.2f}\t'
            f'latest average reward: {self.train_dict["avg_reward"][-1]:.2f}'
        )
    return loss
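# update_target_network() is not shown in this section. A common choice is a Polyak (soft)
# update of the target parameters; the sketch below assumes model and target_model share the
# same architecture and is only one standard option (a hard copy every N steps is the other):
import torch


@torch.no_grad()
def soft_update(target_model, model, tau=0.005):
    # target <- tau * online + (1 - tau) * target
    for target_param, param in zip(target_model.parameters(), model.parameters()):
        target_param.mul_(1.0 - tau).add_(tau * param)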
def __call__(self, pred, device='cpu'): """ Args: pred: probability distribution over classes by each model of the ensemble device: not used Returns: torch tensor of entropy for label distribution over the models """ actions = pred.max(-1)[1] ohe_actions = one_hot_encoding(actions, n_categories=4, unsqueeze=True) action_dist = ohe_actions.mean(0) action_entropy = entropy(torch.transpose(action_dist, 0, 1)) return super(EntropyHat, self).__call__(pred), torch.tensor(action_entropy).view( -1, 1)
def fit_epoch(self, device, verbose=1):
    self.model.train()
    self.model.to(device)
    losses = []

    for batch, (action, state, reward, new_state, terminal) in tqdm(enumerate(self.train_loader)):
        action, state, reward, new_state = action.to(self.device), state.to(self.device), \
            reward.to(self.device), new_state.to(self.device)
        values, advantage = self.model(state.squeeze(1))
        prediction = values + advantage  # combine state value and advantage into Q-estimates
        target = prediction.clone().detach()
        max_next = self.get_max_Q_for_states(new_state)

        mask = one_hot_encoding(action, n_categories=self.env.action_space.n).type(
            torch.BoolTensor).to(self.device)
        target[mask] = (1 - self.alpha) * target[mask] + self.alpha * (
                reward.view(-1) + self.gamma * max_next *
                (1 - terminal.view(-1).type(torch.FloatTensor)).to(self.device)
        )
        loss = self.crit(prediction, target)
        losses += [loss.item()]
        self._backward(loss)

    self.train_dict['train_losses'] += [np.mean(losses).item()]

    if verbose == 1:
        print(
            f'epoch: {self.train_dict["epochs_run"]}\t'
            f'average reward: {np.mean(self.train_dict["rewards"]):.2f}\t'
            f'last loss: {self.train_dict["train_losses"][-1]:.2f}\t'
            f'latest average reward: {self.train_dict.get("avg_reward", [np.nan])[-1]:.2f}'
        )
    return loss
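# For reference, the dueling architecture of Wang et al. usually centres the advantage before
# adding it to the state value so that V and A stay identifiable. The code above adds the raw
# advantage; the function below is a sketch of the standard combination, not of this trainer:
import torch


def dueling_q_values(values, advantage):
    # Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
    return values + advantage - advantage.mean(dim=1, keepdim=True)


q = dueling_q_values(torch.zeros(8, 1), torch.rand(8, 4))  # (batch, n_actions)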
        'deter_val_reward_std': np.std
    },
    frequency=1,
    epoch_name='det_val_epoch'),
rcb.EnvironmentEvaluator(
    env=TorchGym(**params['factory_args']['env_args']),
    n_evaluations=10,
    action_selector=sp.QActionSelection(temperature=params['temp'],
                                        post_pipeline=[EnsembleHat()]),
    metrics={
        'prob_val_reward_mean': np.mean,
        'prob_val_reward_std': np.std
    },
    frequency=1,
    epoch_name='prob_val_epoch'),
rcb.EnsembleRewardPlotter(
    metrics={
        'det_val_reward_mean': 'det_val_epoch',
        'prob_val_reward_mean': 'prob_val_epoch',
    }),
])

learner.fit(**params['fit'])

pred = learner(memory.memory['state'])
pred.shape

actions = pred.max(-1)[1]
ohe_actions = one_hot_encoding(actions, n_categories=4, unsqueeze=True)
action_probs = ohe_actions.mean(0)
action_entropy = entropy(torch.transpose(action_probs, 0, 1))

pred[:, 0]
from pymatch.utils.functional import one_hot_encoding
import torch

# number of categories inferred from the data
test_array = torch.tensor([0, 1, 2, 3])
encoding = one_hot_encoding(test_array)
assert (torch.eye(4) == encoding).type(torch.float).mean() == 1.

# number of categories given explicitly
test_array = torch.tensor([0, 1, 2, 3])
encoding = one_hot_encoding(test_array, n_categories=4)
assert (torch.eye(4) == encoding).type(torch.float).mean() == 1.

# not every category has to appear in the input
test_array = torch.tensor([0, 1, 2, 2])
encoding = one_hot_encoding(test_array, n_categories=4)
true_values = torch.eye(4)
true_values[-1, -2:] = torch.tensor([1, 0])
assert (true_values == encoding).type(torch.float).mean() == 1.
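# A minimal implementation consistent with these tests might look as follows. This is a sketch
# for reference, not necessarily pymatch's actual code; inferring n_categories from the data when
# it is omitted is an assumption, and the unsqueeze flag used elsewhere in this section is not modelled:
import torch


def one_hot_encoding_sketch(labels, n_categories=None):
    if n_categories is None:
        n_categories = int(labels.max().item()) + 1  # assumed default: infer from the data
    # Each label indexes a row of an identity matrix.
    return torch.eye(n_categories, device=labels.device)[labels]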