def act_vectorized(self, obs, goal, horizon=None, greedy=False, noise=0, marginal_policy=None):
    """Sample a batch of flattened discrete actions for (obs, goal) pairs.

    Logits are reshaped to (batch, n_dims, granularity) and each dimension is
    treated as an independent categorical. `noise` both flattens the logits
    (temperature-style scaling by 1 - noise) and, in greedy mode, mixes in
    uniformly random actions with probability `noise`.

    NOTE(review): `marginal_policy` is accepted but never used in this
    variant — presumably kept for signature compatibility; confirm.

    Returns a numpy array of flattened action indices.
    """
    obs_t = torch.tensor(obs, dtype=torch.float32)
    goal_t = torch.tensor(goal, dtype=torch.float32)
    horizon_t = None if horizon is None else torch.tensor(horizon, dtype=torch.float32)

    # One categorical per action dimension: (batch, n_dims, granularity).
    per_dim_logits = self.forward(obs_t, goal_t, horizon=horizon_t).view(
        -1, self.n_dims, self.granularity)
    action_probs = torch.softmax(per_dim_logits * (1 - noise), 2)

    if greedy:
        chosen = action_probs.argmax(dim=-1)
    else:
        chosen = torch.distributions.categorical.Categorical(
            probs=action_probs).sample()

    # Collapse per-dimension indices into a single flat action index.
    flat = self.flattened(chosen)
    if not greedy:
        return ptu.to_numpy(flat)

    # Greedy + noise: epsilon-greedy mixing with uniform random actions.
    flat_np = ptu.to_numpy(flat)
    rand_actions = np.random.choice(self.action_space.n, size=len(flat_np))
    take_random = np.random.rand(len(flat_np)) < noise
    return np.where(take_random, rand_actions, flat_np)
def act_vectorized(self, obs, goal, horizon=None, greedy=False, noise=0, marginal_policy=None):
    """Sample a batch of discrete actions for (obs, goal) pairs.

    If `marginal_policy` is given, its logits (computed against a zero
    "dummy" goal) are subtracted from this policy's logits before the
    softmax — a goal-marginal correction. `noise` scales logits by
    (1 - noise), flattening the distribution toward uniform.

    Returns a numpy array of sampled (or argmax, if greedy) action indices.
    """
    obs = torch.tensor(obs, dtype=torch.float32)
    goal = torch.tensor(goal, dtype=torch.float32)
    if horizon is not None:
        horizon = torch.tensor(horizon, dtype=torch.float32)

    scores = self.forward(obs, goal, horizon=horizon)
    if marginal_policy is not None:
        # Marginal is evaluated at an all-zero goal of the same shape.
        zero_goal = torch.zeros_like(goal)
        scores -= marginal_policy.forward(obs, zero_goal, horizon)

    dist_probs = torch.softmax(scores * (1 - noise), 1)
    if greedy:
        picked = torch.argmax(dist_probs, dim=-1)
    else:
        picked = torch.distributions.categorical.Categorical(
            probs=dist_probs).sample()
    return ptu.to_numpy(picked)
def entropy(self, obs, goal, horizon=None):
    """Entropy of the factored (per-dimension) categorical policy.

    Logits are reshaped to (batch, n_dims, granularity); the entropy of each
    dimension's categorical is computed as log Z - E[logits] and summed over
    dimensions, giving one scalar per batch element.

    Returns a tensor of shape (batch,).
    """
    logits = self.forward(obs, goal, horizon=horizon)
    logits = logits.view(-1, self.n_dims, self.granularity)
    # Bug fix: original referenced undefined `noisy_logits` (copy-paste from
    # act_vectorized) and raised NameError; entropy uses the raw logits.
    probs = torch.softmax(logits, 2)
    Z = torch.logsumexp(logits, dim=2)
    return (Z - torch.sum(probs * logits, 2)).sum(1)
def entropy(self, obs, goal, horizon=None):
    """Shannon entropy of the softmax policy, one scalar per batch element.

    Computed as log Z - E[logits] over the action dimension (dim 1), which
    equals -sum(p * log p) for p = softmax(logits).
    """
    scores = self.forward(obs, goal, horizon=horizon)
    log_partition = torch.logsumexp(scores, dim=1)
    expected_score = torch.sum(torch.softmax(scores, 1) * scores, 1)
    return log_partition - expected_score
def probabilities(self, obs, goal, horizon=None):
    """Action distribution for each batch element: softmax over the logits."""
    return torch.softmax(self.forward(obs, goal, horizon=horizon), 1)