def test_convolution(dist, n_dims):
    assert dist.sample(1).shape == (1, n_dims)
    assert dist.sample(64).shape == (64, n_dims)
    try:
        dist.log_prob(dist.sample(64))
    except NotImplementedError:
        pass
    dist.get_parameters()
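# For reference, a minimal sketch of a distribution that would satisfy the
# interface exercised above. The class name `Convolution` and its constructor
# are assumptions, not the repo's actual implementation: sampling a sum of
# independent variables is easy (sum the component samples), but the density
# of a sum is a convolution integral, hence the tolerated NotImplementedError.
class Convolution:
    def __init__(self, components):
        # Each component exposes sample(n) -> array of shape (n, n_dims).
        self.components = components

    def sample(self, n):
        # A sum of independent draws is a draw from the convolution.
        return sum(c.sample(n) for c in self.components)

    def log_prob(self, samples):
        # The density of a sum is generally intractable.
        raise NotImplementedError

    def get_parameters(self):
        return {'n_components': len(self.components)}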
def update(self, observations, actions, advantages, q_values=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    advantages = ptu.from_numpy(advantages)

    # Policy gradient loss: we want to MAXIMIZE the expectation over
    # collected trajectories of
    #     sum_{t=0}^{T-1} [log pi(a_t|s_t) * (Q_t - b_t)],
    # so we minimize its negation, since `optimizer.step()` MINIMIZES a loss.
    # `log_prob` comes from the distribution returned by `forward`.
    distributions = self(observations)
    if self.discrete:
        log_probs = distributions.log_prob(actions)
    else:
        # For a factored continuous policy, sum per-dimension log-probs.
        log_probs = distributions.log_prob(actions).sum(dim=-1)
    loss = -torch.sum(advantages * log_probs)

    # Optimize `loss` with `self.optimizer`, zeroing gradients first.
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    if self.nn_baseline:
        # Normalize the q_values to zero mean and unit standard deviation
        # using the `normalize` function from `infrastructure.utils`.
        targets = utils.normalize(q_values, q_values.mean(), q_values.std())
        targets = ptu.from_numpy(targets)

        # Baseline predictions from `self.baseline`; `squeeze` removes the
        # trailing size-1 dimension to avoid subtle broadcasting bugs between
        # shapes [N] and [N x 1].
        baseline_predictions = self.baseline(observations).squeeze()
        assert baseline_predictions.shape == targets.shape

        # Regress the baseline onto the normalized q_values (MSE loss).
        baseline_loss = self.baseline_loss(baseline_predictions, targets)

        # Optimize `baseline_loss` with `self.baseline_optimizer`.
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()

    train_log = {
        'Training Loss': ptu.to_numpy(loss),
    }
    return train_log
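# A hedged usage sketch for the update above. The constructor name and
# arguments (`MLPPolicyPG`, `ac_dim`, `ob_dim`, `n_layers`, `size`) are
# assumptions for illustration; only the shapes passed to `update` follow
# from the code itself (N observations, N actions, N scalar advantages).
import numpy as np

policy = MLPPolicyPG(ac_dim=2, ob_dim=10, n_layers=2, size=64)  # hypothetical ctor
observations = np.random.randn(64, 10).astype(np.float32)
actions = np.random.randn(64, 2).astype(np.float32)
advantages = np.random.randn(64).astype(np.float32)
q_values = np.random.randn(64).astype(np.float32)

train_log = policy.update(observations, actions, advantages, q_values=q_values)
print(train_log['Training Loss'])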
def test_gumbel_softmax(dist, n_dims):
    assert dist.sample(1).shape == (1, n_dims)
    assert dist.log_prob(dist.sample(1)).shape == (1,)
    samples = dist.sample(64)
    assert samples.shape == (64, n_dims)
    log_probs = dist.log_prob(samples)
    assert log_probs.shape == (64,)
    dist.get_parameters()
    try:
        dist.entropy()
    except NotImplementedError:
        pass
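# What the test above exercises: Gumbel-softmax sampling draws a relaxed
# one-hot vector by perturbing logits with Gumbel(0, 1) noise and applying a
# temperature-scaled softmax. A self-contained sketch of that sampling step
# (the repo's own class may differ in API and in how log_prob is computed):
import torch
import torch.nn.functional as F

def gumbel_softmax_sample(logits, temperature=1.0):
    # Gumbel(0, 1) noise via -log(-log(U)), U ~ Uniform(0, 1);
    # the small epsilons guard against log(0).
    gumbel_noise = -torch.log(-torch.log(torch.rand_like(logits) + 1e-20) + 1e-20)
    # Lower temperature -> samples closer to discrete one-hot vectors.
    return F.softmax((logits + gumbel_noise) / temperature, dim=-1)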
def update(self, observations, actions, adv_n=None):
    # Update the policy with a single policy-gradient step and return the loss.
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    adv_n = ptu.from_numpy(adv_n)

    distributions = self(observations)
    if self.discrete:
        log_probs = distributions.log_prob(actions)
    else:
        # Sum per-dimension log-probs for continuous actions.
        log_probs = distributions.log_prob(actions).sum(dim=-1)
    loss = -torch.sum(adv_n * log_probs)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()
def test_normal_dist(dist, n_dims):
    assert dist.sample(1).shape == (1, n_dims)
    assert dist.log_prob(dist.sample(1)).shape == (1,)
    samples = dist.sample(64)
    assert samples.shape == (64, n_dims)
    log_probs = dist.log_prob(samples)
    assert log_probs.shape == (64,)
    dist.get_parameters()
    dist.num_parameters  # property access should not raise
    try:
        dist.entropy()
    except NotImplementedError:
        pass
    try:
        dist.perplexity()
    except NotImplementedError:
        pass
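# A minimal diagonal-Gaussian wrapper that would pass the shape checks above,
# built on torch.distributions. It is a sketch of the tested interface, not
# the repo's actual class; `num_parameters` counting one mean and one scale
# per dimension is an assumption.
import torch

class NormalSketch:
    def __init__(self, loc, scale):
        self.loc, self.scale = loc, scale        # each of shape (n_dims,)
        self._dist = torch.distributions.Normal(loc, scale)

    @property
    def num_parameters(self):
        return 2 * self.loc.numel()

    def sample(self, n):
        return self._dist.sample((n,))           # (n, n_dims)

    def log_prob(self, samples):
        # Independent dimensions: the joint log-prob sums over the last axis.
        return self._dist.log_prob(samples).sum(dim=-1)  # (n,)

    def entropy(self):
        return self._dist.entropy().sum(dim=-1)

    def perplexity(self):
        return torch.exp(self.entropy())

    def get_parameters(self):
        return {'loc': self.loc, 'scale': self.scale}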
def update(self, observations, actions, adv_n=None, q_values=None):
    # Update the policy and return the loss.
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    assert adv_n is not None, "Need non-null advantages to calculate loss!"
    advantages = ptu.from_numpy(adv_n)

    # Forward pass to construct the action distributions.
    distributions = self.forward(observations)

    # Log-probability of the actions actually taken, under those distributions.
    log_prob_of_actions = torch.squeeze(distributions.log_prob(actions))

    # Negate because we want to MAXIMIZE the expected advantage-weighted
    # log-probability, while `optimizer.step()` MINIMIZES a loss.
    loss = -torch.dot(log_prob_of_actions, advantages)

    # Optimize `loss` using `self.optimizer`, zeroing gradients first.
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    if self.nn_baseline:
        # Normalize the q_values to zero mean and unit standard deviation
        # with the `normalize` function from `infrastructure.utils`.
        targets = normalize(q_values, q_values.mean(), q_values.std())
        targets = ptu.from_numpy(targets)

        # Baseline predictions via `self.baseline`; `squeeze` removes the
        # trailing size-1 dimension to avoid subtle broadcasting bugs between
        # shapes [N] and [N x 1].
        baseline_predictions = torch.squeeze(self.baseline.forward(observations))
        assert baseline_predictions.shape == targets.shape, "not right shape!"

        # Regress the baseline onto the normalized q_values with MSE.
        baseline_loss = F.mse_loss(baseline_predictions, targets)

        # Optimize `baseline_loss` using `self.baseline_optimizer`.
        self.baseline_optimizer.zero_grad()
        baseline_loss.backward()
        self.baseline_optimizer.step()

    return loss.item()
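# The baseline branch leans on a `normalize` helper from
# `infrastructure.utils`. A sketch consistent with how it is called here,
# i.e. normalize(data, data.mean(), data.std()) yields zero mean and unit
# standard deviation (the repo's exact signature may differ):
import numpy as np

def normalize(data, mean, std, eps=1e-8):
    # Standardize `data`; eps guards against division by a zero std.
    return (data - mean) / (std + eps)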
def test_dirac_delta(loc, n_dims):
    dist = DiracDelta(loc)
    assert dist.sample(1).shape == (1, n_dims), "{} {}".format(
        dist.sample(1).shape, n_dims)
    # A point mass has no density, so log_prob may be unimplemented.
    try:
        dist.log_prob(dist.sample(64))
    except NotImplementedError:
        pass
    samples = dist.sample(64)
    assert samples.shape == (64, n_dims)
    if n_dims == 1:
        assert dist.get_parameters()['loc'] == 1.0
    elif n_dims == 2:
        assert (dist.get_parameters()['loc'] == np.array([2.0, 3.0])).all()
    elif n_dims == 3:
        assert (dist.get_parameters()['loc'] == np.array([1.0, 2.0, 3.0])).all()
    else:
        raise ValueError()
    dist.get_parameters()
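# A sketch of a DiracDelta consistent with the assertions above: sampling
# tiles `loc`, and because a point mass has no density, log_prob raises
# NotImplementedError (interface inferred from the test, not the repo's code).
import numpy as np

class DiracDeltaSketch:
    def __init__(self, loc):
        self.loc = np.atleast_1d(np.asarray(loc, dtype=np.float64))

    def sample(self, n):
        return np.tile(self.loc, (n, 1))  # (n, n_dims)

    def log_prob(self, samples):
        raise NotImplementedError

    def get_parameters(self):
        return {'loc': self.loc}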
def test_data_dist(n_dims):
    data = torch.randn(1000, n_dims)
    dist = Data(data)
    assert dist.sample(1).shape == (1, n_dims)
    assert dist.sample(64).shape == (64, n_dims)
    try:
        dist.log_prob(dist.sample(64))
    except NotImplementedError:
        pass
    assert dist.get_parameters()['n_dims'] == n_dims
    assert dist.get_parameters()['n_samples'] == 1000

    data = np.random.randn(100, n_dims)
    dist = Data(data)
    assert dist.sample(1).shape == (1, n_dims)
    assert dist.sample(64).shape == (64, n_dims)
    assert dist.get_parameters()['n_dims'] == n_dims
    assert dist.get_parameters()['n_samples'] == 100
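# A sketch of an empirical `Data` distribution matching the checks above:
# sampling draws rows uniformly with replacement, and log_prob is undefined
# for a finite sample, so it may raise (assumed interface, not the repo's
# actual class).
import numpy as np
import torch

class DataSketch:
    def __init__(self, data):
        # Accept a torch tensor or a numpy array, as the test does.
        self.data = data.numpy() if isinstance(data, torch.Tensor) else np.asarray(data)

    def sample(self, n):
        idx = np.random.randint(0, self.data.shape[0], size=n)
        return self.data[idx]  # (n, n_dims)

    def log_prob(self, samples):
        raise NotImplementedError

    def get_parameters(self):
        return {'n_samples': self.data.shape[0], 'n_dims': self.data.shape[1]}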