Code Example #1
def test_convolution(dist, n_dims):
    assert dist.sample(1).shape == (1, n_dims)
    assert dist.sample(64).shape == (64, n_dims)

    try:
        dist.log_prob(dist.sample(64))
    except NotImplementedError:
        pass

    dist.get_parameters()
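
These tests pin down a common interface shared by the distribution classes in this section: `sample(n)` returns a batch of shape `(n, n_dims)`, `log_prob` is allowed to be unimplemented, and `get_parameters()` returns a dict. A minimal sketch of the base class the tests seem to assume (the class name and signatures here are inferred from the tests, not taken from the project):

import torch

class Distribution:
    """Hypothetical base interface implied by the tests in this section."""

    def sample(self, n: int) -> torch.Tensor:
        # Return a batch of samples with shape (n, n_dims).
        raise NotImplementedError

    def log_prob(self, x: torch.Tensor) -> torch.Tensor:
        # Return per-sample log-probabilities with shape (n,).
        # Distributions without a density (e.g. a point mass) may leave
        # this unimplemented; the tests catch NotImplementedError.
        raise NotImplementedError

    def entropy(self) -> torch.Tensor:
        raise NotImplementedError

    def get_parameters(self) -> dict:
        raise NotImplementedError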
Code Example #2
    def update(self, observations, actions, advantages, q_values=None):
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # TODO: compute the loss that should be optimized when training with policy gradient
        # HINT1: Recall that the expression that we want to MAXIMIZE
        # is the expectation over collected trajectories of:
        # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]]
        # HINT2: you will want to use the `log_prob` method on the distribution returned
        # by the `forward` method
        # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss
        distributions = self(observations)
        if self.discrete:
            log_probs = distributions.log_prob(actions)
        else:
            # Continuous log_prob is per action dimension; sum for the
            # joint log-probability of the full action.
            log_probs = distributions.log_prob(actions).sum(axis=-1)
        loss = -torch.sum(advantages * log_probs)

        # TODO: optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.nn_baseline:
            ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one
            ## HINT: there is a `normalize` function in `infrastructure.utils`
            targets = utils.normalize(q_values, q_values.mean(),
                                      q_values.std())
            targets = ptu.from_numpy(targets)

            ## TODO: use the `forward` method of `self.baseline` to get baseline predictions
            baseline_predictions = self.baseline(observations).squeeze()

            ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
            ## [ N ] versus shape [ N x 1 ]
            ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
            assert baseline_predictions.shape == targets.shape

            # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`)
            # HINT: use `F.mse_loss`
            baseline_loss = self.baseline_loss(baseline_predictions, targets)

            # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
            # HINT: remember to `zero_grad` first
            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()

        train_log = {
            'Training Loss': ptu.to_numpy(loss),
        }
        return train_log
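
The `.sum(axis=-1)` in the continuous branch above is there because a factorized Gaussian policy computes `log_prob` per action dimension; the joint log-probability of an independent multivariate action is the sum over those dimensions. A standalone check with `torch.distributions` (not code from the project):

import torch
from torch import distributions

# A diagonal Gaussian over a 3-dimensional continuous action space
dist = distributions.Normal(loc=torch.zeros(3), scale=torch.ones(3))
action = torch.tensor([0.1, -0.2, 0.3])

per_dim = dist.log_prob(action)  # shape (3,): one log-density per dimension
joint = per_dim.sum(axis=-1)     # scalar: log-probability of the full action

# Independent sums over the last dimension internally, giving the same value
assert torch.allclose(joint, distributions.Independent(dist, 1).log_prob(action))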
Code Example #3
def test_gumbel_softmax(dist, n_dims):
    assert dist.sample(1).shape == (1, n_dims)
    assert dist.log_prob(dist.sample(1)).shape == (1, )

    samples = dist.sample(64)
    assert samples.shape == (64, n_dims)

    log_probs = dist.log_prob(samples)
    assert log_probs.shape == (64, )

    dist.get_parameters()
    try:
        dist.entropy()
    except NotImplementedError:
        pass
Code Example #4
    def update(self, observations, actions, adv_n=None):
        # TODO: update the policy and return the loss

        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        adv_n = ptu.from_numpy(adv_n)

        distributions = self(observations)
        if self.discrete:
            log_probs = distributions.log_prob(actions)
        else:
            log_probs = distributions.log_prob(actions).sum(axis=-1)
        loss = -torch.sum(adv_n * log_probs)
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()
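
A hedged usage sketch for this variant, which returns the scalar loss directly. The `policy` object, the 4-dimensional observations, and the 2-dimensional continuous actions are all illustrative assumptions, not values from the project:

import numpy as np

observations = np.random.randn(32, 4).astype(np.float32)
actions = np.random.randn(32, 2).astype(np.float32)
advantages = np.random.randn(32).astype(np.float32)

loss = policy.update(observations, actions, adv_n=advantages)
print("policy gradient loss: {:.4f}".format(loss))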
Code Example #5
def test_normal_dist(dist, n_dims):
    assert dist.sample(1).shape == (1, n_dims)
    assert dist.log_prob(dist.sample(1)).shape == (1, )

    samples = dist.sample(64)
    assert samples.shape == (64, n_dims)

    log_probs = dist.log_prob(samples)
    assert log_probs.shape == (64, )

    dist.get_parameters()
    dist.num_parameters
    try:
        dist.entropy()
    except NotImplementedError:
        pass
    try:
        dist.perplexity()
    except NotImplementedError:
        pass
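
This test tolerates `entropy` and `perplexity` being unimplemented. Where both exist, perplexity is conventionally the exponential of entropy; a small sketch of that relationship (the helper function is hypothetical, not from the project):

import torch

def perplexity(dist):
    # Conventional relationship: perplexity = exp(entropy).
    return torch.exp(dist.entropy())

# A standard normal has entropy 0.5 * log(2 * pi * e) ≈ 1.4189,
# so its perplexity is approximately 4.1327.
print(perplexity(torch.distributions.Normal(0.0, 1.0)))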
Code Example #6
File: MLP_policy.py  Project: zacharyjgs/covid-rl
    def update(self, observations, actions, adv_n=None, q_values=None):
        # TODO: update the policy and return the loss
        observations = ptu.from_numpy(observations)
        actions = ptu.from_numpy(actions)
        assert adv_n is not None, "Need non-null advantages to calculate loss!"
        advantages = ptu.from_numpy(adv_n)

        # Do a forward pass to construct action distributions
        distributions = self.forward(observations)

        # Log-probability of each action taken, under those distributions
        # (squeeze assumes log_prob yields one scalar per action)
        log_prob_of_actions = torch.squeeze(distributions.log_prob(actions))
        loss = -torch.dot(log_prob_of_actions, advantages)  # negated: we maximize the objective

        # TODO: optimize `loss` using `self.optimizer`
        # HINT: remember to `zero_grad` first
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.nn_baseline:
            ## normalize the q_values to have a mean of zero and a standard deviation of one
            ## HINT: there is a `normalize` function in `infrastructure.utils`
            assert q_values is not None, "Need q_values to fit the baseline!"
            targets = normalize(q_values, q_values.mean(), q_values.std())
            targets = ptu.from_numpy(targets)

            ## use the `forward` method of `self.baseline` to get baseline predictions
            baseline_predictions = self.baseline.forward(observations)
            
            ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape
            ## [ N ] versus shape [ N x 1 ]
            ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1
            baseline_predictions = torch.squeeze(baseline_predictions)
            assert baseline_predictions.shape == targets.shape, "not right shape!"
            
            # compute the loss for training the baseline MLP (`self.baseline`)
            # using mean squared error against the normalized targets
            baseline_loss = F.mse_loss(baseline_predictions, targets)

            # TODO: optimize `baseline_loss` using `self.baseline_optimizer`
            # HINT: remember to `zero_grad` first
            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()

        return loss.item()
Code Example #7
def test_dirac_delta(loc, n_dims):
    dist = DiracDelta(loc)
    assert dist.sample(1).shape == (1, n_dims), "{} {}".format(
        dist.sample(1).shape, n_dims)

    try:
        dist.log_prob(dist.sample(64))
    except NotImplementedError:
        pass

    samples = dist.sample(64)
    assert samples.shape == (64, n_dims)

    if n_dims == 1:
        assert dist.get_parameters()['loc'] == 1.0
    elif n_dims == 2:
        assert (dist.get_parameters()['loc'] == np.array([2.0, 3.0])).all()
    elif n_dims == 3:
        assert (dist.get_parameters()['loc'] == np.array([1.0, 2.0,
                                                          3.0])).all()
    else:
        raise ValueError("unexpected n_dims: {}".format(n_dims))
    dist.get_parameters()
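
Because the fixture values (1.0, [2.0, 3.0], [1.0, 2.0, 3.0]) are baked into the assertions, this test specifies `DiracDelta` fairly completely: sampling repeats `loc`, the density is undefined, and `get_parameters` echoes `loc` back. A minimal sketch consistent with that contract (the real implementation may differ in details such as dtype handling):

import numpy as np

class DiracDelta:
    """Point-mass distribution: every sample equals `loc`."""

    def __init__(self, loc):
        self.loc = np.atleast_1d(np.asarray(loc, dtype=np.float64))

    def sample(self, n):
        # Tile loc into a batch of shape (n, n_dims).
        return np.tile(self.loc, (n, 1))

    def log_prob(self, x):
        # A point mass has no finite density, so log_prob stays unimplemented.
        raise NotImplementedError

    def get_parameters(self):
        return {'loc': self.loc}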
Code Example #8
def test_data_dist(n_dims):

    data = torch.randn(1000, n_dims)
    dist = Data(data)

    assert dist.sample(1).shape == (1, n_dims)
    assert dist.sample(64).shape == (64, n_dims)

    try:
        dist.log_prob(dist.sample(64))
    except NotImplementedError:
        pass

    assert dist.get_parameters()['n_dims'] == n_dims
    assert dist.get_parameters()['n_samples'] == 1000

    data = np.random.randn(100, n_dims)
    dist = Data(data)

    assert dist.sample(1).shape == (1, n_dims)
    assert dist.sample(64).shape == (64, n_dims)

    assert dist.get_parameters()['n_dims'] == n_dims
    assert dist.get_parameters()['n_samples'] == 100
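
`Data` wraps a fixed sample set, accepts either a torch tensor or a numpy array, and reports the stored shape through `get_parameters`. A minimal sketch matching what the test checks; drawing rows uniformly with replacement is an assumption, since the test only constrains shapes:

import numpy as np
import torch

class Data:
    """Empirical distribution backed by a fixed set of samples."""

    def __init__(self, data):
        # Accept numpy arrays and torch tensors uniformly.
        self.data = torch.as_tensor(np.asarray(data))
        self.n_samples, self.n_dims = self.data.shape

    def sample(self, n):
        # Draw n rows uniformly at random, with replacement.
        idx = torch.randint(self.n_samples, (n,))
        return self.data[idx]

    def log_prob(self, x):
        # An empirical sample set has no usable density.
        raise NotImplementedError

    def get_parameters(self):
        return {'n_dims': self.n_dims, 'n_samples': self.n_samples}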