import numpy as np
import torch
from torch.autograd import gradcheck, gradgradcheck
from torch.nn.utils.rnn import pad_sequence

# Project-local imports: Viterbi, LinearPotential and make_data come from this
# repository; the exact module paths below are assumptions -- adjust to your layout.
from didypro.modules import Viterbi, LinearPotential
from didypro.reference import make_data


def test_grad_grad_viterbi(operator):
    states, emissions, theta = make_data(10)
    # torch.from_numpy keeps the float64 dtype that gradgradcheck needs.
    theta = torch.from_numpy(theta)
    theta = theta[:, None, :, :]  # add a batch dimension: (T, 1, S, S)
    theta.requires_grad_()
    viterbi = Viterbi(operator)
    gradgradcheck(viterbi, (theta,))
def test_viterbi(operator):
    states, emissions, theta = make_data(10)
    theta = torch.from_numpy(theta)
    theta.requires_grad_()
    W = theta[:, None, :, :]  # add a batch dimension: (T, 1, S, S)
    viterbi = Viterbi(operator)
    v = viterbi(W)
    s = v.sum()
    s.backward()
    # The gradient of the (smoothed) max-score w.r.t. the potentials acts as an
    # indicator of the best path, so an argmax over states decodes the sequence.
    decoded = torch.argmax(theta.grad.sum(dim=2), dim=1).numpy()
    assert np.all(decoded == states)
def test_hessian_viterbi(operator):
    torch.manual_seed(0)
    states, emissions, theta = make_data(10)
    theta /= 100  # scale down the potentials
    theta = torch.from_numpy(theta)
    theta = theta[:, None, :, :]
    theta.requires_grad_()
    viterbi = Viterbi(operator)
    ll = viterbi(theta)
    # Hessian-vector product: backpropagate through the gradient itself.
    g, = torch.autograd.grad(ll, (theta,), create_graph=True)
    z = torch.randn_like(g)
    s = torch.sum(g * z)
    s.backward()
    assert theta.grad.shape == (10, 1, 3, 3)
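# The test above uses the standard Hessian-vector-product trick: differentiating
# sum(g * z) with respect to theta yields H @ z without ever materializing the
# Hessian H. A minimal standalone sketch of the same pattern (`hvp` is a
# hypothetical helper, not part of this repository):
def hvp(f, x, z):
    """Return the Hessian-vector product H(x) @ z for a scalar function f."""
    g, = torch.autograd.grad(f(x), (x,), create_graph=True)
    hz, = torch.autograd.grad(torch.sum(g * z), (x,))
    return hz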
def test_grad_hessian_viterbi_two_samples(operator):
    states1, emissions1, theta1 = make_data(10)
    states2, emissions2, theta2 = make_data(5)
    lengths = torch.LongTensor([10, 5])
    theta1 = torch.from_numpy(theta1)
    theta2 = torch.from_numpy(theta2)
    theta1.requires_grad_()
    theta2.requires_grad_()
    viterbi = Viterbi(operator)

    def func(theta1_, theta2_):
        W = pad_sequence([theta1_, theta2_])
        return viterbi(W, lengths)

    gradcheck(func, (theta1, theta2))
    gradgradcheck(func, (theta1, theta2))
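# pad_sequence (torch.nn.utils.rnn) stacks variable-length tensors into a single
# (max_len, batch, ...) tensor, zero-padding the shorter ones; the `lengths`
# tensor then tells Viterbi where each sequence really ends. A quick check of
# that shape contract (3 states per step assumed, matching make_data above;
# this demo is illustrative and not one of the repository's tests):
def _pad_sequence_shape_demo():
    a = torch.randn(10, 3, 3)
    b = torch.randn(5, 3, 3)
    W = pad_sequence([a, b])
    assert W.shape == (10, 2, 3, 3)
    assert torch.all(W[5:, 1] == 0)  # trailing steps of the short sequence are padding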
def test_viterbi_two_lengths(operator):
    states1, emissions1, theta1 = make_data(10)
    states2, emissions2, theta2 = make_data(5)
    lengths = torch.LongTensor([10, 5])
    theta1 = torch.from_numpy(theta1)
    theta2 = torch.from_numpy(theta2)
    theta1.requires_grad_()
    theta2.requires_grad_()
    W = pad_sequence([theta1, theta2])
    viterbi = Viterbi(operator)
    v = viterbi(W, lengths=lengths)
    s = v.sum()
    s.backward()
    decoded1 = torch.argmax(theta1.grad.sum(dim=2), dim=1).numpy()
    decoded2 = torch.argmax(theta2.grad.sum(dim=2), dim=1).numpy()
    assert np.all(decoded1 == states1)
    assert np.all(decoded2 == states2)
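# Every test above takes an `operator` argument, which pytest is expected to
# supply via a fixture or parametrization. A minimal sketch (the operator names
# are assumptions based on common smoothed-max choices; conventionally this
# would sit at the top of the module):
import pytest

@pytest.fixture(params=['hardmax', 'softmax', 'sparsemax'])
def operator(request):
    return request.param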
# --- Quick profiling benchmark (script-style, not a test) ---

length = 100
batch_size = 256
n_targets = 32
n_features = 100
gpu = True
operator = 'sparsemax'

if gpu and torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

X = torch.FloatTensor(length, batch_size, n_features).uniform_()
viterbi = Viterbi(operator=operator)
linear_potential = LinearPotential(n_features, n_targets)
theta = linear_potential(X)
theta = theta.detach()
theta = theta.to(device)
theta.requires_grad_()
z = torch.randn_like(theta)

value = torch.sum(viterbi(theta))
g, = torch.autograd.grad(value, (theta,), create_graph=True)
s = torch.sum(g * z)
# Only enable CUDA profiling when we actually run on a GPU.
with torch.autograd.profiler.profile(use_cuda=(device.type == 'cuda')) as prof:
    s.backward()
print('Value', value.item())
print('|g|', torch.sum(torch.abs(theta.grad)).item())
# The exported trace can be inspected in Chrome's chrome://tracing viewer.
prof.export_chrome_trace('prof.txt')