import numpy as np
import torch
import qelos as q
# NOTE: the classes under test (TransformerDecoder, TransformerDecoderBlock,
# TS2S_arg, MultiHeadAttention) are assumed to be imported from the package
# under test; the exact module path is not part of this excerpt.


def test_it(self):
    # whole-sequence decoding and step-wise cell-mode decoding must agree
    # on both outputs and input gradients
    x = torch.randn(3, 4, 12)
    x.requires_grad = True
    m = TransformerDecoder(12, numheads=4, numlayers=2, noctx=True)
    mc = q.deep_copy(m)
    mc.set_cell_mode(True)
    y = m(x)
    y.norm(1).backward()
    xgrad = x.grad
    print(y.norm(1, 2))
    print(xgrad.norm(1, 2))
    # re-create x as a fresh leaf tensor with the same values so the
    # cell-mode pass accumulates its own gradients
    x = torch.tensor(x.detach().numpy() + 0.)
    x.requires_grad = True
    ys = []
    for i in range(x.size(1)):
        ys.append(mc(x[:, i].unsqueeze(1)))
    ys = torch.cat(ys, 1)
    print(ys.norm(1, 2))
    ys.norm(1).backward()
    xsgrad = x.grad
    print(xsgrad.norm(1, 2))
    self.assertTrue(np.allclose(y.detach().numpy(), ys.detach().numpy(), atol=1e-5))
    self.assertTrue(np.allclose(xgrad.detach().numpy(), xsgrad.detach().numpy(), atol=1e-5))
def test_it_relpos_out_of_horizon(self):
    # sliding a fixed-horizon window over the full sequence must match
    # step-wise cell-mode decoding on the steps they have in common
    seqlen = 7  # 10
    horizon = 7
    x = torch.randn(3, seqlen, 12)
    x.requires_grad = True
    m = TransformerDecoder(12, numheads=4, numlayers=2, noctx=True, relpos=False)
    mc = q.deep_copy(m)
    mc.set_cell_mode(True)
    ys = []
    allys = []
    for i in range(horizon, seqlen + 1):
        y = m(x[:, i - horizon:i])
        allys.append(y)
        ys.append(y[:, -1].unsqueeze(1))
    ys_ref = torch.cat(ys, 1)
    print(ys_ref.size())
    ys_ref.norm(1).backward()
    allys = allys[0]
    xgrad = x.grad
    print(ys_ref.norm(1, 2))
    print(xgrad.norm(1, 2))
    # fresh leaf copy of x for the cell-mode pass
    x = torch.tensor(x.detach().numpy() + 0.)
    x.requires_grad = True
    ys = []
    for i in range(seqlen):
        ys.append(mc(x[:, i].unsqueeze(1)))
    ys = torch.cat(ys, 1)
    print(ys.size())
    _allys = ys[:, :horizon]
    ys = ys[:, -(seqlen - horizon + 1):]
    print(ys.norm(1, 2))
    ys.norm(1).backward()
    xsgrad = x.grad
    print(xsgrad.norm(1, 2))
    print("ALL YS")
    print(allys.norm(1, 2))
    print("___")
    print(_allys.norm(1, 2))
    self.assertTrue(np.allclose(ys_ref.detach().numpy(), ys.detach().numpy(), atol=1e-5))
    self.assertTrue(np.allclose(xgrad.detach().numpy(), xsgrad.detach().numpy(), atol=1e-5))
def test_it(self):
    # sequence-to-sequence transformer: full decode vs. cell-mode decode
    # must agree on outputs and on gradients w.r.t. both inputs
    x = torch.randn(4, 5, 12)
    y = torch.randn(4, 5, 12)
    x.requires_grad = True
    y.requires_grad = True
    numheads = 6
    m = TS2S_arg(dim=12, numlayers=2, numheads=numheads)
    z = m(x, y)
    print(z.size())
    z[:, -1].norm(1).backward()
    xgrad = x.grad
    ygrad = y.grad
    print(xgrad.norm(1, 2))
    print(ygrad.norm(1, 2))
    zref = z
    mc = q.deep_copy(m)
    mc.set_cell_mode(True)
    # fresh leaf copies of both inputs for the cell-mode pass
    x = torch.tensor(x.detach().numpy() + 0.)
    x.requires_grad = True
    y = torch.tensor(y.detach().numpy() + 0.)
    y.requires_grad = True
    print("y size: ", y.size())
    zs = []
    for i in range(y.size(1)):
        z = mc(x, y[:, i].unsqueeze(1))
        print(z.size())
        zs.append(z)
    z = torch.cat(zs, 1)
    z[:, -1].norm(1).backward()
    print(x.grad.norm(1, 2))
    print(y.grad.norm(1, 2))
    print(z.norm(1, 2), zref.norm(1, 2))
    print((z - zref).norm())
    self.assertTrue(np.allclose(z.detach().numpy(), zref.detach().numpy(), atol=1e-6))
    print((x.grad - xgrad).norm(1))
    self.assertTrue(np.allclose(ygrad.detach().numpy(), y.grad.detach().numpy(), atol=1e-6))
    self.assertTrue(np.allclose(xgrad.detach().numpy(), x.grad.detach().numpy(), atol=1e-5))
def test_it_relpos_out_of_horizon(self):
    # same horizon check as above, at the level of a single decoder block
    seqlen = 7
    horizon = 7
    x = torch.randn(3, seqlen, 12)
    x.requires_grad = True
    m = TransformerDecoderBlock(12, numheads=4, bidir=False, noctx=True, relpos=False)
    mc = q.deep_copy(m)
    mc.set_cell_mode(True)
    ys = []
    allys = []
    for i in range(horizon, seqlen + 1):
        y = m(x[:, i - horizon:i])
        allys.append(y)
        ys.append(y[:, -1].unsqueeze(1))
    ys_ref = torch.cat(ys, 1)
    print(ys_ref.size())
    ys_ref.norm(1).backward()
    allys = allys[0]
    xgrad = x.grad
    print(ys_ref.norm(1, 2))
    # print(xgrad.norm(1, 2))
    # fresh leaf copy of x for the cell-mode pass
    x = torch.tensor(x.detach().numpy() + 0.)
    x.requires_grad = True
    ys = []
    for i in range(seqlen):
        ys.append(mc(x[:, i].unsqueeze(1)))
    ys = torch.cat(ys, 1)
    print(ys.size())
    _allys = ys[:, :horizon]
    ys = ys[:, -(seqlen - horizon + 1):]
    print(ys.norm(1, 2))
    ys.norm(1).backward()
    xsgrad = x.grad
    # assumed completion: mirrors the closing comparisons of the
    # decoder-level test_it_relpos_out_of_horizon above
    print(xsgrad.norm(1, 2))
    self.assertTrue(np.allclose(ys_ref.detach().numpy(), ys.detach().numpy(), atol=1e-5))
    self.assertTrue(np.allclose(xgrad.detach().numpy(), xsgrad.detach().numpy(), atol=1e-5))
def test_it_relpos(self):
    # relative-position attention: cell mode vs. whole-sequence forward
    x = torch.randn(4, 5, 12)
    x.requires_grad = True
    numheads = 6
    m = MultiHeadAttention(12, numheads=numheads, bidir=False, relpos=True)
    mc = q.deep_copy(m)
    mc.set_cell_mode(True)
    ys = []
    for i in range(x.size(1)):
        y = mc(x[:, i].unsqueeze(1))
        print(y.size())
        ys.append(y)
    ys = torch.cat(ys, 1)
    l = ys.sum()
    l.backward()
    xgrad = x.grad
    print(xgrad.norm(1, 2))
    m.zero_grad()
    mc.zero_grad()
    x = torch.tensor(x.detach().numpy() + 0.)
    x.requires_grad = True
    ys_ref = m(x)
    l = ys_ref.sum()
    l.backward()
    print(x.grad.norm(1, 2))
    print("norm of out diff")
    print((ys - ys_ref).norm())
    print("norm of grad diff")
    print((xgrad - x.grad).norm())
    self.assertTrue(np.allclose(ys.detach().numpy(), ys_ref.detach().numpy(), atol=1e-6))
    self.assertTrue(np.allclose(xgrad.detach().numpy(), x.grad.detach().numpy(), atol=1e-6))
def test_it_window(self):
    # gradient of a single output position must be identical between
    # cell mode and the whole-sequence forward
    x = torch.randn(4, 5, 12)
    x.requires_grad = True
    numheads = 6
    m = MultiHeadAttention(12, numheads=numheads, bidir=False)
    mc = q.deep_copy(m)
    mc.set_cell_mode(True)
    ys = []
    for i in range(x.size(1)):
        y = mc(x[:, i].unsqueeze(1))
        print(y.size())
        ys.append(y)
        l = y.sum()
        l.backward(retain_graph=True)
    # TODO: check that outside window, grad on x is zero (see sketch below)
    x.grad = None
    ys = torch.cat(ys, 1)
    l = ys[:, 2].sum()
    l.backward()
    xgrad = x.grad
    print(xgrad.norm(1, 2))
    m.zero_grad()
    mc.zero_grad()
    x = torch.tensor(x.detach().numpy() + 0.)
    x.requires_grad = True
    ys_ref = m(x)
    l = ys_ref[:, 2].sum()
    l.backward()
    print(x.grad.norm(1, 2))
    self.assertTrue(np.allclose(xgrad.detach().numpy(), x.grad.detach().numpy()))
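
# Hedged sketch (an addition, not part of the original suite) for the TODO in
# test_it_window above: with bidir=False the attention is causal, so the
# output at position t must receive no gradient from inputs at positions > t.
# The helper name and its causality-only scope are assumptions; a truly
# windowed variant would also have to check positions further back than the
# window size.
def check_no_grad_outside_window(self):
    x = torch.randn(4, 5, 12)
    x.requires_grad = True
    m = MultiHeadAttention(12, numheads=6, bidir=False)
    y = m(x)
    y[:, 2].sum().backward()
    # under a causal mask, inputs after t=2 must get exactly zero gradient
    self.assertTrue(np.allclose(x.grad[:, 3:].detach().numpy(), 0.))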
def tst_it_relpos_out_of_horizon_debug(self):
    # debug helper: deliberately named "tst_" instead of "test_" so the
    # test runner does not collect it
    seqlen = 4
    horizon = 3
    x = torch.randn(3, seqlen, 12)
    x.requires_grad = True
    m = MultiHeadAttention(12, numheads=4, bidir=False)
    mc = q.deep_copy(m)
    mc.set_cell_mode(True, horizon=horizon)
    y, k = m(x[:, 1:])
    y.norm(1).backward()
    xgrad = x.grad
    # print(ys_ref.norm(1, 2))
    # print(xgrad.norm(1, 2))
    x = torch.tensor(x.detach().numpy() + 0.)
    x.requires_grad = True
    ys = []
    ks = []
    for i in range(x.size(1)):
        ys_, k_ = mc(x[:, i].unsqueeze(1))
        ys.append(ys_)
        ks.append(k_)
    ys = torch.cat(ys, 1)
    # print(ys.norm(1, 2))
    print(y.norm(1, 2))
    print(ys.norm(1, 2))
    print(k.size())
    print(k.norm(1, 2))
    print(ks[-1].size())
    print(ks[-1].norm(1, 2))
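# Possible follow-up (an assumption, mirroring the out-of-horizon tests
# above): once the cell has consumed a full horizon of inputs, its newest
# output should match the last step of the windowed full pass, e.g.:
#   self.assertTrue(np.allclose(ys[:, -1].detach().numpy(),
#                               y[:, -1].detach().numpy(), atol=1e-5))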