Example no. 1
    def test_it(self):
        # cell mode, fed one step at a time, must reproduce the full-sequence
        # outputs and input gradients of the same (deep-copied) decoder
        x = torch.randn(3, 4, 12)
        x.requires_grad = True

        m = TransformerDecoder(12, numheads=4, numlayers=2, noctx=True)
        mc = q.deep_copy(m)
        mc.set_cell_mode(True)

        y = m(x)
        y.norm(1).backward()

        xgrad = x.grad
        print(y.norm(1, 2))
        print(xgrad.norm(1, 2))

        # fresh copy of x (same values, new leaf tensor) so the two backward
        # passes accumulate their gradients separately
        x = torch.tensor(x.detach().numpy() + 0.)
        x.requires_grad = True

        # feed the sequence one time step at a time through the cell-mode copy
        ys = []
        for i in range(x.size(1)):
            ys.append(mc(x[:, i].unsqueeze(1)))
        ys = torch.cat(ys, 1)
        print(ys.norm(1, 2))
        ys.norm(1).backward()

        xsgrad = x.grad
        print(xsgrad.norm(1, 2))
        self.assertTrue(
            np.allclose(y.detach().numpy(), ys.detach().numpy(), atol=1e-5))
        self.assertTrue(
            np.allclose(xgrad.detach().numpy(),
                        xsgrad.detach().numpy(),
                        atol=1e-5))
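
The pattern above generalizes: run the module once over the whole sequence, run a deep copy of it one step at a time in cell mode, and assert that outputs and input gradients agree. Below is a minimal, self-contained sketch of the same check, using a plain nn.GRU as a stand-in for the qelos TransformerDecoder (the TransformerDecoder, q.deep_copy and set_cell_mode API above come from the surrounding test module and are not reproduced here):

    import numpy as np
    import torch
    import torch.nn as nn

    def check_stepwise_matches_full():
        x = torch.randn(3, 4, 12, requires_grad=True)
        m = nn.GRU(12, 12, batch_first=True)

        # full-sequence pass
        y_full, _ = m(x)
        y_full.norm(1).backward()
        xgrad_full = x.grad

        # fresh copy of the input so the second backward accumulates separately
        xs = x.detach().clone().requires_grad_(True)

        # step-by-step pass, carrying the recurrent state manually
        h, ys = None, []
        for i in range(xs.size(1)):
            y_i, h = m(xs[:, i].unsqueeze(1), h)
            ys.append(y_i)
        ys = torch.cat(ys, 1)
        ys.norm(1).backward()

        assert np.allclose(y_full.detach().numpy(), ys.detach().numpy(), atol=1e-5)
        assert np.allclose(xgrad_full.detach().numpy(), xs.grad.detach().numpy(), atol=1e-5)

    check_stepwise_matches_full()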
Example no. 2
    def test_it_relpos_out_of_horizon(self):
        # reference: slide a window of length `horizon` over the sequence and
        # keep the last output of each window; cell mode must match this
        seqlen = 7  #10
        horizon = 7
        x = torch.randn(3, seqlen, 12)
        x.requires_grad = True

        m = TransformerDecoder(12,
                               numheads=4,
                               numlayers=2,
                               noctx=True,
                               relpos=False)
        mc = q.deep_copy(m)
        mc.set_cell_mode(True)

        ys = []
        allys = []
        for i in range(horizon, seqlen + 1):
            y = m(x[:, i - horizon:i])
            allys.append(y)
            ys.append(y[:, -1].unsqueeze(1))
        ys_ref = torch.cat(ys, 1)
        print(ys_ref.size())
        ys_ref.norm(1).backward()
        allys = allys[0]  # keep the first window's full outputs for inspection below

        xgrad = x.grad
        print(ys_ref.norm(1, 2))
        print(xgrad.norm(1, 2))

        # fresh copy of x so the cell-mode pass gets its own gradient
        x = torch.tensor(x.detach().numpy() + 0.)
        x.requires_grad = True

        ys = []
        for i in range(seqlen):
            ys.append(mc(x[:, i].unsqueeze(1)))
        ys = torch.cat(ys, 1)
        print(ys.size())
        _allys = ys[:, :horizon]
        # the windowed reference produced outputs for positions horizon-1 .. seqlen-1,
        # i.e. seqlen - horizon + 1 steps; keep the matching tail of the cell outputs
        ys = ys[:, -(seqlen - horizon + 1):]
        print(ys.norm(1, 2))
        ys.norm(1).backward()
        xsgrad = x.grad
        print(xsgrad.norm(1, 2))

        print("ALL YS")
        print(allys.norm(1, 2))
        print("___")
        print(_allys.norm(1, 2))

        self.assertTrue(
            np.allclose(ys_ref.detach().numpy(),
                        ys.detach().numpy(),
                        atol=1e-5))
        self.assertTrue(
            np.allclose(xgrad.detach().numpy(),
                        xsgrad.detach().numpy(),
                        atol=1e-5))
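
The slicing on the cell-mode side is the subtle part: the windowed reference loop emits one output per window end, i.e. for positions horizon-1 through seqlen-1, which is seqlen - horizon + 1 steps, and `ys[:, -(seqlen - horizon + 1):]` keeps exactly those positions of the cell output. A quick standalone check of that index arithmetic:

    for seqlen, horizon in [(7, 7), (10, 7)]:
        ref_positions = [i - 1 for i in range(horizon, seqlen + 1)]       # window ends
        cell_positions = list(range(seqlen))[-(seqlen - horizon + 1):]    # kept by the slice
        assert ref_positions == cell_positions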
Example no. 3
    def test_it(self):
        # seq2seq variant: decode the target one step at a time in cell mode
        # and compare outputs and gradients against the full-sequence pass
        x = torch.randn(4, 5, 12)
        y = torch.randn(4, 5, 12)
        x.requires_grad = True
        y.requires_grad = True
        numheads = 6
        m = TS2S_arg(dim=12, numlayers=2, numheads=numheads)
        z = m(x, y)
        print(z.size())
        z[:, -1].norm(1).backward()
        xgrad = x.grad
        ygrad = y.grad
        print(xgrad.norm(1, 2))
        print(ygrad.norm(1, 2))
        zref = z

        mc = q.deep_copy(m)
        mc.set_cell_mode(True)

        # fresh copies of x and y so the cell-mode backward accumulates separately
        x = torch.tensor(x.detach().numpy() + 0.)
        x.requires_grad = True
        y = torch.tensor(y.detach().numpy() + 0.)
        y.requires_grad = True
        print("y size: ", y.size())

        zs = []
        for i in range(y.size(1)):
            z = mc(x, y[:, i].unsqueeze(1))
            print(z.size())
            zs.append(z)

        z = torch.cat(zs, 1)
        z[:, -1].norm(1).backward()
        print(x.grad.norm(1, 2))
        print(y.grad.norm(1, 2))

        print(z.norm(1, 2), zref.norm(1, 2))
        print((z - zref).norm())
        self.assertTrue(
            np.allclose(z.detach().numpy(), zref.detach().numpy(), atol=1e-6))

        print((x.grad - xgrad).norm(1))
        self.assertTrue(
            np.allclose(ygrad.detach().numpy(),
                        y.grad.detach().numpy(),
                        atol=1e-6))
        self.assertTrue(
            np.allclose(xgrad.detach().numpy(),
                        x.grad.detach().numpy(),
                        atol=1e-5))
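
These tests copy inputs via a numpy round-trip (`torch.tensor(x.detach().numpy() + 0.)`). For reference, a sketch of the equivalent, more direct idiom in current PyTorch:

    import torch

    x = torch.randn(4, 5, 12, requires_grad=True)

    # numpy round-trip, as used in the tests above
    x_copy_a = torch.tensor(x.detach().numpy() + 0.)
    x_copy_a.requires_grad = True

    # direct equivalent: same values, new leaf tensor with its own grad
    x_copy_b = x.detach().clone().requires_grad_(True)

    assert torch.equal(x_copy_a, x_copy_b)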
Example no. 4
    def test_it_relpos_out_of_horizon(self):
        # same windowed-reference check as above, for a single TransformerDecoderBlock
        seqlen = 7
        horizon = 7
        x = torch.randn(3, seqlen, 12)
        x.requires_grad = True

        m = TransformerDecoderBlock(12,
                                    numheads=4,
                                    bidir=False,
                                    noctx=True,
                                    relpos=False)
        mc = q.deep_copy(m)
        mc.set_cell_mode(True)

        ys = []
        allys = []
        for i in range(horizon, seqlen + 1):
            y = m(x[:, i - horizon:i])
            allys.append(y)
            ys.append(y[:, -1].unsqueeze(1))
        ys_ref = torch.cat(ys, 1)
        print(ys_ref.size())
        ys_ref.norm(1).backward()
        allys = allys[0]

        xgrad = x.grad
        print(ys_ref.norm(1, 2))
        # print(xgrad.norm(1, 2))

        # fresh copy of x for the cell-mode pass
        x = torch.tensor(x.detach().numpy() + 0.)
        x.requires_grad = True

        ys = []
        for i in range(seqlen):
            ys.append(mc(x[:, i].unsqueeze(1)))
        ys = torch.cat(ys, 1)
        print(ys.size())
        _allys = ys[:, :horizon]
        ys = ys[:, -(seqlen - horizon + 1):]
        print(ys.norm(1, 2))
        ys.norm(1).backward()
        xsgrad = x.grad
        print(xsgrad.norm(1, 2))

        self.assertTrue(
            np.allclose(ys_ref.detach().numpy(),
                        ys.detach().numpy(),
                        atol=1e-5))
        self.assertTrue(
            np.allclose(xgrad.detach().numpy(),
                        xsgrad.detach().numpy(),
                        atol=1e-5))
Example no. 5
    def test_it_relpos(self):
        # with relative positions enabled, cell mode must still match the full pass
        x = torch.randn(4, 5, 12)
        x.requires_grad = True
        numheads = 6
        m = MultiHeadAttention(12, numheads=numheads, bidir=False, relpos=True)
        mc = q.deep_copy(m)
        mc.set_cell_mode(True)

        ys = []
        for i in range(x.size(1)):
            y = mc(x[:, i].unsqueeze(1))
            print(y.size())
            ys.append(y)

        ys = torch.cat(ys, 1)
        l = ys.sum()
        l.backward()
        xgrad = x.grad
        print(xgrad.norm(1, 2))
        # clear parameter grads accumulated by the cell-mode backward
        m.zero_grad()
        mc.zero_grad()

        x = torch.tensor(x.detach().numpy() + 0.)
        x.requires_grad = True
        ys_ref = m(x)
        l = ys_ref.sum()
        l.backward()
        print(x.grad.norm(1, 2))

        print("norm of out diff")
        print((ys - ys_ref).norm())
        print("norm of grad diff")
        print((xgrad - x.grad).norm())

        self.assertTrue(
            np.allclose(ys.detach().numpy(),
                        ys_ref.detach().numpy(),
                        atol=1e-6))
        self.assertTrue(
            np.allclose(xgrad.detach().numpy(),
                        x.grad.detach().numpy(),
                        atol=1e-6))
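
The m.zero_grad() / mc.zero_grad() calls between the two passes matter because backward() accumulates into the parameters' .grad fields; without them, any later check on parameter gradients would see the sum of both passes. A minimal illustration, independent of the modules above:

    import torch
    import torch.nn as nn

    lin = nn.Linear(3, 3)
    x = torch.randn(2, 3)

    lin(x).sum().backward()
    g1 = lin.weight.grad.clone()
    lin(x).sum().backward()                 # no zero_grad: grads accumulate
    assert torch.allclose(lin.weight.grad, 2 * g1)

    lin.zero_grad()
    lin(x).sum().backward()                 # after zero_grad: fresh grads
    assert torch.allclose(lin.weight.grad, g1)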
Example no. 6
    def test_it_window(self):
        # unidirectional attention: the grad of a single output step should only
        # reach the input positions the (causal/windowed) attention can see
        x = torch.randn(4, 5, 12)
        x.requires_grad = True
        numheads = 6
        m = MultiHeadAttention(12, numheads=numheads, bidir=False)
        mc = q.deep_copy(m)
        mc.set_cell_mode(True)

        ys = []
        for i in range(x.size(1)):
            y = mc(x[:, i].unsqueeze(1))
            print(y.size())
            ys.append(y)

        # throwaway backward through the last step, then reset the input grad
        # so the real comparison below starts clean
        l = y.sum()
        l.backward(retain_graph=True)
        # TODO: check that outside window, grad on x is zero
        # (a generic way to do that is sketched after this example)
        x.grad = None

        ys = torch.cat(ys, 1)
        l = ys[:, 2].sum()
        l.backward()
        xgrad = x.grad
        print(xgrad.norm(1, 2))

        m.zero_grad()
        mc.zero_grad()

        x = torch.tensor(x.detach().numpy() + 0.)
        x.requires_grad = True
        ys_ref = m(x)
        l = ys_ref[:, 2].sum()
        l.backward()
        print(x.grad.norm(1, 2))

        self.assertTrue(
            np.allclose(xgrad.detach().numpy(),
                        x.grad.detach().numpy()))
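
The TODO in this test (outside the attention window, the grad on x should be zero) can be checked generically by backpropagating a single output step and looking at per-position gradient mass. Here is a minimal sketch with torch.cumsum as a trivially causal stand-in for the attention module; for a real window of width w one would additionally assert that positions before t - w + 1 receive zero grad:

    import torch

    x = torch.randn(4, 5, 12, requires_grad=True)
    y = torch.cumsum(x, dim=1)          # causal stand-in: y[:, t] sees only x[:, :t+1]
    y[:, 2].sum().backward()

    per_pos = x.grad.abs().sum(dim=(0, 2))   # grad mass per time step
    assert per_pos[3:].eq(0).all()           # nothing flows to future positions
    assert per_pos[:3].gt(0).all()           # visible positions all receive grad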
Example no. 7
    def tst_it_relpos_out_of_horizon_debug(self):
        # named "tst_" rather than "test_", presumably so unittest skips this debug helper
        seqlen = 4
        horizon = 3
        x = torch.randn(3, seqlen, 12)
        x.requires_grad = True

        m = MultiHeadAttention(12, numheads=4, bidir=False)
        mc = q.deep_copy(m)
        mc.set_cell_mode(True, horizon=horizon)

        y, k = m(x[:, 1:])
        y.norm(1).backward()

        xgrad = x.grad
        # print(ys_ref.norm(1, 2))
        # print(xgrad.norm(1, 2))

        x = torch.tensor(x.detach().numpy() + 0.)
        x.requires_grad = True

        ys = []
        ks = []
        for i in range(x.size(1)):
            ys_, k_ = mc(x[:, i].unsqueeze(1))
            ys.append(ys_)
            ks.append(k_)
        ys = torch.cat(ys, 1)
        # print(ys.norm(1, 2))

        print(y.norm(1, 2))
        print(ys.norm(1, 2))

        print(k.size())
        print(k.norm(1, 2))
        print(ks[-1].size())
        print(ks[-1].norm(1, 2))
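
What the debug prints poke at is the horizon-limited cache: with horizon=3 and seqlen=4, the cell should only retain keys for the last 3 steps, which is why the full pass is run on x[:, 1:]. Below is a hypothetical sketch of such a bounded key cache; the actual qelos internals are not shown in this snippet, so this only illustrates the mechanism:

    import torch
    from collections import deque

    class BoundedKeyCache:
        # keep at most `horizon` past steps of keys (sketch of the mechanism
        # the horizon-limited cell mode appears to exercise)
        def __init__(self, horizon):
            self.steps = deque(maxlen=horizon)

        def append(self, k_step):       # k_step: (batch, 1, dim)
            self.steps.append(k_step)

        def keys(self):                 # (batch, <=horizon, dim)
            return torch.cat(list(self.steps), 1)

    cache = BoundedKeyCache(horizon=3)
    for t in range(4):
        cache.append(torch.full((3, 1, 12), float(t)))
    k = cache.keys()
    assert k.size(1) == 3               # only the last `horizon` steps survive
    assert k[:, 0, 0].eq(1.0).all()     # step 0 was evicted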