Example #1
import torch
from torch.autograd import Variable
from torch.nn import LSTM

# `ChannelManager` is an external helper this example relies on; it is not
# defined in this excerpt.

def main():
    # lstm=LayeredLSTM()
    lstm = LSTM(input_size=47764,
                hidden_size=512,
                num_layers=8,
                batch_first=True)

    # This has no new-sequence reset.
    # I wonder whether the gradient information will grow indefinitely.

    # Even so, I think detaching at the beginning of each new sequence is an arbitrary decision.
    optim = torch.optim.Adam(lstm.parameters())
    lstm.cuda()
    criterion = torch.nn.SmoothL1Loss()

    cm = ChannelManager()
    cm.add_channels(2)
    cm.cat_call("init_states")

    for i in range(2000):
        print(i)
        optim.zero_grad()
        target = Variable(torch.rand(2, 1, 512)).cuda()
        output, states = lstm(cm.cat_call("get_input"),
                              cm.cat_call("get_states", 1))
        cm.distribute_call("push_states", states, dim=1)
        loss = criterion(output, target)
        loss.backward()
        optim.step()

        if i % 3 == 0:
            cm[0].new_sequence_reset()

        if i % 5 == 0:
            cm[1].new_sequence_reset()
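
The comments above ask whether the autograd graph keeps growing when the recurrent states are never detached. `ChannelManager` and its `new_sequence_reset` method are not defined in this excerpt; the sketch below is only a guess at what a single channel might do, assuming a reset simply detaches the cached `(h, c)` pair so backpropagation through time is truncated at sequence boundaries.

import torch

class Channel:
    """Hypothetical single channel: caches the latest LSTM state and can
    detach it at a sequence boundary (truncated BPTT)."""

    def __init__(self, num_layers=8, batch=1, hidden=512, device="cuda"):
        self.h = torch.zeros(num_layers, batch, hidden, device=device)
        self.c = torch.zeros(num_layers, batch, hidden, device=device)

    def push_states(self, states):
        # Remember the most recent (h, c) returned by the LSTM.
        self.h, self.c = states

    def get_states(self):
        return self.h, self.c

    def new_sequence_reset(self):
        # detach() cuts the graph: earlier timesteps are no longer reachable
        # from future backward passes, so memory stops accumulating.
        self.h = self.h.detach()
        self.c = self.c.detach()

Without such a detach, a second loss.backward() would try to walk back through graph segments the first backward already freed, which normally raises the "backward through the graph a second time" error unless retain_graph=True is passed.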
Example #2
# Uses the same imports as Example #1; `sd` is an external helper assumed to
# detach the saved LSTM states in place (see the sketch after this example).
def main0():
    # lstm=LayeredLSTM()
    lstm = LSTM(input_size=47764,
                hidden_size=128,
                num_layers=8,
                batch_first=True)

    # This has no new-sequence reset.
    # I wonder whether the gradient information will grow indefinitely.

    # Even so, I think detaching at the beginning of each new sequence is an arbitrary decision.
    optim = torch.optim.Adam(lstm.parameters())
    lstm.cuda()
    h0 = Variable(torch.rand(8, 2, 128)).cuda()
    c0 = Variable(torch.rand(8, 2, 128)).cuda()
    states = (h0, c0)
    savedetach = [states]

    sd(savedetach)

    for m in range(1000):
        for _ in range(10):
            print(_)
            optim.zero_grad()
            input = Variable(torch.rand(2, 1, 47764)).cuda()
            target = Variable(torch.rand(2, 1, 128)).cuda()
            output, states = lstm(input, savedetach[-1])

            savedetach.append(states)
            sd(savedetach)
            criterion = torch.nn.SmoothL1Loss()
            loss = criterion(output, target)
            loss.backward()
            optim.step()

        # Start a fresh sequence: drop the saved states and re-initialize them
        # with the same (num_layers, batch, hidden) shape used above.
        savedetach.clear()
        h0 = Variable(torch.rand(8, 2, 128)).cuda()
        c0 = Variable(torch.rand(8, 2, 128)).cuda()
        states = (h0, c0)
        savedetach.append(states)
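
The helper `sd` is not shown here. From the way it is called right after fresh states are appended, it presumably detaches the most recently saved `(h, c)` pair in place, so each `loss.backward()` only reaches back to the last save point. A minimal sketch under that assumption:

def sd(saved_states):
    # Detach the newest saved (h, c) tuple in place so the autograd graph
    # does not grow across iterations (truncated backpropagation through time).
    if saved_states:
        h, c = saved_states[-1]
        saved_states[-1] = (h.detach(), c.detach())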
Example #3
import unittest

import torch
import torch.nn as nn
from torch.nn import LSTM
from torch.testing import assert_allclose

# DPLSTM ships with Opacus; the exact import path may differ between versions.
from opacus.layers import DPLSTM


class SimpleDPLSTMTest(unittest.TestCase):
    def setUp(self):
        self.SEQ_LENGTH = 20
        self.INPUT_DIM = 25
        self.MINIBATCH_SIZE = 30
        self.LSTM_OUT_DIM = 12
        self.NUM_LAYERS = 1
        self.bidirectional = False
        self.batch_first = False

        self.num_directions = 2 if self.bidirectional else 1
        self.h_init = torch.randn(
            self.NUM_LAYERS * self.num_directions,
            self.MINIBATCH_SIZE,
            self.LSTM_OUT_DIM,
        )
        self.c_init = torch.randn(
            self.NUM_LAYERS * self.num_directions,
            self.MINIBATCH_SIZE,
            self.LSTM_OUT_DIM,
        )

        self.original_lstm = LSTM(
            self.INPUT_DIM,
            self.LSTM_OUT_DIM,
            batch_first=self.batch_first,
            num_layers=self.NUM_LAYERS,
            bidirectional=self.bidirectional,
        )
        self.dp_lstm = DPLSTM(
            self.INPUT_DIM,
            self.LSTM_OUT_DIM,
            batch_first=self.batch_first,
            num_layers=self.NUM_LAYERS,
            bidirectional=self.bidirectional,
        )

        self.dp_lstm.load_state_dict(self.original_lstm.state_dict())

    def _reset_seeds(self):
        torch.manual_seed(1337)
        torch.cuda.manual_seed(1337)

    def test_lstm_forward(self):
        x = (
            torch.randn(self.MINIBATCH_SIZE, self.SEQ_LENGTH, self.INPUT_DIM)
            if self.batch_first
            else torch.randn(self.SEQ_LENGTH, self.MINIBATCH_SIZE, self.INPUT_DIM)
        )
        hidden = (self.h_init, self.c_init)

        out, (hn, cn) = self.original_lstm(x, hidden)
        dp_out, (dp_hn, dp_cn) = self.dp_lstm(x, hidden)

        outputs_to_test = [
            (out, dp_out, "LSTM and DPLSTM output"),
            (hn, dp_hn, "LSTM and DPLSTM state `h`"),
            (cn, dp_cn, "LSTM and DPLSTM state `c`"),
        ]

        for output, dp_output, message in outputs_to_test:
            assert_allclose(
                actual=dp_output.expand_as(output),
                expected=output,
                atol=10e-6,
                rtol=10e-5,
                msg=f"Tensor value mismatch between {message}",
            )

    def test_lstm_backward(self):
        x = (
            torch.randn(self.MINIBATCH_SIZE, self.SEQ_LENGTH, self.INPUT_DIM)
            if self.batch_first
            else torch.randn(self.SEQ_LENGTH, self.MINIBATCH_SIZE, self.INPUT_DIM)
        )
        criterion = nn.MSELoss()

        hidden = (self.h_init, self.c_init)

        out, (hn, cn) = self.original_lstm(x, hidden)
        y = torch.zeros_like(out)
        loss = criterion(out, y)
        loss.backward()

        dp_out, (dp_hn, dp_cn) = self.dp_lstm(x, hidden)
        dp_loss = criterion(dp_out, y)
        dp_loss.backward()

        dp_lstm_params = dict(self.dp_lstm.named_parameters())
        for param_name, param in self.original_lstm.named_parameters():
            dp_param = dp_lstm_params[param_name]
            assert_allclose(
                actual=dp_param,
                expected=param,
                atol=10e-5,
                rtol=10e-3,
                msg=f"Tensor value mismatch in the parameter '{param_name}'",
            )
            assert_allclose(
                actual=dp_param.grad,
                expected=param.grad,
                atol=10e-6,
                rtol=10e-5,
                msg=f"Tensor value mismatch in the gradient of parameter '{param_name}'",
            )

    def test_lstm_param_update(self):
        x = (
            torch.randn(self.MINIBATCH_SIZE, self.SEQ_LENGTH, self.INPUT_DIM)
            if self.batch_first
            else torch.randn(self.SEQ_LENGTH, self.MINIBATCH_SIZE, self.INPUT_DIM)
        )
        criterion = nn.MSELoss()

        optimizer = torch.optim.SGD(self.original_lstm.parameters(), lr=0.5)
        dp_optimizer = torch.optim.SGD(self.dp_lstm.parameters(), lr=0.5)

        # Train original LSTM for one step
        logits, (h_n, c_n) = self.original_lstm(x)
        y = torch.zeros_like(logits)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        # Train DP LSTM for one step
        dp_logits, (dp_h_n, dp_c_n) = self.dp_lstm(x)
        dp_loss = criterion(dp_logits, y)
        dp_loss.backward()
        dp_optimizer.step()

        dp_lstm_params = dict(self.dp_lstm.named_parameters())
        for param_name, param in self.original_lstm.named_parameters():
            dp_param = dp_lstm_params[param_name]
            assert_allclose(
                actual=dp_param,
                expected=param,
                atol=10e-6,
                rtol=10e-5,
                msg=f"Tensor value mismatch in the parameter '{param_name}'",
            )
            assert_allclose(
                actual=dp_param.grad,
                expected=param.grad,
                atol=10e-6,
                rtol=10e-5,
                msg=f"Tensor value mismatch in the gradient of parameter '{param_name}'",
            )
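
The excerpt stops at the class definition. If the file is run directly, a standard unittest entry point (assumed here, not part of the excerpt) executes all three checks:

if __name__ == "__main__":
    unittest.main()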