Example #1
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence to sequence."""
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])

    net.apply(xavier_init_weights)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    net.train()
    animator = d2l.Animator(xlabel='epoch',
                            ylabel='loss',
                            xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
        first = True
        for i, batch in enumerate(data_iter):
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)

            # Exercise 4: in training, replace teacher forcing with feeding the
            # prediction at the previous time step into the decoder. How does
            # this influence the performance?

            if first:
                # Only the first batch of each epoch uses teacher forcing
                dec_input = d2l.concat([bos, Y[:, :-1]], 1)  # Teacher forcing
                first = False
            else:
                # Reuse the previous batch's predictions (argmax over the
                # vocabulary) as decoder input, truncated to the current batch size
                dec_input = Y_hat.argmax(dim=2)
                dec_input = dec_input[:X.shape[0]]

            Y_hat, _ = net(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2l.grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            animator.add(epoch + 1, (metric[0] / metric[1], ))
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
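
All four examples assume that torch, torch.nn (imported as nn) and the d2l.torch module (imported as d2l) are available; Examples #3 and #4 additionally use os, time and project-specific helpers. Each example also relies on a MaskedSoftmaxCELoss defined elsewhere. For reference, a minimal sketch of that loss in the style of the d2l book, together with the sequence_mask helper it assumes, could look like this:

import torch
from torch import nn

def sequence_mask(X, valid_len, value=0):
    """Zero out entries of `X` that lie beyond each sequence's valid length."""
    maxlen = X.size(1)
    mask = torch.arange(maxlen, dtype=torch.float32,
                        device=X.device)[None, :] < valid_len[:, None]
    X[~mask] = value
    return X

class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """Softmax cross-entropy loss that ignores padding tokens.

    pred:      (batch_size, num_steps, vocab_size)
    label:     (batch_size, num_steps)
    valid_len: (batch_size,)
    """
    def forward(self, pred, label, valid_len):
        weights = torch.ones_like(label)
        weights = sequence_mask(weights, valid_len)
        self.reduction = 'none'
        unweighted_loss = super().forward(pred.permute(0, 2, 1), label)
        # Average over time steps, keeping one loss value per sequence,
        # so that l.sum() in the training loops sums per-sequence losses
        weighted_loss = (unweighted_loss * weights).mean(dim=1)
        return weighted_loss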
Example #2
def train_s2s_ch9(model, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence to sequence (defined in Chapter 9)."""
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            torch.nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    torch.nn.init.xavier_uniform_(m._parameters[param])

    model.apply(xavier_init_weights)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    model.train()
    # animator = d2l.Animator(xlabel='epoch', ylabel='loss',
    #                         xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # Teacher forcing
            Y_hat, _ = model(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2l.grad_clipping(model, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        # if (epoch + 1) % 10 == 0:
        #     animator.add(epoch + 1, (metric[0] / metric[1],))
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
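
Examples #1 and #2 are variants of the training loop from the d2l book. A usage sketch, assuming the standard d2l.torch building blocks (load_data_nmt, Seq2SeqEncoder, Seq2SeqDecoder, EncoderDecoder) and purely illustrative hyperparameters, might look like this:

from d2l import torch as d2l

embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.1
batch_size, num_steps = 64, 10
lr, num_epochs, device = 0.005, 300, d2l.try_gpu()

# Machine-translation data iterator plus source/target vocabularies
train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)
encoder = d2l.Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens,
                             num_layers, dropout)
decoder = d2l.Seq2SeqDecoder(len(tgt_vocab), embed_size, num_hiddens,
                             num_layers, dropout)
net = d2l.EncoderDecoder(encoder, decoder)

train_s2s_ch9(net, train_iter, lr, num_epochs, tgt_vocab, device)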
Example #3
def train(model,
          training_batches,
          lr,
          vocab,
          device,
          model_save_dir,
          model_save_file=None):
    print("Training...")

    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            torch.nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    torch.nn.init.xavier_uniform_(m._parameters[param])

    model.apply(xavier_init_weights)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()

    start_epoch = 0

    if model_save_file and os.path.exists(model_save_file):
        checkpoint = torch.load(model_save_file)
        start_epoch = checkpoint['epoch']
        model.encoder.load_state_dict(checkpoint['en'])
        model.decoder.load_state_dict(checkpoint['de'])
        optimizer.load_state_dict(checkpoint['opt'])
    model.train()
    model.to(device)
    start = time.time()

    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)

    for epoch in range(start_epoch, len(training_batches)):
        optimizer.zero_grad()
        batch = training_batches[epoch]
        X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
        bos = torch.tensor([vocab['<bos>']] * Y.shape[0],
                           device=device).reshape(-1, 1)
        dec_input = d2l.concat([bos, Y[:, :-1]], 1)  # Teacher forcing
        Y_hat, _ = model(X, dec_input, X_valid_len)
        l = loss(Y_hat, Y, Y_valid_len)
        l.sum().backward()  # Make the loss scalar for `backward`
        d2l.grad_clipping(model, 1)
        optimizer.step()
        print("Progress: %.2f%%  Total time: %.2f s" %
              ((epoch + 1) * 100 / len(training_batches), time.time() - start),
              end="\r")
        if (epoch + 1) % 100 == 0:
            torch.save(
                {
                    'epoch': epoch + 1,
                    'en': model.encoder.state_dict(),
                    'de': model.decoder.state_dict(),
                    'opt': optimizer.state_dict(),
                    'loss': loss,
                },
                os.path.join(model_save_dir,
                             '{}_{}.tar'.format(epoch + 1, MODEL_FILE_NAME)))
    print(model)
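
Every example clips gradients with d2l.grad_clipping(model, 1) right after the backward pass. For reference, gradient clipping by global L2 norm, as implemented in the d2l book, is roughly the following sketch:

import torch
from torch import nn

def grad_clipping(net, theta):
    """Rescale gradients in place so that their global L2 norm is at most `theta`."""
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm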
Example #4
def train(resume_training=True):

    EMBEDDING_SIZE = 32
    num_hiddens, num_layers, dropout, batch_size, num_steps = EMBEDDING_SIZE, 2, 0.1, 64, 10
    lr, num_epochs, device = 0.005, 1000, d2lt.try_gpu()
    ffn_num_input, ffn_num_hiddens, num_heads = EMBEDDING_SIZE, 64, 4
    key_size, query_size, value_size = EMBEDDING_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE
    norm_shape = [EMBEDDING_SIZE]

    ### Load data
    data_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size, num_steps)
    encoder = TransformerEncoder(len(src_vocab), key_size, query_size,
                                 value_size, num_hiddens, norm_shape,
                                 ffn_num_input, ffn_num_hiddens, num_heads,
                                 num_layers, dropout)
    decoder = TransformerDecoder(len(tgt_vocab), key_size, query_size,
                                 value_size, num_hiddens, norm_shape,
                                 ffn_num_input, ffn_num_hiddens, num_heads,
                                 num_layers, dropout)

    ### Load model
    model = EncoderDecoder(encoder, decoder).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    ### Load checkpoint
    if (resume_training and PATH_MODEL.exists()
            and os.path.getsize(PATH_MODEL) > 0):
        model, optimizer, last_epoch = load_checkpoint(model, optimizer)
        print("Continue training from last checkpoint...")
    else:
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)
        with open(PATH_MODEL, 'w') as fp:
            pass
        print('No prior checkpoint found; created a new checkpoint file.')
        model.apply(xavier_init_weights)
        last_epoch = 0

    # model.apply(xavier_init_weights)
    # model.to(device)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    ### Initialize Loss functions
    loss = MaskedSoftmaxCELoss()

    ### Train
    model.train()
    # animator = d2lt.Animator(xlabel='epoch', ylabel='loss',
    # xlim=[10, num_epochs])
    for epoch in range(last_epoch, num_epochs):
        timer = d2lt.Timer()
        metric = d2lt.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # Teacher forcing
            Y_hat, _ = model(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2lt.grad_clipping(model, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            # animator.add(epoch + 1, (metric[0] / metric[1],))
            print(f'epoch {epoch + 1} - loss {metric[0] / metric[1]:.5f}')

        ### Save checkpoint
        save_checkpoint(epoch, model, optimizer)
    print(f'loss {metric[0] / metric[1]:.5f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
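
Example #4 calls model.apply(xavier_init_weights) without showing the initializer; presumably it is the same helper defined inside Examples #1-#3, lifted to module level, roughly:

from torch import nn

def xavier_init_weights(m):
    """Xavier-initialize Linear weights and the weight matrices of GRU layers."""
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
    if type(m) == nn.GRU:
        for param in m._flat_weights_names:
            if "weight" in param:
                nn.init.xavier_uniform_(m._parameters[param])

The remaining names in Example #4 (PATH_MODEL, checkpoint_path, load_checkpoint, save_checkpoint, TransformerEncoder, TransformerDecoder, load_data_nmt) are project-specific helpers defined outside this snippet.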