def predict_snli(net, vocab, premise, hypothesis):
    net.eval()
    premise = torch.tensor(vocab[premise], device=d2l.try_gpu())
    hypothesis = torch.tensor(vocab[hypothesis], device=d2l.try_gpu())
    label = torch.argmax(net([premise.reshape((1, -1)),
                              hypothesis.reshape((1, -1))]), dim=1)
    return 'entailment' if label == 0 else 'contradiction' if label == 1 \
        else 'neutral'
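# Hedged usage sketch (assumes `net` and `vocab` come from the trained SNLI
# model this function accompanies); the d2l text uses a pair like this one:
# predict_snli(net, vocab, ['he', 'is', 'good', '.'], ['he', 'is', 'bad', '.'])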
def train(net_D, net_G, data_iter, num_epochs, lr, latent_dim,
          device=d2l.try_gpu()):
    loss = torch.nn.BCEWithLogitsLoss()
    # net_D.initialize(init=init.Normal(0.02), force_reinit=True, ctx=device)
    # net_G.initialize(init=init.Normal(0.02), force_reinit=True, ctx=device)
    net_D, net_G = net_D.to(device), net_G.to(device)
    trainer_hp = {'lr': lr, 'betas': [0.5, 0.999]}
    trainer_D = torch.optim.Adam(net_D.parameters(), **trainer_hp)
    trainer_G = torch.optim.Adam(net_G.parameters(), **trainer_hp)
    for epoch in range(1, num_epochs + 1):
        print('Epoch', epoch)
        # Train one epoch
        timer = d2l.Timer()
        metric = d2l.Accumulator(3)  # loss_D, loss_G, num_examples
        for X, _ in data_iter:
            print('Processing batch')
            batch_size = X.shape[0]
            Z = torch.normal(0, 1, size=(batch_size, latent_dim, 1, 1))
            X, Z = X.to(device), Z.to(device)
            metric.add(update_D(X, Z, net_D, net_G, loss, trainer_D),
                       update_G(Z, net_D, net_G, loss, trainer_G),
                       batch_size)
        # Show the losses
        loss_D, loss_G = metric[0] / metric[2], metric[1] / metric[2]
        print(f'loss_D {loss_D:.3f}, loss_G {loss_G:.3f}')
    print(f'loss_D {loss_D:.3f}, loss_G {loss_G:.3f}, '
          f'{metric[2] / timer.stop():.1f} examples/sec on {str(device)}')
def train(net, data_iter, lr, num_epochs, device=d2l.try_gpu()):
    def init_weights(m):
        if type(m) == nn.Embedding:
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[1, num_epochs])
    metric = d2l.Accumulator(2)  # sum of losses, no. of tokens
    for epoch in range(num_epochs):
        timer, num_batches = d2l.Timer(), len(data_iter)
        for i, batch in enumerate(data_iter):
            optimizer.zero_grad()
            center, context_negative, mask, label = [
                data.to(device) for data in batch]
            pred = skip_gram(center, context_negative, net[0], net[1])
            l = (loss(pred.reshape(label.shape).float(), label.float(), mask)
                 / mask.sum(axis=1) * mask.shape[1])
            l.sum().backward()
            optimizer.step()
            metric.add(l.sum(), l.numel())
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (metric[0] / metric[1],))
    print(f'loss {metric[0] / metric[1]:.3f}, '
          f'{metric[1] / timer.stop():.1f} tokens/sec on {str(device)}')
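# A hedged sketch of querying the trained embeddings, following the d2l
# `get_similar_tokens` recipe; it assumes the module-level `vocab` from the
# word2vec data pipeline, and the query token below is only illustrative.
def get_similar_tokens(query_token, k, embed):
    W = embed.weight.data
    x = W[vocab[query_token]]
    # Cosine similarity; add 1e-9 for numerical stability
    cos = torch.mv(W, x) / torch.sqrt(torch.sum(W * W, dim=1) *
                                      torch.sum(x * x) + 1e-9)
    topk = torch.topk(cos, k=k + 1)[1].cpu().numpy().astype('int32')
    for i in topk[1:]:  # Skip the query token itself
        print(f'cosine sim={float(cos[i]):.3f}: {vocab.to_tokens(int(i))}')

# get_similar_tokens('chip', 3, net[0])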
def init_run_gru(num_epochs, batch_size, num_steps, num_hiddens, lr):
    train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)
    vocab_size, device = len(vocab), d2l.try_gpu()
    num_inputs = vocab_size
    gru_layer = nn.GRU(num_inputs, num_hiddens)
    model = d2l.RNNModel(gru_layer, len(vocab))
    model = model.to(device)
    return train_ch8_slim(model, train_iter, vocab, lr, num_epochs, device)
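# Illustrative call (hyperparameter values are placeholders, not from the
# original; `train_ch8_slim` must be defined elsewhere):
# init_run_gru(num_epochs=500, batch_size=32, num_steps=35,
#              num_hiddens=256, lr=1)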
def train_model(net, train_iter, test_iter, num_epochs, lr,
                device=d2l.try_gpu()):
    """Train and evaluate a model with CPU or GPU."""
    # for idx, (X, y) in enumerate(train_iter):
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            torch.nn.init.xavier_uniform_(m.weight)  # Part 2.2
    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.BCELoss()
    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer = d2l.Timer()
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(3)  # train_loss, train_acc, num_examples
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            net.train()
            optimizer.zero_grad()
            X = X.float()
            X, y = X.to(device), y.to(device)
            output = net(X)
            # y_hat = torch.round(torch.exp(output) / (1 + torch.exp(output)))
            y_hat = torch.sigmoid(output)
            y = torch.unsqueeze(y.to(torch.float32), 1)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_loss = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % 50 == 0:
                animator.add(epoch + i / len(train_iter),
                             (train_loss, train_acc, None))
                print("BatchNo.=%3i, Epoch No.=%3i, loss=%.3f, train acc=%.3f"
                      % (i + 1, epoch + 1, train_loss, train_acc))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        print("test_acc=", test_acc)
        animator.add(epoch + 1, (None, None, test_acc))
    print('loss %.3f, train acc %.3f, test acc %.3f'
          % (train_loss, train_acc, test_acc))
    print('%.1f examples/sec on %s'
          % (metric[2] * num_epochs / timer.sum(), device))
def train():
    embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.0
    batch_size, num_steps = 64, 10
    lr, num_epochs, device = 0.005, 200, d2l.try_gpu()
    # `d2l.load_data_nmt` returns the iterator first, then the vocabularies
    train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)
    encoder = d2l.Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens,
                                 num_layers, dropout)
    decoder = Seq2SeqAttentionDecoder(len(tgt_vocab), embed_size, num_hiddens,
                                      num_layers, dropout)
    model = d2l.EncoderDecoder(encoder, decoder)
    d2l.train_s2s_ch9(model, train_iter, lr, num_epochs, device)
def train_ch6(net, train_iter, test_iter, num_epochs, lr,
              device=d2l.try_gpu()):
    """Train a model with a GPU (defined in Chapter 6)."""
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    for epoch in range(num_epochs):
        # Sum of training loss, sum of training accuracy, no. of examples
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
    plt.show()
def train_func(net, train_iter, test_iter, num_epochs, lr,
               device=d2l.try_gpu()):
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            torch.nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    timer = d2l.Timer()
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(3)
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            net.train()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_loss = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % 50 == 0:
                print(f"epoch: {epoch} --- iter: {i} --- of {len(train_iter)}")
                print(f"train loss: {train_loss} --- train acc: {train_acc}")
        test_acc = evaluate_accuracy_gpu(net, test_iter)
    print(f'loss {train_loss:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
def gru(inputs, state, params):
    # Assumes `params` is ordered as in the d2l GRU-from-scratch `get_params`
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        Z = torch.sigmoid((X @ W_xz) + (H @ W_hz) + b_z)
        R = torch.sigmoid((X @ W_xr) + (H @ W_hr) + b_r)
        H_tilda = torch.tanh((X @ W_xh) + ((R * H) @ W_hh) + b_h)
        H = Z * H + (1 - Z) * H_tilda
        Y = H @ W_hq + b_q
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H,)


# Hyperparameters
batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)
vocab_size, num_hiddens, device = len(vocab), 256, d2l.try_gpu()
num_epochs, lr = 500, 1

model = d2l.RNNModelScratch(len(vocab), num_hiddens, device, get_params,
                            init_gru_state, gru)
print('scratch model')
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)

num_inputs = vocab_size
gru_layer = nn.GRU(num_inputs, num_hiddens)
model = d2l.RNNModel(gru_layer, len(vocab))
model = model.to(device)
print('concise model')
# d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)

# Outputs and error look good
            self.linear = nn.Linear(self.num_hiddens, self.vocab_size)
        else:
            self.num_directions = 2
            self.linear = nn.Linear(self.num_hiddens * 2, self.vocab_size)

    def forward(self, inputs, state):
        X = F.one_hot(inputs.T.long(), self.vocab_size)
        X = X.to(torch.float32)
        Y, state = self.rnn(X, state)
        output = self.linear(Y.reshape(-1, Y.shape[-1]))
        return output, state

    def begin_state(self, device, batch_size=1):
        if not isinstance(self.rnn, nn.LSTM):
            return torch.zeros((self.num_directions * self.rnn.num_layers,
                                batch_size, self.num_hiddens), device=device)
        else:
            return (torch.zeros((self.num_directions * self.rnn.num_layers,
                                 batch_size, self.num_hiddens), device=device),
                    torch.zeros((self.num_directions * self.rnn.num_layers,
                                 batch_size, self.num_hiddens), device=device))


device = d2l.try_gpu()
net = RNNModel(rnn_layer, vocab_size=len(vocab))
net = net.to(device)
num_epochs, lr = 500, 1
d2l.train_ch8(net, train_iter, vocab, lr, num_epochs, device)
            self.blks.add_module("block" + str(i),
                                 DecoderBlock(key_size, query_size, value_size,
                                              num_hiddens, norm_shape,
                                              ffn_num_input, ffn_num_hiddens,
                                              num_heads, dropout, i))
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, env_valid_len, *args):
        return [enc_outputs, env_valid_len, [None] * self.num_layers]

    def forward(self, X, state):
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        for blk in self.blks:
            X, state = blk(X, state)
        return self.dense(X), state


num_hiddens, num_layers, dropout, batch_size, num_steps = 32, 2, 0.1, 64, 10
lr, num_epochs, device = 0.005, 200, d2l.try_gpu()
ffn_num_input, ffn_num_hiddens, num_heads = 32, 64, 4
key_size, query_size, value_size = 32, 32, 32
norm_shape = [32]

train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)

encoder = TransformerEncoder(
    len(src_vocab), key_size, query_size, value_size, num_hiddens, norm_shape,
    ffn_num_input, ffn_num_hiddens, num_heads, num_layers, dropout)
decoder = TransformerDecoder(
    len(tgt_vocab), key_size, query_size, value_size, num_hiddens, norm_shape,
    ffn_num_input, ffn_num_hiddens, num_heads, num_layers, dropout)
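# Hedged continuation (not in the original snippet): wrap the encoder and
# decoder and launch training, mirroring the call form used in the seq2seq
# attention snippet above.
model = d2l.EncoderDecoder(encoder, decoder)
d2l.train_s2s_ch9(model, train_iter, lr, num_epochs, device)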
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn

    def __call__(self, X, state):
        X = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device)


batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

X = torch.arange(10).reshape((2, 5))
num_hiddens = 512
net = RNNModelScratch(len(vocab), num_hiddens, d2l.try_gpu(), get_params,
                      init_rnn_state, rnn)


def predict_ch8(prefix, num_preds, net, vocab, device):
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape(
        (1, 1))
    for y in prefix[1:]:
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):
        y, state = net(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return "".join([vocab.idx_to_token[i] for i in outputs])
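# Sanity check before training, as in the d2l chapter: the prediction is
# gibberish until the model has been trained.
predict_ch8('time traveller ', 10, net, vocab, d2l.try_gpu())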
def predict_sentiment(net, vocab, sentence):
    sentence = torch.tensor(vocab[sentence.split()], device=d2l.try_gpu())
    label = torch.argmax(net(sentence.reshape(1, -1)), dim=1)
    return 'positive' if label == 1 else 'negative'
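# Hedged usage sketch (assumes `net` and `vocab` come from the trained
# sentiment model this function accompanies); the d2l text uses inputs like:
# predict_sentiment(net, vocab, 'this movie is so great')
# predict_sentiment(net, vocab, 'this movie is so bad')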
        if pred == vocab['<eos>']:
            break
        output_seq.append(pred)
    return ' '.join(vocab.to_tokens(output_seq))


if __name__ == '__main__':
    args = parse()
    model_save_file = args.model_path
    corpus_file = args.corpus

    embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.1
    batch_size, num_steps = 64, 30
    training_iteration = args.iteration
    lr, num_epochs, device = 0.005, args.num_epoch, d2l.try_gpu()

    training_batches, vocab = load_data(corpus_file, training_iteration,
                                        num_steps, batch_size)
    encoder = Seq2SeqEncoder(len(vocab), embed_size, num_hiddens, num_layers,
                             dropout)
    decoder = Seq2SeqAttentionDecoder(len(vocab), embed_size, num_hiddens,
                                      num_layers, dropout)
    model = d2l.EncoderDecoder(encoder, decoder)

    src_sentence = None
    if model_save_file and os.path.exists(model_save_file):
        checkpoint = torch.load(model_save_file)
        start_epoch = checkpoint['epoch']
        model.encoder.load_state_dict(checkpoint['en'])
        model.decoder.load_state_dict(checkpoint['de'])
    while src_sentence != "q":
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')


# %%
lr, num_epochs = 0.9, 10
train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())

# %%
def train(resume_training=True):
    EMBEDDING_SIZE = 32
    num_hiddens, num_layers, dropout, batch_size, num_steps = \
        EMBEDDING_SIZE, 2, 0.1, 64, 10
    lr, num_epochs, device = 0.005, 1000, d2lt.try_gpu()
    ffn_num_input, ffn_num_hiddens, num_heads = EMBEDDING_SIZE, 64, 4
    key_size, query_size, value_size = (EMBEDDING_SIZE, EMBEDDING_SIZE,
                                        EMBEDDING_SIZE)
    norm_shape = [EMBEDDING_SIZE]

    ### Load data
    data_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size, num_steps)
    encoder = TransformerEncoder(len(src_vocab), key_size, query_size,
                                 value_size, num_hiddens, norm_shape,
                                 ffn_num_input, ffn_num_hiddens, num_heads,
                                 num_layers, dropout)
    decoder = TransformerDecoder(len(tgt_vocab), key_size, query_size,
                                 value_size, num_hiddens, norm_shape,
                                 ffn_num_input, ffn_num_hiddens, num_heads,
                                 num_layers, dropout)

    ### Load model
    model = EncoderDecoder(encoder, decoder).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    ### Load checkpoint
    if (resume_training and PATH_MODEL.exists()
            and os.path.getsize(PATH_MODEL) > 0):
        model, optimizer, last_epoch = load_checkpoint(model, optimizer)
        print("Continue training from last checkpoint...")
    else:
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)
        with open(PATH_MODEL, 'w') as fp:
            pass
        print('No prior checkpoint existed, created new save files for '
              'checkpoint.')
        model.apply(xavier_init_weights)
        last_epoch = 0
    # model.apply(xavier_init_weights)
    # model.to(device)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    ### Initialize loss function
    loss = MaskedSoftmaxCELoss()

    ### Train
    model.train()
    # animator = d2lt.Animator(xlabel='epoch', ylabel='loss',
    #                          xlim=[10, num_epochs])
    for epoch in range(last_epoch, num_epochs):
        timer = d2lt.Timer()
        metric = d2lt.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # Teacher forcing
            Y_hat, _ = model(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2lt.grad_clipping(model, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            # animator.add(epoch + 1, (metric[0] / metric[1],))
            print(f'epoch {epoch + 1} - '
                  f'loss {metric[0] / metric[1]:.5f}')
            ### Save checkpoint
            save_checkpoint(epoch, model, optimizer)
    print(f'loss {metric[0] / metric[1]:.5f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn

    def __call__(self, X, state):
        X = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device)


# %%
X = d2l.reshape(torch.arange(10), (2, 5))
num_hiddens = 512
model = RNNModelScatch(len(vocab), num_hiddens, d2l.try_gpu(), get_params,
                       init_rnn_state, rnn)
state = model.begin_state(X.shape[0], d2l.try_gpu())
Y, new_state = model(X.to(d2l.try_gpu()), state)
Y.shape, len(new_state), new_state[0].shape


# %%
def predict_ch8(prefix, num_preds, model, vocab, device):
    """Generate new characters following the prefix."""
    state = model.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: d2l.reshape(
        torch.tensor([outputs[-1]], device=device), (1, 1))
    for y in prefix[1:]:
# %%
from d2l import torch as d2l
import torch
from torch import nn

batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

# %%
vocab_size, num_hiddens, num_layers = len(vocab), 256, 2
num_inputs = vocab_size
device = d2l.try_gpu()
lstm_layer = nn.LSTM(num_inputs, num_hiddens, num_layers)
model = d2l.RNNModel(lstm_layer, len(vocab))
model = model.to(device)

# %%
num_epochs, lr = 500, 2
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)

# %%
from d2l import torch as d2l
import torch
from torch import nn

# Load data
batch_size, num_steps, device = 32, 35, d2l.try_gpu()
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

# Define the bidirectional LSTM model by setting `bidirectional=True`
vocab_size, num_hiddens, num_layers = len(vocab), 256, 2
num_inputs = vocab_size
lstm_layer = nn.LSTM(num_inputs, num_hiddens, num_layers, bidirectional=True)
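# %%
# Hedged continuation (not in the original): complete the bidirectional cell
# the same way as the unidirectional one above. As the d2l chapter notes, a
# bidirectional LSTM is a poor fit for next-token prediction, so treat the
# resulting perplexity with suspicion; the learning rate here is a placeholder.
model = d2l.RNNModel(lstm_layer, len(vocab))
model = model.to(device)
num_epochs, lr = 500, 1
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)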