def forward(self, queries, keys, values):
    # Shape of the output `queries` and `attention_weights`:
    # (no. of queries, no. of key-value pairs)
    queries = d2l.reshape(
        queries.repeat_interleave(keys.shape[1]), (-1, keys.shape[1]))
    self.attention_weights = nn.functional.softmax(
        -((queries - keys) * self.w)**2 / 2, dim=1)
    # Shape of `values`: (no. of queries, no. of key-value pairs)
    return torch.bmm(self.attention_weights.unsqueeze(1),
                     values.unsqueeze(-1)).reshape(-1)
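# A minimal sketch of the surrounding module (the class wrapper below is an
# assumption reconstructed from the d2l Nadaraya-Watson section, not shown in
# this file): `self.w` is the learnable kernel width that `forward` relies on.
import torch
from torch import nn
from d2l import torch as d2l

class NWKernelRegression(nn.Module):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.w = nn.Parameter(torch.rand((1,), requires_grad=True))

    def forward(self, queries, keys, values):
        queries = d2l.reshape(
            queries.repeat_interleave(keys.shape[1]), (-1, keys.shape[1]))
        self.attention_weights = nn.functional.softmax(
            -((queries - keys) * self.w)**2 / 2, dim=1)
        return torch.bmm(self.attention_weights.unsqueeze(1),
                         values.unsqueeze(-1)).reshape(-1)

# Toy call: 50 queries, each attending over 40 key-value pairs
net = NWKernelRegression()
out = net(torch.rand(50), torch.rand(50, 40), torch.rand(50, 40))
print(out.shape)  # torch.Size([50])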
def batchify(data):
    max_len = max(len(c) + len(n) for _, c, n in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        cur_len = len(context) + len(negative)
        centers += [center]
        contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
        masks += [[1] * cur_len + [0] * (max_len - cur_len)]
        labels += [[1] * len(context) + [0] * (max_len - len(context))]
    return (d2l.reshape(torch.tensor(centers), (-1, 1)),
            torch.tensor(contexts_negatives), torch.tensor(masks),
            torch.tensor(labels))
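# A quick sanity check on a toy minibatch (the example triples follow the
# d2l word2vec section) showing how padding, masks, and labels line up:
x_1 = (1, [2, 2], [3, 3, 3, 3])   # (center, context, negatives)
x_2 = (1, [2, 2, 2], [3, 3])
batch = batchify((x_1, x_2))
for name, data in zip(['centers', 'contexts_negatives', 'masks', 'labels'],
                      batch):
    print(name, '=', data)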
def predict_ch8(prefix, num_preds, model, vocab, device):  #@save
    """Generate new characters following the `prefix`."""
    state = model.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: d2l.reshape(torch.tensor(
        [outputs[-1]], device=device), (1, 1))
    for y in prefix[1:]:  # Warm-up period
        _, state = model(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):  # Predict `num_preds` steps
        y, state = model(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])
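# Smoke test mirroring the book's usage (`model` and `vocab` are assumed to
# come from the surrounding time-machine notebook; before training, the
# output is gibberish but the warm-up and prediction loops are exercised):
# predict_ch8('time traveller ', 10, model, vocab, d2l.try_gpu())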
def predict(self, prefix, num_preds, device):  #@save
    """Generate new values following the `prefix`."""
    τ = prefix.shape[-1]
    state = self.begin_state(batch_size=1, device=device)
    outputs = torch.zeros(prefix.shape[0], τ + num_preds, device=device)
    prefix = prefix.reshape(-1, 1, τ)
    outputs[:, 0:τ] = prefix[:, 0, :]
    get_input = lambda i: d2l.reshape(outputs[:, i:i + τ], (-1, 1, τ))
    # No character-style warm-up loop here: the whole τ-step window is fed
    # to the model at every step instead
    for i in range(num_preds):  # Predict `num_preds` steps
        y, state = self.forward(get_input(i), state)
        outputs[:, i + τ] = y.reshape(-1)
    return outputs
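# Hedged usage sketch: `DummySeqModel` is a made-up stand-in (not the
# notebook's model) that "predicts" the mean of its τ-window, just to
# exercise the sliding-window logic of `predict` above; it assumes `predict`
# is available at module scope as defined there.
import torch
from d2l import torch as d2l

class DummySeqModel:
    def begin_state(self, batch_size, device):
        return None

    def forward(self, X, state):
        # X has shape (batch_size, 1, τ); predict the window mean
        return X.mean(dim=-1), state

    predict = predict  # reuse the method defined above

model = DummySeqModel()
prefix = torch.arange(4, dtype=torch.float32).reshape(1, 4)  # (batch, τ)
print(model.predict(prefix, num_preds=3, device='cpu'))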
def predict_ch8(prefix, num_preds, model, vocab, device):  #@save
    """Generate new characters following the `prefix`."""
    state = model.begin_state(batch_size=1, device=device)
    # e.g., prefix[0] == 't', prefix[1] == 'i'; vocab['t'] observed as 163
    outputs = [vocab[prefix[0]]]
    # Convert the latest output to shape (`num_steps`, `batch_size`) = (1, 1)
    get_input = lambda: d2l.reshape(
        torch.tensor([outputs[-1]], device=device), (1, 1))
    for y in prefix[1:]:  # Warm-up period: just update state without learning
        _, state = model(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):  # Predict `num_preds` steps
        y, state = model(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])
def predict_ch8(prefix, num_preds, net, vocab, device):  #@save
    """Generate new characters following the `prefix`."""
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: d2l.reshape(
        d2l.tensor([outputs[-1]], device=device), (1, 1))
    for y in prefix[1:]:  # Warm-up period
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):  # Predict `num_preds` steps
        y, state = net(get_input(), state)
        # Alternative: sample from the multinomial instead of taking argmax
        # outputs.append(int(torch.multinomial(
        #     F.softmax(y, dim=1), num_samples=1).reshape(1)))
        # Biased sampling with exponent α (must be an integer), e.g. α = 3:
        # outputs.append(int(torch.multinomial(
        #     F.softmax(y**α, dim=1), num_samples=1).reshape(1)))
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])
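# Standalone illustration of the decoding rules sketched in the comments
# above (the logits here are made up): greedy argmax vs. multinomial
# sampling, and sampling sharpened by an integer exponent α.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
y = torch.tensor([[2.0, 1.0, 0.5, 0.1]])  # hypothetical logits for 4 tokens
greedy = int(y.argmax(dim=1))
sampled = int(torch.multinomial(F.softmax(y, dim=1), num_samples=1))
alpha = 3  # a larger α concentrates probability mass on the largest logit
biased = int(torch.multinomial(F.softmax(y**alpha, dim=1), num_samples=1))
print(greedy, sampled, biased)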
n_test

def plot_kernel_reg(y_hat):
    d2l.plot(x_test, [y_truth, y_hat], 'x', 'y', legend=['Truth', 'Pred'],
             xlim=[0, 5], ylim=[-1, 5])
    d2l.plt.plot(x_train, y_train, 'o', alpha=0.5);

y_hat = torch.repeat_interleave(y_train.mean(), n_test)
plot_kernel_reg(y_hat)

# Shape of `X_repeat`: (`n_test`, `n_train`), where each row contains the
# same testing inputs (i.e., same queries)
X_repeat = d2l.reshape(x_test.repeat_interleave(n_train), (-1, n_train))
# Note that `x_train` contains the keys. Shape of `attention_weights`:
# (`n_test`, `n_train`), where each row contains attention weights to be
# assigned among the values (`y_train`) given each query
attention_weights = nn.functional.softmax(-(X_repeat - x_train)**2 / 2, dim=1)
# Each element of `y_hat` is a weighted average of values, where the weights
# are attention weights
y_hat = d2l.matmul(attention_weights, y_train)
plot_kernel_reg(y_hat)

d2l.show_heatmaps(attention_weights.unsqueeze(0).unsqueeze(0),
                  xlabel='Sorted training inputs',
                  ylabel='Sorted testing inputs')
from d2l import torch as d2l
import torch
import torch.nn as nn

T = 1000  # Generate a total of 1000 points
time = torch.arange(1, T + 1, dtype=torch.float32)
x = torch.sin(0.01 * time) + torch.normal(0, 0.2, (T,))
d2l.plot(time, [x], 'time', 'x', xlim=[1, 1000], figsize=(6, 3))

tau = 4
features = torch.zeros((T - tau, tau))
for i in range(tau):
    features[:, i] = x[i: T - tau + i]
labels = d2l.reshape(x[tau:], (-1, 1))

batch_size, n_train = 16, 600
# Only the first `n_train` examples are used for training
train_iter = d2l.load_array((features[:n_train], labels[:n_train]),
                            batch_size, is_train=True)

# Function for initializing the weights of the network
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)

# A simple MLP
def get_net():
    net = nn.Sequential(nn.Linear(4, 10), nn.ReLU(), nn.Linear(10, 1))
    net.apply(init_weights)
    return net
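# A minimal training loop for `get_net`, in the spirit of the d2l
# sequence-model chapter (the epoch count and learning rate below are
# assumptions, not values given in this file):
loss = nn.MSELoss(reduction='none')

def train(net, train_iter, loss, epochs, lr):
    trainer = torch.optim.Adam(net.parameters(), lr)
    for epoch in range(epochs):
        for X, y in train_iter:
            trainer.zero_grad()
            l = loss(net(X), y)
            l.sum().backward()
            trainer.step()
        print(f'epoch {epoch + 1}, '
              f'loss: {d2l.evaluate_loss(net, train_iter, loss):f}')

net = get_net()
train(net, train_iter, loss, 5, 0.01)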
print(predict('time traveller'))
print(predict('traveller'))

# RUN SCRIPT
DEBUG = False
device = d2l.try_gpu()
num_epochs, lr = 500, 1
if DEBUG:
    device = 'cpu'
    num_epochs, lr = 100, 1
num_hiddens = 512
batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)
X = d2l.reshape(d2l.arange(10), (2, 5))
net = RNNModelScratch(len(vocab), num_hiddens, device, get_params,
                      init_rnn_state, rnn)
state = net.begin_state(X.shape[0], device)
# Y, new_state = net(X.to(d2l.try_gpu()), state)
# Y.shape, len(new_state), new_state[0].shape
train_ch8(net, train_iter, vocab, lr, num_epochs, device)
print(net.params[-1])

# Default performance benchmark:
# perplexity 1.0, 135269.0 tokens/sec on cuda:0
import torch
from torch import nn
from d2l import torch as d2l
from RNNModel import Numeric

T = 1000  # Generate a total of 1000 points
time = d2l.arange(1, T + 1, dtype=d2l.float32)
x = d2l.sin(0.01 * time) + d2l.normal(0, 0.2, (T,))
d2l.plot(time, [x], 'time', 'x', xlim=[1, 1000], figsize=(6, 3))

tau = 30
features = d2l.zeros((T - tau, tau))
for i in range(tau):
    features[:, i] = x[i:T - tau + i]
labels = d2l.reshape(x[tau:], (-1, 1))

batch_size = 16
n_train = 600
n_train -= n_train % batch_size  # Round down to a whole number of batches
# Only the first `n_train` examples are used for training
train_iter = d2l.load_array((features[:n_train], labels[:n_train]),
                            batch_size, is_train=True)

# Function for initializing the weights of the network
def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
class RNNModelScratch:
    """An RNN model implemented from scratch."""
    def __init__(self, vocab_size, num_hiddens, device,
                 get_params, init_state, forward_fn):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn

    def __call__(self, X, state):
        X = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device)

#%%
X = d2l.reshape(torch.arange(10), (2, 5))
num_hiddens = 512
model = RNNModelScratch(len(vocab), num_hiddens, d2l.try_gpu(), get_params,
                        init_rnn_state, rnn)
state = model.begin_state(X.shape[0], d2l.try_gpu())
Y, new_state = model(X.to(d2l.try_gpu()), state)
Y.shape, len(new_state), new_state[0].shape

# %%
def predict_ch8(prefix, num_preds, model, vocab, device):
    """Generate new characters following the `prefix`."""
    state = model.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: d2l.reshape(torch.tensor(
        [outputs[-1]], device=device), (1, 1))
output.shape, len(state), state[0].shape, len(state[1]), state[1][0].shape

embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.1
batch_size, num_steps = 64, 10
lr, num_epochs, device = 0.005, 250, d2l.try_gpu()
train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)
encoder = d2l.Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens,
                             num_layers, dropout)
decoder = Seq2SeqAttentionDecoder(len(tgt_vocab), embed_size, num_hiddens,
                                  num_layers, dropout)
net = d2l.EncoderDecoder(encoder, decoder)
d2l.train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)

engs = ['go .', "i lost .", 'he\'s calm .', 'i\'m home .']
fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
for eng, fra in zip(engs, fras):
    translation, dec_attention_weight_seq = d2l.predict_seq2seq(
        net, eng, src_vocab, tgt_vocab, num_steps, device, True)
    print(f'{eng} => {translation}, ',
          f'bleu {d2l.bleu(translation, fra, k=2):.3f}')

attention_weights = d2l.reshape(
    d2l.concat([step[0][0][0] for step in dec_attention_weight_seq], 0),
    (1, 1, -1, num_steps))
# Plus one to include the end-of-sequence token
d2l.show_heatmaps(
    attention_weights[:, :, :, :len(engs[-1].split()) + 1].cpu(),
    xlabel='Key positions', ylabel='Query positions')