def predict(model, src_sentence, vocab, num_steps, device):
    """Greedily decode a prediction for `src_sentence`.

    The sentence is lower-cased, split on single spaces, mapped through
    `vocab`, terminated with '<eos>' and passed through `d2l.truncate_pad`
    with `num_steps` before being fed to `model.encoder`. Decoding starts
    from '<bos>' and feeds the arg-max token back into `model.decoder` until
    '<eos>' is predicted or `num_steps` tokens have been produced.

    Args:
        model: Object exposing `encoder`, `decoder` and `eval()`
            (NOTE(review): presumably a `torch.nn.Module` — confirm).
        src_sentence: Space-separated input text.
        vocab: Vocabulary used for both the input and the output tokens; it
            is indexed by token / list of tokens and provides `to_tokens`.
        num_steps: Length handed to `d2l.truncate_pad` and maximum number of
            decoding steps.
        device: Device on which the input tensors are created.

    Returns:
        The predicted tokens joined by single spaces; the '<eos>' token is
        not included.
    """
    # Switch to inference mode, as the `predict_seq2seq` helpers in this file
    # do, so training-only layers (e.g. dropout) do not perturb the output.
    model.eval()
    src_tokens = vocab[src_sentence.lower().split(' ')] + [vocab['<eos>']]
    # The valid length is taken before padding/truncation.
    enc_valid_len = torch.tensor([len(src_tokens)], device=device)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, vocab['<pad>'])
    output_seq = []
    # No gradients are needed for prediction.
    with torch.no_grad():
        # Add the batch axis
        enc_X = torch.unsqueeze(
            torch.tensor(src_tokens, dtype=torch.long, device=device), dim=0)
        enc_outputs = model.encoder(enc_X, enc_valid_len)
        dec_state = model.decoder.init_state(enc_outputs, enc_valid_len)
        # Add the batch axis
        dec_X = torch.unsqueeze(torch.tensor(
            [vocab['<bos>']], dtype=torch.long, device=device), dim=0)
        for _ in range(num_steps):
            Y, dec_state = model.decoder(dec_X, dec_state)
            # We use the token with the highest prediction likelihood as the
            # input of the decoder at the next time step
            dec_X = Y.argmax(dim=2)
            pred = dec_X.squeeze(dim=0).type(torch.int32).item()
            # Once the end-of-sequence token is predicted, the generation of
            # the output sequence is complete
            if pred == vocab['<eos>']:
                break
            output_seq.append(pred)
    return ' '.join(vocab.to_tokens(output_seq))
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device, save_attention_weights=False):
    """Greedy sequence-to-sequence prediction for a single sentence.

    Args:
        net: Model exposing `eval()`, `encoder` and `decoder`.
        src_sentence: Space-separated source text (lower-cased internally).
        src_vocab: Vocabulary used to index the source tokens.
        tgt_vocab: Vocabulary used for '<bos>'/'<eos>' and for converting the
            predicted ids back to tokens.
        num_steps: Length handed to `d2l.truncate_pad` and maximum number of
            decoding steps.
        device: Device on which the input tensors are created.
        save_attention_weights: When true, `net.decoder.attention_weights` is
            recorded after every decoding step.

    Returns:
        A tuple `(translation, attention_weight_seq)`: the predicted tokens
        joined by single spaces, and the list of recorded attention weights
        (empty unless `save_attention_weights` is set).
    """
    # Inference only: put the network into eval mode.
    net.eval()

    # Source side: tokens -> ids, terminated by <eos>.
    words = src_sentence.lower().split(' ')
    src_ids = src_vocab[words] + [src_vocab['<eos>']]
    enc_valid_len = torch.tensor([len(src_ids)], device=device)
    padded_ids = d2l.truncate_pad(src_ids, num_steps, src_vocab['<pad>'])

    # A leading batch axis is added to the encoder input.
    enc_X = torch.tensor(
        padded_ids, dtype=torch.long, device=device).unsqueeze(0)
    enc_outputs = net.encoder(enc_X, enc_valid_len)
    dec_state = net.decoder.init_state(enc_outputs, enc_valid_len)

    # The decoder starts from <bos>, again with a leading batch axis.
    dec_X = torch.tensor(
        [tgt_vocab['<bos>']], dtype=torch.long, device=device).unsqueeze(0)

    predicted_ids = []
    recorded_weights = []
    for _step in range(num_steps):
        Y, dec_state = net.decoder(dec_X, dec_state)
        # The most likely token becomes the next decoder input.
        dec_X = Y.argmax(dim=2)
        pred = dec_X.squeeze(dim=0).type(torch.int32).item()
        # Weights are recorded before the <eos> check, so the final step's
        # weights are kept as well.
        if save_attention_weights:
            recorded_weights.append(net.decoder.attention_weights)
        # Stop as soon as end-of-sequence is produced; <eos> is not emitted.
        if pred == tgt_vocab['<eos>']:
            break
        predicted_ids.append(pred)

    translation = ' '.join(tgt_vocab.to_tokens(predicted_ids))
    return translation, recorded_weights
def load_data_imdb(batch_size, num_steps=500):
    """Build train/test data iterators and the vocabulary for the IMDb data.

    Args:
        batch_size: Batch size passed to `d2l.load_array` for both iterators.
        num_steps: Length passed to `d2l.truncate_pad` for every review.

    Returns:
        A tuple `(train_iter, test_iter, vocab)`. The vocabulary is built
        from the training tokens only (`min_freq=5`).
    """
    data_dir = d2l.download_extract('aclImdb', 'aclImdb')
    # `read_imdb` results are indexed as [0] for the texts and [1] for the
    # labels below.
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = d2l.tokenize(train_data[0], token='word')
    test_tokens = d2l.tokenize(test_data[0], token='word')
    # Reserve '<pad>' explicitly so the padding id looked up below is a real
    # vocabulary entry, matching the exploratory cell in this file.
    # NOTE(review): without the reservation `vocab['<pad>']` presumably falls
    # back to the unknown-token index — confirm against d2l.Vocab.
    vocab = d2l.Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])
    pad_id = vocab['<pad>']

    def _to_features(token_lines):
        # Index every tokenized review and pass it through d2l.truncate_pad.
        return torch.tensor([
            d2l.truncate_pad(vocab[line], num_steps, pad_id)
            for line in token_lines])

    train_iter = d2l.load_array(
        (_to_features(train_tokens), torch.tensor(train_data[1])),
        batch_size)
    test_iter = d2l.load_array(
        (_to_features(test_tokens), torch.tensor(test_data[1])),
        batch_size, is_train=False)
    return train_iter, test_iter, vocab
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device, save_attention_weights=False,
                    search_type='greedy'):
    """Encode `src_sentence` and decode it with the selected search strategy.

    Args:
        net: Model exposing `eval()`, `encoder` and `decoder`.
        src_sentence: Space-separated source text (lower-cased internally).
        src_vocab: Vocabulary used to index the source tokens.
        tgt_vocab: Vocabulary providing the '<bos>' id for the first decoder
            input.
        num_steps: Length handed to `d2l.truncate_pad`; also forwarded to the
            search function.
        device: Device on which the input tensors are created.
        save_attention_weights: Accepted for signature compatibility.
            NOTE(review): this flag is not used anywhere in this function.
        search_type: Either 'greedy' (dispatches to `predict_greedy`) or
            'beam' (dispatches to `predict_beam`).

    Returns:
        Whatever the selected search function returns for
        `(net, dec_X, dec_state, num_steps)`.

    Raises:
        ValueError: If `search_type` is neither 'beam' nor 'greedy'.
    """
    if search_type not in ('beam', 'greedy'):
        raise ValueError(
            f"search_type must be 'beam' or 'greedy', got {search_type!r}")
    # Set `net` to eval mode for inference
    net.eval()
    src_tokens = src_vocab[src_sentence.lower().split(' ')] + [
        src_vocab['<eos>']]
    # The valid length is taken before padding/truncation.
    enc_valid_len = torch.tensor([len(src_tokens)], device=device)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    # Add the batch axis
    enc_X = torch.unsqueeze(
        torch.tensor(src_tokens, dtype=torch.long, device=device), dim=0)
    enc_outputs = net.encoder(enc_X, enc_valid_len)
    dec_state = net.decoder.init_state(enc_outputs, enc_valid_len)
    # Add the batch axis
    dec_X = torch.unsqueeze(torch.tensor(
        [tgt_vocab['<bos>']], dtype=torch.long, device=device), dim=0)
    # Only the selected search function is looked up.
    predict_fn = predict_greedy if search_type == 'greedy' else predict_beam
    return predict_fn(net, dec_X, dec_state, num_steps)
def predict_s2s_ch9(model, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device, beam_size=5):
    """Predict sequences with beam search (defined in Chapter 9).

    Args:
        model: Model exposing `eval()`, `encoder` and `decoder`.
        src_sentence: Space-separated source text (lower-cased internally).
        src_vocab: Vocabulary used to index the source tokens.
        tgt_vocab: Vocabulary providing '<bos>'/'<eos>' and `to_tokens`.
        num_steps: Length handed to `d2l.truncate_pad` and maximum number of
            decoding steps.
        device: Device on which all input tensors are created.
        beam_size: Number of hypotheses kept per step and maximum number of
            returned sequences.

    Returns:
        A list of at most `beam_size` predicted sentences (tokens joined by
        single spaces, without '<bos>'/'<eos>'), ordered from the highest to
        the lowest cumulative log-probability.
    """
    # Inference mode, as in the `predict_seq2seq` helpers of this file.
    model.eval()
    src_tokens = src_vocab[src_sentence.lower().split(' ')] + [
        src_vocab['<eos>']]
    # The valid length is taken before padding/truncation.
    enc_valid_len = torch.tensor([len(src_tokens)], device=device)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    bos, eos = tgt_vocab['<bos>'], tgt_vocab['<eos>']

    finished = []  # (token ids starting with <bos>, score); <eos> removed
    with torch.no_grad():
        # Add the batch axis
        enc_X = torch.unsqueeze(
            torch.tensor(src_tokens, dtype=torch.long, device=device), dim=0)
        enc_outputs = model.encoder(enc_X, enc_valid_len)
        init_state = model.decoder.init_state(enc_outputs, enc_valid_len)

        # A single start hypothesis: seeding the beam with several identical
        # copies would only yield duplicated outputs.
        # Each live hypothesis is (token ids, cumulative log-prob, state).
        live = [([bos], 0.0, init_state)]
        for _ in range(num_steps):
            expansions = []
            for tokens, score, state in live:
                dec_X = torch.tensor(
                    [[tokens[-1]]], dtype=torch.long, device=device)
                # Every hypothesis is advanced from its OWN decoder state.
                # NOTE(review): assumes the decoder returns a new state and
                # does not mutate the one passed in — confirm for decoders
                # that keep caches inside the state.
                Y, next_state = model.decoder(dec_X, state)
                # Normalize the decoder outputs to log-probabilities before
                # accumulating scores.
                log_probs = Y.reshape(-1).log_softmax(dim=-1)
                top = log_probs.topk(k=min(beam_size, log_probs.numel()))
                for log_p, token_id in zip(top.values.tolist(),
                                           top.indices.tolist()):
                    expansions.append(
                        (tokens + [token_id], score + log_p, next_state))
            expansions.sort(key=lambda hyp: hyp[1], reverse=True)
            live = []
            for hyp in expansions[:beam_size]:
                if hyp[0][-1] == eos:
                    # Completed: store without the trailing <eos>.
                    finished.append((hyp[0][:-1], hyp[1]))
                else:
                    live.append(hyp)
            if not live or len(finished) >= beam_size:
                break

    # If too few hypotheses reached <eos>, fill up with the best unfinished
    # ones (they are already sorted by score).
    missing = max(0, beam_size - len(finished))
    finished.extend((tokens, score) for tokens, score, _ in live[:missing])
    finished.sort(key=lambda hyp: hyp[1], reverse=True)
    # Drop the leading <bos> when converting back to text.
    return [' '.join(tgt_vocab.to_tokens(tokens[1:]))
            for tokens, _ in finished[:beam_size]]


#%%
def bleu(pred_seq, label_seq, k):
    """Compute a BLEU score of `pred_seq` against `label_seq`.

    Args:
        pred_seq: Predicted sentence, tokens separated by single spaces.
        label_seq: Reference sentence, tokens separated by single spaces.
        k: Largest n-gram order taken into account.

    Returns:
        The brevity penalty `exp(min(0, 1 - len_label / len_pred))`
        multiplied by `p_n ** (0.5 ** n)` for n = 1..k, where `p_n` is the
        clipped n-gram precision. Returns 0.0 when the prediction has fewer
        than `n` tokens for some n <= k (it then contains no n-grams to
        match).
    """
    pred_tokens, label_tokens = pred_seq.split(' '), label_seq.split(' ')
    len_pred, len_label = len(pred_tokens), len(label_tokens)
    score = math.exp(min(0, 1 - len_label / len_pred))
    for n in range(1, k + 1):
        num_pred_ngrams = len_pred - n + 1
        if num_pred_ngrams <= 0:
            # No n-grams of this order in the prediction: the precision is
            # zero, and so is the score (avoids a division by zero).
            return 0.0
        num_matches, label_subs = 0, collections.defaultdict(int)
        # Tuples keep token boundaries, so distinct n-grams cannot collide
        # the way separator-less string joins can.
        for i in range(len_label - n + 1):
            label_subs[tuple(label_tokens[i:i + n])] += 1
        for i in range(num_pred_ngrams):
            ngram = tuple(pred_tokens[i:i + n])
            if label_subs[ngram] > 0:
                num_matches += 1
                # Each reference n-gram may be matched only once.
                label_subs[ngram] -= 1
        score *= math.pow(num_matches / num_pred_ngrams, math.pow(0.5, n))
    return score


#@save
def translate(engs, fras, model, src_vocab, tgt_vocab, num_steps, device):
    """Translate text sequences.

    For every `(eng, fra)` pair the candidates from `predict_s2s_ch9` are
    printed together with their BLEU score (k=2) against `fra`.

    Returns:
        A list with one entry per source sentence; each entry is the list of
        candidate translations for that sentence.
    """
    all_translations = []
    for eng, fra in zip(engs, fras):
        translations = predict_s2s_ch9(
            model, eng, src_vocab, tgt_vocab, num_steps, device)
        for translation in translations:
            print(
                f'{eng} => {translation}, bleu {bleu(translation, fra, k=2):.3f}')
        all_translations.append(translations)
    # Returned so that callers can iterate over the results.
    return all_translations


engs = ['go .', "i lost .", 'i\'m home .', 'he\'s calm .']
fras = ['va !', 'j\'ai perdu .', 'je suis chez moi .', 'il est calme .']
#%%
output_seqs = translate(engs, fras, model, src_vocab, tgt_vocab, num_steps,
                        device)
# %%
for output in output_seqs:
    print(output)
def _pad(self, lines):
    """Index each token line with `self.vocab` and stack the results.

    Every line is looked up in `self.vocab`, passed through
    `d2l.truncate_pad` with `self.num_steps` and the '<pad>' index, and the
    resulting rows are returned as a single `torch.tensor`.
    """
    padded_rows = []
    for token_line in lines:
        token_ids = self.vocab[token_line]
        padded_rows.append(
            d2l.truncate_pad(token_ids, self.num_steps, self.vocab['<pad>']))
    return torch.tensor(padded_rows)
#%% train_data = read_imdb(data_dir, is_train=True) print('# trainings:', len(train_data[0])) for x, y in zip(train_data[0][:3], train_data[1][:3]): print('label:',y,'review',x[0:60]) # %% train_tokens = d2l.tokenize(train_data[0], token='word') vocab = d2l.Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>']) d2l.set_figsize() d2l.plt.hist([len(line) for line in train_tokens], bins=range(0,1000,50)) #%% num_steps = 500 train_features = torch.tensor([d2l.truncate_pad( vocab[line], num_steps, vocab['<pad>']) for line in train_tokens]) print(train_features.shape) #%% train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])), 64) for X, y in train_iter: print('X:', X.shape, ',y:', y.shape) break print('#batches:', len(train_iter)) #%% def read_imdb(data_dir, is_train): data, labels = [], [] for label in ('pos','neg'): folder_name = os.path.join(data_dir, 'train' if is_train else 'test', label)