def __init__(self, paragraphs, max_len):
    # Input `paragraphs[i]` is a list of sentence strings representing a
    # paragraph, while the output `paragraphs[i]` is a list of sentences
    # representing a paragraph, where each sentence is a list of tokens
    paragraphs = [d2l.tokenize(paragraph, token='word')
                  for paragraph in paragraphs]
    sentences = [sentence for paragraph in paragraphs
                 for sentence in paragraph]
    self.vocab = d2l.Vocab(
        sentences, min_freq=5,
        reserved_tokens=['<pad>', '<mask>', '<cls>', '<sep>'])
    # Get data for the next sentence prediction task
    examples = []
    for paragraph in paragraphs:
        examples.extend(
            _get_nsp_data_from_paragraph(paragraph, paragraphs, self.vocab,
                                         max_len))
    # Get data for the masked language model task
    examples = [(_get_mlm_data_from_tokens(tokens, self.vocab) +
                 (segments, is_next))
                for tokens, segments, is_next in examples]
    # Pad inputs
    (self.all_token_ids, self.all_segments, self.valid_lens,
     self.all_pred_positions, self.all_mlm_weights, self.all_mlm_labels,
     self.nsp_labels) = _pad_bert_inputs(examples, max_len, self.vocab)
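# Usage sketch (assumptions: this `__init__` belongs to a
# `torch.utils.data.Dataset` subclass, here called `_WikiTextDataset`, that
# also defines `__getitem__`/`__len__`, and `paragraphs` comes from a raw
# WikiText reader such as d2l's `_read_wiki`):
#
#   train_set = _WikiTextDataset(paragraphs, max_len=64)
#   train_iter = torch.utils.data.DataLoader(train_set, batch_size=512,
#                                            shuffle=True)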
def load_data_imdb(batch_size, num_steps=500):
    data_dir = d2l.download_extract('aclImdb', 'aclImdb')
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = d2l.tokenize(train_data[0], token='word')
    test_tokens = d2l.tokenize(test_data[0], token='word')
    vocab = d2l.Vocab(train_tokens, min_freq=5)
    train_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
    test_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in test_tokens])
    train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])),
                                batch_size)
    test_iter = d2l.load_array((test_features, torch.tensor(test_data[1])),
                               batch_size, is_train=False)
    return train_iter, test_iter, vocab
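# Usage sketch: `load_data_imdb` downloads and extracts aclImdb on first use
# and yields padded (features, label) minibatches; batch size 64 below is
# only an example value, and `read_imdb` must already be defined (see the
# script at the end of this section).
#
#   train_iter, test_iter, vocab = load_data_imdb(64)
#   X, y = next(iter(train_iter))  # X: (64, 500), y: (64,)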
def load_corpus_war_of_the_worlds(max_tokens=-1):
    """Return token indices and the vocabulary of the War of the Worlds dataset."""
    lines = read_war_of_the_worlds()
    tokens = d2l.tokenize(lines, 'char')
    vocab = d2l.Vocab(tokens)
    # Since each text line in the War of the Worlds dataset is not
    # necessarily a sentence or a paragraph, flatten all the text lines
    # into a single list
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab
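# Usage sketch (assumption: `read_war_of_the_worlds` is defined elsewhere in
# this project and returns a list of cleaned text lines, mirroring
# d2l.read_time_machine):
#
#   corpus, vocab = load_corpus_war_of_the_worlds(max_tokens=10000)
#   print(len(corpus), len(vocab))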
def __init__(self, dataset, num_steps, vocab=None):
    self.num_steps = num_steps
    all_premise_tokens = d2l.tokenize(dataset[0])
    all_hypothesis_tokens = d2l.tokenize(dataset[1])
    if vocab is None:
        self.vocab = d2l.Vocab(all_premise_tokens + all_hypothesis_tokens,
                               min_freq=5, reserved_tokens=['<pad>'])
    else:
        self.vocab = vocab
    self.premises = self._pad(all_premise_tokens)
    self.hypotheses = self._pad(all_hypothesis_tokens)
    self.labels = torch.tensor(dataset[2])
    print('read ' + str(len(self.premises)) + ' examples')
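# Usage sketch (assumptions: this `__init__` belongs to an SNLI-style
# `torch.utils.data.Dataset` subclass, here called `SNLIDataset`, with a
# `_pad` helper that truncates/pads each sequence to `num_steps`, and each
# dataset is a `(premises, hypotheses, labels)` triple such as the one
# returned by d2l's `read_snli`; the test set reuses the training vocabulary):
#
#   train_set = SNLIDataset(train_data, num_steps=50)
#   test_set = SNLIDataset(test_data, num_steps=50, vocab=train_set.vocab)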
def __init__(self, dataset, max_len, vocab=None):
    all_premise_hypothesis_tokens = [[
        p_tokens, h_tokens] for p_tokens, h_tokens in zip(
        *[d2l.tokenize([s.lower() for s in sentences])
          for sentences in dataset[:2]])]
    self.labels = torch.tensor(dataset[2])
    self.vocab = vocab
    self.max_len = max_len
    (self.all_token_ids, self.all_segments,
     self.valid_lens) = self._preprocess(all_premise_hypothesis_tokens)
    print('read ' + str(len(self.all_token_ids)) + ' examples')
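# Usage sketch (assumptions: this `__init__` belongs to a BERT fine-tuning
# `torch.utils.data.Dataset` subclass, here called `SNLIBERTDataset`, whose
# `_preprocess` truncates each premise-hypothesis pair, inserts
# '<cls>'/'<sep>' tokens, and pads to `max_len`; `bert_vocab` is the
# vocabulary of the pretrained BERT model):
#
#   train_set = SNLIBERTDataset(d2l.read_snli(data_dir, True), max_len=128,
#                               vocab=bert_vocab)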
import collections
import random
import re

import torch
from d2l import torch as d2l

tokens = d2l.tokenize(d2l.read_time_machine())
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
freqs = [freq for _, freq in vocab.token_freqs]

bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
bigram_vocab = d2l.Vocab(bigram_tokens)
print(bigram_vocab.token_freqs[:10])
bifreqs = [freq for _, freq in bigram_vocab.token_freqs]

trigram_tokens = [tup for tup in zip(corpus[:-2], corpus[1:-1], corpus[2:])]
trigram_vocab = d2l.Vocab(trigram_tokens)
print(trigram_vocab.token_freqs[:10])
trifreqs = [freq for _, freq in trigram_vocab.token_freqs]

d2l.plot([freqs, bifreqs, trifreqs], xlabel="token: x",
         ylabel="frequency: n(x)", xscale="log", yscale="log",
         legend=["unigram", "bigram", "trigram"])
d2l.plt.show()

# The definition below was truncated in the source; the body follows the
# standard d2l implementation of random sampling for sequence minibatches.
def seq_data_iter_random(corpus, batch_size, num_steps):
    """Generate a minibatch of subsequences using random sampling."""
    # Start with a random offset (inclusive of `num_steps - 1`) to partition
    # a sequence
    corpus = corpus[random.randint(0, num_steps - 1):]
    # Subtract 1 since we need to account for labels
    num_subseqs = (len(corpus) - 1) // num_steps
    # The starting indices for subsequences of length `num_steps`
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    # In random sampling, the subsequences from two adjacent minibatches are
    # not necessarily adjacent on the original sequence
    random.shuffle(initial_indices)

    def data(pos):
        # Return a sequence of length `num_steps` starting from `pos`
        return corpus[pos:pos + num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, num_batches * batch_size, batch_size):
        # `initial_indices` contains randomized starting indices for
        # subsequences
        initial_indices_per_batch = initial_indices[i:i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)
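# A quick sanity check of the random-sampling iterator on the toy sequence
# 0..34 (batch and step sizes are example values): each label batch `Y` is
# `X` shifted by one position.
my_seq = list(range(35))
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
    print('X:', X, '\nY:', Y)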
import os

import torch
from d2l import torch as d2l

#%%
def read_imdb(data_dir, is_train):
    """Read the IMDb review dataset text sequences and labels."""
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, 'train' if is_train else 'test',
                                   label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels

#%%
data_dir = d2l.download_extract('aclImdb', 'aclImdb')
train_data = read_imdb(data_dir, is_train=True)
print('# trainings:', len(train_data[0]))
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', y, 'review:', x[:60])

#%%
train_tokens = d2l.tokenize(train_data[0], token='word')
vocab = d2l.Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])
d2l.set_figsize()
d2l.plt.hist([len(line) for line in train_tokens], bins=range(0, 1000, 50))

#%%
num_steps = 500
train_features = torch.tensor([d2l.truncate_pad(
    vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
print(train_features.shape)

#%%
train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])), 64)
for X, y in train_iter:
    print('X:', X.shape, ', y:', y.shape)
    break
print('# batches:', len(train_iter))
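#%%
# Follow-up sketch: build a held-out iterator the same way (this mirrors the
# training cells above; `is_train=False` simply disables shuffling in
# d2l.load_array).
test_data = read_imdb(data_dir, is_train=False)
test_tokens = d2l.tokenize(test_data[0], token='word')
test_features = torch.tensor([d2l.truncate_pad(
    vocab[line], num_steps, vocab['<pad>']) for line in test_tokens])
test_iter = d2l.load_array((test_features, torch.tensor(test_data[1])), 64,
                           is_train=False)
print('# test batches:', len(test_iter))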