Example #1
    def __init__(self, paragraphs, max_len):
        # Input `paragraphs[i]` is a list of sentence strings representing a
        # paragraph; while output `paragraphs[i]` is a list of sentences
        # representing a paragraph, where each sentence is a list of tokens
        paragraphs = [
            d2l.tokenize(paragraph, token='word') for paragraph in paragraphs
        ]
        sentences = [
            sentence for paragraph in paragraphs for sentence in paragraph
        ]
        self.vocab = d2l.Vocab(
            sentences,
            min_freq=5,
            reserved_tokens=['<pad>', '<mask>', '<cls>', '<sep>'])
        # Get data for the next sentence prediction task
        examples = []
        for paragraph in paragraphs:
            examples.extend(
                _get_nsp_data_from_paragraph(paragraph, paragraphs, self.vocab,
                                             max_len))
        # Get data for the masked language model task
        examples = [(_get_mlm_data_from_tokens(tokens, self.vocab) +
                     (segments, is_next))
                    for tokens, segments, is_next in examples]
        # Pad inputs
        (self.all_token_ids, self.all_segments, self.valid_lens,
         self.all_pred_positions, self.all_mlm_weights, self.all_mlm_labels,
         self.nsp_labels) = _pad_bert_inputs(examples, max_len, self.vocab)
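For this dataset class to be wrapped in a torch.utils.data.DataLoader it also needs __getitem__ and __len__; a minimal sketch, assuming the attribute names set above:

    def __getitem__(self, idx):
        # Return one pretraining example: token ids, segment ids, valid
        # length, MLM prediction positions/weights/labels, and the NSP label
        return (self.all_token_ids[idx], self.all_segments[idx],
                self.valid_lens[idx], self.all_pred_positions[idx],
                self.all_mlm_weights[idx], self.all_mlm_labels[idx],
                self.nsp_labels[idx])

    def __len__(self):
        return len(self.all_token_ids)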
Example #2
def load_data_imdb(batch_size, num_steps=500):
    """Return data iterators and the vocabulary of the IMDb review dataset."""
    data_dir = d2l.download_extract('aclImdb', 'aclImdb')
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = d2l.tokenize(train_data[0], token='word')
    test_tokens = d2l.tokenize(test_data[0], token='word')
    vocab = d2l.Vocab(train_tokens, min_freq=5)
    train_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
    test_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in test_tokens])
    train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])),
                                batch_size)
    test_iter = d2l.load_array((test_features, torch.tensor(test_data[1])),
                               batch_size,
                               is_train=False)
    return train_iter, test_iter, vocab
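A typical call, assuming the read_imdb helper it relies on is defined and the aclImdb archive can be downloaded; the batch size is illustrative:

batch_size = 64
train_iter, test_iter, vocab = load_data_imdb(batch_size)
for X, y in train_iter:
    # With num_steps=500, X has shape (64, 500) and y has shape (64,)
    print('X:', X.shape, 'y:', y.shape)
    break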
Example #3
def load_corpus_war_of_the_worlds(max_tokens=-1):
    """Return token indices and the vocabulary of the time machine dataset."""
    lines = read_war_of_the_worlds()
    tokens = d2l.tokenize(lines, 'char')
    vocab = d2l.Vocab(tokens)
    # Since each text line in the War of the Worlds dataset is not necessarily
    # a sentence or a paragraph, flatten all the text lines into a single list
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab
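A quick usage sketch, assuming a read_war_of_the_worlds reader (analogous to d2l.read_time_machine) is available:

corpus, vocab = load_corpus_war_of_the_worlds(max_tokens=10000)
print(len(corpus), len(vocab))  # number of character indices and vocab size
print(''.join(vocab.to_tokens(corpus[:20])))  # first few characters of the text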
Example #4
    def __init__(self, dataset, num_steps, vocab=None):
        self.num_steps = num_steps
        all_premise_tokens = d2l.tokenize(dataset[0])
        all_hypothesis_tokens = d2l.tokenize(dataset[1])
        if vocab is None:
            self.vocab = d2l.Vocab(all_premise_tokens + all_hypothesis_tokens,
                                   min_freq=5,
                                   reserved_tokens=['<pad>'])
        else:
            self.vocab = vocab
        self.premises = self._pad(all_premise_tokens)
        self.hypotheses = self._pad(all_hypothesis_tokens)
        self.labels = torch.tensor(dataset[2])
        print('read ' + str(len(self.premises)) + ' examples')
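The constructor above calls a self._pad helper that is not shown. A sketch of it plus the usual __getitem__/__len__, assuming _pad truncates or pads each token list to num_steps as in d2l's SNLI dataset:

    def _pad(self, lines):
        # Truncate/pad every token list to exactly `num_steps` indices
        return torch.tensor([d2l.truncate_pad(
            self.vocab[line], self.num_steps, self.vocab['<pad>'])
            for line in lines])

    def __getitem__(self, idx):
        return (self.premises[idx], self.hypotheses[idx]), self.labels[idx]

    def __len__(self):
        return len(self.premises)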
Example #5
    def __init__(self, dataset, max_len, vocab=None):
        all_premise_hypothesis_tokens = [[
            p_tokens, h_tokens
        ] for p_tokens, h_tokens in zip(*[
            d2l.tokenize([s.lower() for s in sentences])
            for sentences in dataset[:2]
        ])]

        self.labels = torch.tensor(dataset[2])
        self.vocab = vocab
        self.max_len = max_len
        (self.all_token_ids, self.all_segments,
         self.valid_lens) = self._preprocess(all_premise_hypothesis_tokens)
        print('read ' + str(len(self.all_token_ids)) + ' examples')
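The _preprocess method used above is not included in this snippet. Below is a simplified, single-process sketch of what it could look like, assuming d2l.get_tokens_and_segments and the self.vocab/self.max_len attributes set above (the d2l original additionally parallelizes this step with multiprocessing):

    def _truncate_pair_of_tokens(self, p_tokens, h_tokens):
        # Reserve three slots for the '<cls>' and two '<sep>' tokens
        while len(p_tokens) + len(h_tokens) > self.max_len - 3:
            if len(p_tokens) > len(h_tokens):
                p_tokens.pop()
            else:
                h_tokens.pop()

    def _preprocess(self, all_premise_hypothesis_tokens):
        all_token_ids, all_segments, valid_lens = [], [], []
        for p_tokens, h_tokens in all_premise_hypothesis_tokens:
            self._truncate_pair_of_tokens(p_tokens, h_tokens)
            tokens, segments = d2l.get_tokens_and_segments(p_tokens, h_tokens)
            # Pad token ids and segment ids to `max_len`
            token_ids = self.vocab[tokens] + [self.vocab['<pad>']] * (
                self.max_len - len(tokens))
            segments = segments + [0] * (self.max_len - len(segments))
            all_token_ids.append(token_ids)
            all_segments.append(segments)
            valid_lens.append(len(tokens))
        return (torch.tensor(all_token_ids, dtype=torch.long),
                torch.tensor(all_segments, dtype=torch.long),
                torch.tensor(valid_lens))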
Example #6
import collections
import os
import random
import re

import torch
from d2l import torch as d2l

tokens = d2l.tokenize(d2l.read_time_machine())
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
freqs = [freq for _, freq in vocab.token_freqs]

bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
bigram_vocab = d2l.Vocab(bigram_tokens)
print(bigram_vocab.token_freqs[:10])
bifreqs = [freq for _, freq in bigram_vocab.token_freqs]

trigram_tokens = [tup for tup in zip(corpus[:-2], corpus[1:-1], corpus[2:])]
trigram_vocab = d2l.Vocab(trigram_tokens)
print(trigram_vocab.token_freqs[:10])
trifreqs = [freq for _, freq in trigram_vocab.token_freqs]

d2l.plot([freqs, bifreqs, trifreqs],
         xlabel="token: x",
         ylabel="frequency: n(x)",
         xscale="log",
         yscale="log",
         legend=["unigram", "bigram", "trigram"])
d2l.plt.show()
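Given the corpus above, minibatches for language modeling can be drawn by random sampling. A sketch of such an iterator, modeled on d2l's seq_data_iter_random, operating on integer token indices (e.g. vocab[corpus]); each target Y is X shifted by one token:

def seq_data_iter_random(corpus, batch_size, num_steps):
    """Generate minibatches of subsequences using random sampling."""
    # Start from a random offset so different epochs see different splits
    corpus = corpus[random.randint(0, num_steps - 1):]
    # Subtract 1 because the targets are the inputs shifted by one token
    num_subseqs = (len(corpus) - 1) // num_steps
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)

    def data(pos):
        return corpus[pos: pos + num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        initial_indices_per_batch = initial_indices[i: i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)

# Usage sketch: pass token indices, not raw tokens
for X, Y in seq_data_iter_random(vocab[corpus], batch_size=2, num_steps=5):
    print('X:', X, '\nY:', Y)
    break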


def read_imdb(data_dir, is_train):
    """Read the IMDb review dataset text sequences and labels."""
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, 'train' if is_train else 'test',
                                   label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels
#%%
data_dir = d2l.download_extract('aclImdb', 'aclImdb')
train_data = read_imdb(data_dir, is_train=True)
print('# trainings:', len(train_data[0]))
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', y, 'review:', x[0:60])

# %%
train_tokens = d2l.tokenize(train_data[0], token='word')
vocab = d2l.Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])

d2l.set_figsize()
d2l.plt.hist([len(line) for line in train_tokens], bins=range(0,1000,50))

#%%
num_steps = 500
train_features = torch.tensor([d2l.truncate_pad(
    vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
print(train_features.shape)

#%%
train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])), 64)
for X, y in train_iter:
    print('X:', X.shape, ', y:', y.shape)
    break