def test_vocab_getitem(mock_open):
    mock_open.return_value = StringIO(_FAKE_VOCAB_DATA)

    vocab = Vocab(vocab_path=None,
                  unk_token='[UNK]',
                  bos_token='[BOS]',
                  eos_token='[EOS]',
                  pad_token='[PAD]')

    # Get index by token.
    assert vocab['[BOS]'] == 0
    assert vocab['[EOS]'] == 1
    assert vocab['[PAD]'] == 2
    assert vocab['[UNK]'] == 3
    assert vocab['TOKEN#1'] == 4
    assert vocab['TOKEN#2'] == 5
    assert vocab['TOKEN#3'] == 6
    assert vocab['TOKEN#4'] == 7

    # Get token by index.
    assert vocab[0] == '[BOS]'
    assert vocab[1] == '[EOS]'
    assert vocab[2] == '[PAD]'
    assert vocab[3] == '[UNK]'
    assert vocab[4] == 'TOKEN#1'
    assert vocab[5] == 'TOKEN#2'
    assert vocab[6] == 'TOKEN#3'
    assert vocab[7] == 'TOKEN#4'

def test_tokenizer_decode(mock_open):
    mock_open.return_value = StringIO(_FAKE_VOCAB_DATA)

    vocab = Vocab(vocab_path=None, unk_token='[UNK]')
    tokenizer = Tokenizer(vocab)

    assert (tokenizer.decode(['he', '##llo', 'wo', '##r', '##l', '##d'])
            == 'hello world')

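# The decode test above relies on WordPiece-style merging, where a leading
# '##' marks a continuation of the previous piece. The helper below is only
# an illustrative sketch of that joining rule (a hypothetical function, not
# the project's actual Tokenizer.decode implementation).
def _join_wordpieces(tokens):
    # Append '##'-prefixed pieces to the current word; any other piece
    # starts a new space-separated word.
    text = ''
    for token in tokens:
        if token.startswith('##'):
            text += token[2:]
        else:
            text += (' ' if text else '') + token
    return text


assert _join_wordpieces(['he', '##llo', 'wo', '##r', '##l', '##d']) == 'hello world'
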
def test_vocab_len(mock_open):
    mock_open.return_value = StringIO(_FAKE_VOCAB_DATA)

    vocab = Vocab(vocab_path=None,
                  unk_token='[UNK]',
                  bos_token='[BOS]',
                  eos_token='[EOS]',
                  pad_token='[PAD]')

    assert len(vocab) == 8

def test_tokenized_corpus_skip(mock_open):
    mock_open.side_effect = [StringIO(_FAKE_VOCAB_DATA),
                             StringIO(_FAKE_CORPUS_DATA)]

    vocab = Vocab(vocab_path=None,
                  unk_token='[UNK]',
                  bos_token='[BOS]',
                  eos_token='[EOS]',
                  pad_token='[PAD]')
    dataset = TokenizedCorpus(corpus_path=None, vocab=vocab, seq_len=10)

    # Ignore the first two sequences and fetch the next data.
    dataset.skip(2)

    data = dataset.fetch()
    assert data['input'].tolist() == [0, 9, 4, 5, 12, 7, 4, 6, 1, 2]
    assert data['output'].tolist() == [9, 4, 5, 12, 7, 4, 6, 1, 2, 2]

def test_vocab_properties(mock_open):
    mock_open.return_value = StringIO(_FAKE_VOCAB_DATA)

    vocab = Vocab(vocab_path=None,
                  unk_token='[UNK]',
                  bos_token='[BOS]',
                  eos_token='[EOS]',
                  pad_token='[PAD]')

    # Get indices of special tokens by properties.
    assert vocab.unk_idx == 3
    assert vocab.bos_idx == 0
    assert vocab.eos_idx == 1
    assert vocab.pad_idx == 2

def test_tokenized_corpus_where_and_assign(mock_open):
    mock_open.side_effect = [StringIO(_FAKE_VOCAB_DATA),
                             StringIO(_FAKE_CORPUS_DATA),
                             StringIO(_FAKE_CORPUS_DATA)]

    vocab = Vocab(vocab_path=None,
                  unk_token='[UNK]',
                  bos_token='[BOS]',
                  eos_token='[EOS]',
                  pad_token='[PAD]')
    dataset = TokenizedCorpus(corpus_path=None, vocab=vocab, seq_len=10)

    # Create another dataset with the state of the original dataset.
    dataset.skip(2)
    where = dataset.where()

    dataset = TokenizedCorpus(corpus_path=None, vocab=vocab, seq_len=10)
    dataset.assign(where)

    # Since the original dataset ignored the first two sequences, the new
    # dataset must fetch from after those two sequences.
    data = dataset.fetch()
    assert data['input'].tolist() == [0, 9, 4, 5, 12, 7, 4, 6, 1, 2]
    assert data['output'].tolist() == [9, 4, 5, 12, 7, 4, 6, 1, 2, 2]

def test_vocab_contains(mock_open):
    mock_open.return_value = StringIO(_FAKE_VOCAB_DATA)

    vocab = Vocab(vocab_path=None,
                  unk_token='[UNK]',
                  bos_token='[BOS]',
                  eos_token='[EOS]',
                  pad_token='[PAD]')

    # The vocabulary must contain the tokens below.
    assert '[BOS]' in vocab
    assert '[EOS]' in vocab
    assert '[PAD]' in vocab
    assert '[UNK]' in vocab
    assert 'TOKEN#1' in vocab
    assert 'TOKEN#2' in vocab
    assert 'TOKEN#3' in vocab
    assert 'TOKEN#4' in vocab

    # These are not defined in the vocabulary.
    assert 'TOKEN#5' not in vocab
    assert 'TOKEN#6' not in vocab
    assert 'TOKEN#7' not in vocab
    assert 'TOKEN#8' not in vocab

def test_tokenized_corpus_fetch(mock_open):
    mock_open.side_effect = [StringIO(_FAKE_VOCAB_DATA),
                             StringIO(_FAKE_CORPUS_DATA)]

    vocab = Vocab(vocab_path=None,
                  unk_token='[UNK]',
                  bos_token='[BOS]',
                  eos_token='[EOS]',
                  pad_token='[PAD]')
    dataset = TokenizedCorpus(corpus_path=None, vocab=vocab, seq_len=10)

    # Fetch a single sequence from the corpus.
    data = dataset.fetch()
    assert data['input'].tolist() == [0, 8, 10, 12, 7, 4, 6, 1, 2, 2]
    assert data['output'].tolist() == [8, 10, 12, 7, 4, 6, 1, 2, 2, 2]

    # Fetch a batch of sequences from the corpus.
    data = dataset.fetch(batch=2)
    assert data['input'].tolist() == [[0, 8, 11, 5, 12, 7, 4, 6, 1, 2],
                                      [0, 9, 4, 5, 12, 7, 4, 6, 1, 2]]
    assert data['output'].tolist() == [[8, 11, 5, 12, 7, 4, 6, 1, 2, 2],
                                       [9, 4, 5, 12, 7, 4, 6, 1, 2, 2]]

    # After consuming all sequences in the corpus, the dataset must wrap
    # around and fetch from the beginning of the corpus again.
    data = dataset.fetch()
    assert data['input'].tolist() == [0, 8, 10, 12, 7, 4, 6, 1, 2, 2]
    assert data['output'].tolist() == [8, 10, 12, 7, 4, 6, 1, 2, 2, 2]

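# Every test above takes a `mock_open` argument and drives it through
# `return_value` or `side_effect`, which suggests a pytest fixture that
# patches the built-in open(). The sketch below is an assumption about how
# such a fixture could look; the actual fixture name, scope, and patch
# target in the test suite may differ.
import pytest
from unittest import mock


@pytest.fixture
def mock_open():
    # Replace builtins.open with a mock so the tests can feed Vocab and
    # TokenizedCorpus from in-memory StringIO buffers instead of real files.
    with mock.patch('builtins.open') as mocked:
        yield mocked
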
def initialize(self):
    self.vocab = Vocab(vocab_path=self.vocab_path)
    self.criterion = nn.CrossEntropyLoss(reduction='none')

def initialize(self):
    self.vocab = Vocab(vocab_path=self.vocab_path)
    self.criterion = nn.CrossEntropyLoss(ignore_index=self.vocab.pad_idx,
                                         reduction='mean')

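# The two initialize() variants above configure nn.CrossEntropyLoss
# differently: one keeps per-token losses, the other averages over
# non-padding positions only. The snippet below is a self-contained
# illustration of that contrast; the pad index, logits, and targets are
# made up for the example and do not come from the project.
import torch
import torch.nn as nn

pad_idx = 2
logits = torch.randn(4, 16)                       # 4 positions, 16-token vocabulary
targets = torch.tensor([5, 7, pad_idx, pad_idx])  # last two positions are padding

# reduction='none': one loss value per position, padding included, so any
# masking has to be done by the caller.
per_token_loss = nn.CrossEntropyLoss(reduction='none')(logits, targets)
assert per_token_loss.shape == (4,)

# ignore_index=pad_idx, reduction='mean': padding targets are excluded and
# the mean is taken over the remaining (non-pad) positions only.
mean_loss = nn.CrossEntropyLoss(ignore_index=pad_idx, reduction='mean')(logits, targets)
assert mean_loss.dim() == 0
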
def initialize(self):
    self.vocab = Vocab(vocab_path=self.vocab_path)
    self.tokenizer = Tokenizer(vocab=self.vocab)