Example #1
def test_vocab_getitem(mock_open):
    mock_open.return_value = StringIO(_FAKE_VOCAB_DATA)
    vocab = Vocab(vocab_path=None,
                  unk_token='[UNK]',
                  bos_token='[BOS]',
                  eos_token='[EOS]',
                  pad_token='[PAD]')

    # Get index by token.
    assert vocab['[BOS]'] == 0
    assert vocab['[EOS]'] == 1
    assert vocab['[PAD]'] == 2
    assert vocab['[UNK]'] == 3
    assert vocab['TOKEN#1'] == 4
    assert vocab['TOKEN#2'] == 5
    assert vocab['TOKEN#3'] == 6
    assert vocab['TOKEN#4'] == 7

    # Get token by index.
    assert vocab[0] == '[BOS]'
    assert vocab[1] == '[EOS]'
    assert vocab[2] == '[PAD]'
    assert vocab[3] == '[UNK]'
    assert vocab[4] == 'TOKEN#1'
    assert vocab[5] == 'TOKEN#2'
    assert vocab[6] == 'TOKEN#3'
    assert vocab[7] == 'TOKEN#4'
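The tests on this page rely on a mock_open fixture and on _FAKE_VOCAB_DATA / _FAKE_CORPUS_DATA strings that are defined elsewhere in the test module and not shown here. As a rough sketch only, such a fixture could be wired up with pytest and unittest.mock along the following lines; the patch target (builtins.open) is an assumption, not taken from the original suite.

from unittest import mock

import pytest


@pytest.fixture
def mock_open():
    # Assumption: Vocab and TokenizedCorpus read their files through the
    # built-in open(), so patching builtins.open lets each test feed them
    # in-memory StringIO buffers via return_value or side_effect.
    with mock.patch('builtins.open') as mocked:
        yield mocked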
Example #2
def test_tokenizer_decode(mock_open):
    mock_open.return_value = StringIO(_FAKE_VOCAB_DATA)

    vocab = Vocab(vocab_path=None, unk_token='[UNK]')
    tokenizer = Tokenizer(vocab)

    assert (tokenizer.decode(['he', '##llo', 'wo', '##r', '##l', '##d'])
            == 'hello world')
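The expected result reflects the usual WordPiece convention: a piece starting with '##' is appended to the previous piece, and any other piece starts a new whitespace-separated word. A standalone sketch of that joining rule (an illustration, not the library's actual decode implementation):

def join_wordpieces(tokens):
    # Glue '##'-prefixed pieces onto the previous word; otherwise start a
    # new word.
    words = []
    for token in tokens:
        if token.startswith('##') and words:
            words[-1] += token[2:]
        else:
            words.append(token)
    return ' '.join(words)


assert join_wordpieces(['he', '##llo', 'wo', '##r', '##l', '##d']) == 'hello world'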
Example #3
def test_vocab_len(mock_open):
    mock_open.return_value = StringIO(_FAKE_VOCAB_DATA)
    vocab = Vocab(vocab_path=None,
                  unk_token='[UNK]',
                  bos_token='[BOS]',
                  eos_token='[EOS]',
                  pad_token='[PAD]')

    assert len(vocab) == 8
Example #4
def test_tokenized_corpus_skip(mock_open):
    mock_open.side_effect = [StringIO(_FAKE_VOCAB_DATA),
                             StringIO(_FAKE_CORPUS_DATA)]
    vocab = Vocab(vocab_path=None, unk_token='[UNK]', bos_token='[BOS]',
                  eos_token='[EOS]', pad_token='[PAD]')
    dataset = TokenizedCorpus(corpus_path=None, vocab=vocab, seq_len=10)

    # Skip the first two sequences and fetch the next one.
    dataset.skip(2)
    data = dataset.fetch()
    assert data['input'].tolist() == [0, 9, 4, 5, 12, 7, 4, 6, 1, 2]
    assert data['output'].tolist() == [9, 4, 5, 12, 7, 4, 6, 1, 2, 2]
Example #5
def test_vocab_properties(mock_open):
    mock_open.return_value = StringIO(_FAKE_VOCAB_DATA)
    vocab = Vocab(vocab_path=None,
                  unk_token='[UNK]',
                  bos_token='[BOS]',
                  eos_token='[EOS]',
                  pad_token='[PAD]')

    # Get indices of special tokens by properties.
    assert vocab.unk_idx == 3
    assert vocab.bos_idx == 0
    assert vocab.eos_idx == 1
    assert vocab.pad_idx == 2
Example #6
def test_tokenized_corpus_where_and_assign(mock_open):
    mock_open.side_effect = [StringIO(_FAKE_VOCAB_DATA),
                             StringIO(_FAKE_CORPUS_DATA),
                             StringIO(_FAKE_CORPUS_DATA)]
    vocab = Vocab(vocab_path=None, unk_token='[UNK]', bos_token='[BOS]',
                  eos_token='[EOS]', pad_token='[PAD]')
    dataset = TokenizedCorpus(corpus_path=None, vocab=vocab, seq_len=10)

    # Create another dataset with the state of the original dataset.
    dataset.skip(2)
    where = dataset.where()

    dataset = TokenizedCorpus(corpus_path=None, vocab=vocab, seq_len=10)
    dataset.assign(where)

    # Since the original dataset skipped the first two sequences, the new
    # dataset must fetch from after those two sequences.
    data = dataset.fetch()
    assert data['input'].tolist() == [0, 9, 4, 5, 12, 7, 4, 6, 1, 2]
    assert data['output'].tolist() == [9, 4, 5, 12, 7, 4, 6, 1, 2, 2]
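The where()/assign() pair serializes and restores the dataset's read position, which is what makes training resumable. A hedged sketch of combining it with a model checkpoint (the checkpoint layout and helper names here are assumptions):

import torch


def save_checkpoint(path, model, dataset):
    # Persist the model weights together with the dataset position.
    torch.save({'model': model.state_dict(),
                'dataset_where': dataset.where()}, path)


def load_checkpoint(path, model, dataset):
    ckpt = torch.load(path)
    model.load_state_dict(ckpt['model'])
    # Resume fetching from exactly where the previous run stopped.
    dataset.assign(ckpt['dataset_where'])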
Example #7
def test_vocab_contains(mock_open):
    mock_open.return_value = StringIO(_FAKE_VOCAB_DATA)
    vocab = Vocab(vocab_path=None,
                  unk_token='[UNK]',
                  bos_token='[BOS]',
                  eos_token='[EOS]',
                  pad_token='[PAD]')

    # The vocabulary must contain the tokens below.
    assert '[BOS]' in vocab
    assert '[EOS]' in vocab
    assert '[PAD]' in vocab
    assert '[UNK]' in vocab
    assert 'TOKEN#1' in vocab
    assert 'TOKEN#2' in vocab
    assert 'TOKEN#3' in vocab
    assert 'TOKEN#4' in vocab

    # These are not defined in the vocabulary.
    assert 'TOKEN#5' not in vocab
    assert 'TOKEN#6' not in vocab
    assert 'TOKEN#7' not in vocab
    assert 'TOKEN#8' not in vocab
Example #8
def test_tokenized_corpus_fetch(mock_open):
    mock_open.side_effect = [StringIO(_FAKE_VOCAB_DATA),
                             StringIO(_FAKE_CORPUS_DATA)]
    vocab = Vocab(vocab_path=None, unk_token='[UNK]', bos_token='[BOS]',
                  eos_token='[EOS]', pad_token='[PAD]')
    dataset = TokenizedCorpus(corpus_path=None, vocab=vocab, seq_len=10)

    # Fetch a single sequence from the corpus.
    data = dataset.fetch()
    assert data['input'].tolist() == [0, 8, 10, 12, 7, 4, 6, 1, 2, 2]
    assert data['output'].tolist() == [8, 10, 12, 7, 4, 6, 1, 2, 2, 2]

    # Fetch a batch of sequences from the corpus.
    data = dataset.fetch(batch=2)
    assert data['input'].tolist() == [[0, 8, 11, 5, 12, 7, 4, 6, 1, 2],
                                      [0, 9, 4, 5, 12, 7, 4, 6, 1, 2]]
    assert data['output'].tolist() == [[8, 11, 5, 12, 7, 4, 6, 1, 2, 2],
                                       [9, 4, 5, 12, 7, 4, 6, 1, 2, 2]]

    # After all sequences have been consumed, the dataset must wrap around
    # and fetch from the beginning of the corpus again.
    data = dataset.fetch()
    assert data['input'].tolist() == [0, 8, 10, 12, 7, 4, 6, 1, 2, 2]
    assert data['output'].tolist() == [8, 10, 12, 7, 4, 6, 1, 2, 2, 2]
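The .tolist() calls suggest that fetch() returns tensors keyed by 'input' and 'output', where 'output' is 'input' shifted left by one position (the next-token target). A hedged sketch of a training step built on that assumption; the model, optimizer and criterion are placeholders passed in by the caller:

def train_step(dataset, model, optimizer, criterion, batch_size=2):
    data = dataset.fetch(batch=batch_size)
    logits = model(data['input'])                 # (batch, seq_len, vocab_size)
    # CrossEntropyLoss expects (batch, classes, seq_len) against (batch, seq_len).
    loss = criterion(logits.transpose(1, 2), data['output'])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()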
Example #9
    def initialize(self):
        self.vocab = Vocab(vocab_path=self.vocab_path)
        self.criterion = nn.CrossEntropyLoss(reduction='none')
Example #10
    def initialize(self):
        self.vocab = Vocab(vocab_path=self.vocab_path)
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.vocab.pad_idx,
                                             reduction='mean')
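Passing ignore_index=self.vocab.pad_idx makes the loss skip padded positions, so the trailing pad tokens seen in the dataset tests above do not dilute the averaged loss. A small plain-PyTorch illustration of that behaviour (pad index 2 and a 13-token vocabulary mirror the values used in the tests):

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss(ignore_index=2, reduction='mean')

logits = torch.randn(1, 5, 13)               # (batch, seq_len, vocab_size)
target = torch.tensor([[8, 10, 1, 2, 2]])    # trailing 2s are padding

# Targets equal to ignore_index are excluded from the averaged loss.
loss = criterion(logits.transpose(1, 2), target)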
Example #11
    def initialize(self):
        self.vocab = Vocab(vocab_path=self.vocab_path)
        self.tokenizer = Tokenizer(vocab=self.vocab)