def test_single_word(self):
    data_source = getStream(self.test_words_short)
    ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)

    tokens_string = next(iter(ts))
    expectation = (torch.LongTensor([0]), torch.LongTensor([1]))  # input, target
    self.assertEqual(tokens_string, expectation)
def test_two_word_seq_long(self):
    data_source = getStream(self.test_words_long)
    ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 2)

    tokens_strings = list(iter(ts))
    expectation = [
        (torch.LongTensor([0, 1]), torch.LongTensor([1, 2])),
        (torch.LongTensor([2, 0]), torch.LongTensor([0, 0])),
    ]
    self.assertEqual(tokens_strings, expectation)
def get_tokenized_splits(self, word_seqs, unroll):
    files = [getStream(seq) for seq in word_seqs]
    tss = [
        split_corpus_dataset.TokenizedSplit(f, self.vocab, unroll)
        for f in files
    ]

    return tss
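# A hedged usage sketch of the helper above (the sequences are
# illustrative, not fixtures defined in this file):
#
#   tss = self.get_tokenized_splits([['a', 'b', 'c'], ['b', 'c', 'a']], unroll=1)
#   assert all(len(ts) == 2 for ts in tss)  # every word but the last is an input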
def test_single_word_seq(self):
    data_source = getStream(self.test_words_short)
    ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)

    tokens_strings = list(iter(ts))
    expectation = [
        (torch.LongTensor([0]), torch.LongTensor([1])),
        (torch.LongTensor([1]), torch.LongTensor([2])),
        (torch.LongTensor([2]), torch.LongTensor([0])),
    ]
    self.assertEqual(tokens_strings, expectation)
def test_iter_ends(self):
    data_source = getStream(self.test_words_short)
    ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
    appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)
    appender = iter(appender)

    # the split yields three (input, target) pairs, so the iterator
    # must be exhausted after three next() calls
    next(appender)
    next(appender)
    next(appender)

    self.assertRaises(StopIteration, next, appender)
def test_single_data(self):
    data_source = getStream(self.test_words_short)
    ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
    appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)

    # cannot access ts._tokens, it's an implementation detail
    tokens = [self.vocab[w] for w in self.test_words_short]
    expectation = self.ivec_eetor(" ".join(self.test_words_short[:-1]))

    seqs = next(iter(appender))
    first = seqs[2]  # the i-vector is the third element of the yielded tuple
    self.assertEqual(first, expectation)
def test_whole_seq(self):
    data_source = getStream(self.test_words_short)
    ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
    appender = ivec_appenders.HistoryIvecAppender(ts, self.ivec_eetor)

    # cannot access ts._tokens, it's an implementation detail
    tokens = [self.vocab[w] for w in self.test_words_short]

    # HistoryIvecAppender extracts the i-vector from the words seen so
    # far, i.e. from an empty history at the first step
    expectation = [
        self.ivec_eetor(" ".join(self.test_words_short[:0])),
        self.ivec_eetor(" ".join(self.test_words_short[:1])),
        self.ivec_eetor(" ".join(self.test_words_short[:2])),
    ]

    seqs = [x[2] for x in iter(appender)]
    self.assertEqual(seqs, expectation)
def test_whole_seq_with_next(self):
    data_source = getStream(self.test_words_short)
    ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
    appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)
    appender = iter(appender)

    # cannot access ts._tokens, it's an implementation detail
    tokens = [self.vocab[w] for w in self.test_words_short]

    # CheatingIvecAppender yields the same i-vector, computed from the
    # whole split except the last word, at every position
    expectation = [
        self.ivec_eetor(" ".join(self.test_words_short[:-1])),
        self.ivec_eetor(" ".join(self.test_words_short[:-1])),
        self.ivec_eetor(" ".join(self.test_words_short[:-1])),
    ]

    seq0 = next(appender)[2]
    self.assertEqual(seq0, expectation[0])

    seq1 = next(appender)[2]
    self.assertEqual(seq1, expectation[1])

    seq2 = next(appender)[2]
    self.assertEqual(seq2, expectation[2])
def test_two_word_retrieval(self):
    data_source = getStream(self.test_words_short)
    ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 2)

    words = list(ts.input_words())
    self.assertEqual(words, ['a b'])  # the input side of the only complete 2-word unroll
def test_len_no_output(self):
    data_source = getStream(self.test_words_short)
    ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 5)

    # the unroll length exceeds the available data, so the split yields nothing
    self.assertEqual(len(ts), 0)
def test_single_word_len(self):
    data_source = getStream(self.test_words_short)
    ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)

    # every word except the last serves as an input
    self.assertEqual(len(ts), len(self.test_words_short) - 1)
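# ----------------------------------------------------------------------
# For reference, a minimal sketch of the fixtures the tests above assume.
# It is reconstructed from the expected tensors, NOT taken from the test
# class's actual setUp: the vocab appears to map 'a' -> 0, 'b' -> 1,
# 'c' -> 2, and getStream appears to wrap a word sequence as a file-like
# object.
#
#   import io
#
#   def getStream(words):
#       return io.StringIO(" ".join(words))
#
#   def setUp(self):
#       self.vocab = {'a': 0, 'b': 1, 'c': 2}
#       self.test_words_short = ['a', 'b', 'c', 'a']
#       self.test_words_long = ['a', 'b', 'c', 'a', 'a']
#       # any deterministic stand-in for an i-vector extractor will do
#       self.ivec_eetor = lambda history: hash(history)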