Example #1
    def test_single_word_seq(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        tokens_strings = list(iter(ts))
        expectation = [(torch.LongTensor([0]), torch.LongTensor([1])),
                       (torch.LongTensor([1]), torch.LongTensor([2])),
                       (torch.LongTensor([2]), torch.LongTensor([0]))]
        self.assertEqual(tokens_strings, expectation)
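All of these examples rely on fixtures set up elsewhere in the test class: getStream, self.test_words_short, self.vocab and self.ivec_eetor. A minimal sketch of what such fixtures could look like, consistent with the expected values above; the concrete words, vocabulary and extractor below are assumptions, not the project's real setup:

import io

def getStream(words):
    # assumed helper: expose a word sequence as a file-like object
    return io.StringIO(" ".join(words))

class FixtureSketch:
    def setUp(self):
        # hypothetical values: 'a b c a' maps to tokens [0, 1, 2, 0], which is
        # consistent with the (input, target) pairs expected in Example #1
        self.test_words_short = ["a", "b", "c", "a"]
        self.vocab = {"a": 0, "b": 1, "c": 2}
        # stand-in "i-vector extractor": any deterministic callable over the
        # history string satisfies the equality checks in the appender tests
        self.ivec_eetor = lambda history: float(len(history.split()))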
Example #2
    def get_tokenized_splits(self, word_seqs, unroll):
        files = [getStream(seq) for seq in word_seqs]
        tss = [
            split_corpus_dataset.TokenizedSplit(f, self.vocab, unroll)
            for f in files
        ]

        return tss
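A hypothetical call to this helper; the word sequences are made up, and the point is simply that one TokenizedSplit is built per sequence:

word_seqs = [["a", "b", "c"], ["b", "c", "a"]]   # made-up sequences
tss = self.get_tokenized_splits(word_seqs, unroll=1)
assert len(tss) == len(word_seqs)  # one TokenizedSplit per input sequence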
Example #3
    def test_iter_ends(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)
        appender = iter(appender)

        next(appender)
        next(appender)
        next(appender)

        self.assertRaises(StopIteration, next, appender)
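The same end-of-iteration behaviour can be checked by materialising the iterator instead of calling next() repeatedly; a sketch assuming the short fixture yields exactly three positions, as above:

data_source = getStream(self.test_words_short)
ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)
# three items come out, then the iterator is exhausted
self.assertEqual(len(list(iter(appender))), 3)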
Example #4
    def test_single_data(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)

        # cannot access ts._tokens, it's an implementation detail
        tokens = [self.vocab[w] for w in self.test_words_short]

        expectation = self.ivec_eetor(" ".join(self.test_words_short[:-1]))
        seqs = next(iter(appender))
        first = seqs[2]

        self.assertEqual(first, expectation)
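The indexing with seqs[2] suggests each item from the appender is a tuple whose third element is the i-vector; a sketch of unpacking one item, under the assumption (not confirmed here) that the first two slots carry the input and target tokens from the underlying TokenizedSplit:

data_source = getStream(self.test_words_short)
ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)
# assumed item layout: (input_tokens, target_tokens, ivec)
x, t, ivec = next(iter(appender))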
Example #5
    def test_whole_seq(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        appender = ivec_appenders.HistoryIvecAppender(ts, self.ivec_eetor)

        # cannot access ts._tokens, it's an implementation detail
        tokens = [self.vocab[w] for w in self.test_words_short]

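        # HistoryIvecAppender: the i-vector at each step is extracted from the
        # sentence prefix seen so far (empty, first word, first two words)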
        expectation = [
            self.ivec_eetor(" ".join(self.test_words_short[:0])),
            self.ivec_eetor(" ".join(self.test_words_short[:1])),
            self.ivec_eetor(" ".join(self.test_words_short[:2])),
        ]
        seqs = [x[2] for x in (iter(appender))]

        self.assertEqual(seqs, expectation)
Example #6
    def test_whole_seq_with_next(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)
        appender = iter(appender)

        # cannot access ts._tokens, it's an implementation detail
        tokens = [self.vocab[w] for w in self.test_words_short]
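        # CheatingIvecAppender: every step gets the same i-vector, extracted
        # from the whole sentence except its final word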
        expectation = [
            self.ivec_eetor(" ".join(self.test_words_short[:-1])),
            self.ivec_eetor(" ".join(self.test_words_short[:-1])),
            self.ivec_eetor(" ".join(self.test_words_short[:-1]))
        ]

        seq0 = next(appender)[2]
        self.assertEqual(seq0, expectation[0])

        seq1 = next(appender)[2]
        self.assertEqual(seq1, expectation[1])

        seq2 = next(appender)[2]
        self.assertEqual(seq2, expectation[2])
Example #7
    def test_two_word_retrieval(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 2)
        words = list(ts.input_words())
        self.assertEqual(words, ['a b'])  # input words per unroll step, whitespace-joined
Example #8
    def test_len_no_output(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 5)
        # the short sequence cannot fill a single unroll of 5, so the split is empty
        self.assertEqual(len(ts), 0)
Example #9
    def test_single_word_len(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        # with unroll 1, every word except the last yields one (input, target) position
        self.assertEqual(len(ts), len(self.test_words_short) - 1)