Example 1
    def test_two_word_seq(self):
        data_source = getStream(self.test_words_long)
        ts = split_corpus_dataset.DomainAdaptationSplit(
            data_source, self.vocab, 2, 0.5)
        tokens_strings = list(iter(ts))
        expectation = [(torch.LongTensor([0, 1]), torch.LongTensor([2]))]
        self.assertEqual(tokens_strings, expectation)
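All of the examples on this page reference shared fixtures (getStream, self.vocab, self.test_words_short, self.test_words_long) that the snippets themselves do not define. Below is a minimal sketch of what those fixtures might look like, inferred from the expected LongTensor values in the tests; the actual helper and setUp in the source test suite may differ:

import io
import unittest

import torch

import split_corpus_dataset  # module under test; assumed importable


def getStream(words):
    # A file-like object over the space-joined words; the splits only
    # need something they can read the token stream from.
    return io.StringIO(" ".join(words))


class SplitTests(unittest.TestCase):  # hypothetical name for the enclosing TestCase
    def setUp(self):
        # Vocabulary inferred from the expected tensors: 'a' -> 0, 'b' -> 1, 'c' -> 2.
        self.vocab = {'a': 0, 'b': 1, 'c': 2}
        self.test_words_short = ['a', 'b', 'c', 'a']      # token ids [0, 1, 2, 0]
        self.test_words_long = ['a', 'b', 'c', 'a', 'a']  # token ids [0, 1, 2, 0, 0]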
Example 2
    def test_single_word(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        tokens_string = next(iter(ts))
        expectation = (torch.LongTensor([0]), torch.LongTensor([1]))  # input, target
        self.assertEqual(tokens_string, expectation)
Example 3
    def test_two_word_seq_long(self):
        data_source = getStream(self.test_words_long)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 2)
        tokens_strings = list(iter(ts))
        expectation = [(torch.LongTensor([0, 1]), torch.LongTensor([1, 2])),
                       (torch.LongTensor([2, 0]), torch.LongTensor([0, 0]))]
        self.assertEqual(tokens_strings, expectation)
Example 4
    def test_single_word_retrieval(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.DomainAdaptationSplit(
            data_source, self.vocab, 1, end_portion=0.5)
        words = list(ts.input_words())
        self.assertEqual(words, ['a'])  # we expect the input words
Example 5
    def get_tokenized_splits(self, word_seqs, unroll):
        files = [getStream(seq) for seq in word_seqs]
        tss = [
            split_corpus_dataset.TokenizedSplit(f, self.vocab, unroll)
            for f in files
        ]

        return tss
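A hypothetical use of this helper, building one TokenizedSplit per word sequence; the sequences below are illustrative only, not taken from the original suite:

    def test_helper_usage(self):
        # All splits share the same vocabulary and unroll length.
        tss = self.get_tokenized_splits([['a', 'b', 'c'], ['c', 'b', 'a']], unroll=1)
        self.assertEqual(len(tss), 2)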
Example 6
    def test_single_word_seq(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        tokens_strings = list(iter(ts))
        expectation = [(torch.LongTensor([0]), torch.LongTensor([1])),
                       (torch.LongTensor([1]), torch.LongTensor([2])),
                       (torch.LongTensor([2]), torch.LongTensor([0]))]
        self.assertEqual(tokens_strings, expectation)
Example 7
    def test_single_word(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.DomainAdaptationSplitFFMultiTarget(
            data_source, self.vocab, 1, 1, end_portion=0.5)
        tokens_string = next(iter(ts))
        expectation = (torch.LongTensor([0]), torch.LongTensor([1]))  # input, target
        self.assertEqual(tokens_string, expectation)
Example 8
    def test_two_word_seq_long_mt(self):
        data_source = getStream(self.test_words_long)
        ts = split_corpus_dataset.DomainAdaptationSplitFFMultiTarget(
            data_source, self.vocab, 2, 2, end_portion=0.25)
        tokens_strings = list(iter(ts))
        expectation = [
            (torch.LongTensor([0, 1, 2]), torch.LongTensor([2, 0])),
        ]
        self.assertEqual(tokens_strings, expectation)
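The remaining examples also rely on ivec_appenders and on self.ivec_eetor, an i-vector extractor called with a space-joined token history. A minimal stand-in, assuming only that the extractor is deterministic and that its outputs compare equal with ==; the original suite likely defines its own dummy:

import ivec_appenders  # assumed importable alongside split_corpus_dataset


class AppenderTests(SplitTests):  # hypothetical; reuses the fixtures sketched above
    def setUp(self):
        super().setUp()
        # Dummy extractor: tagging the raw history string keeps outputs
        # deterministic and trivially comparable with ==.
        self.ivec_eetor = lambda text: ('ivec', text)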
Example 9
    def test_iter_ends(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)
        appender = iter(appender)

        next(appender)
        next(appender)
        next(appender)

        self.assertRaises(StopIteration, next, appender)
Example 10
    def test_single_data(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)

        # cannot access ts._tokens, it's an implementation detail
        tokens = [self.vocab[w] for w in self.test_words_short]

        expectation = self.ivec_eetor(" ".join(self.test_words_short[:-1]))
        seqs = next(iter(appender))
        first = seqs[2]

        self.assertEqual(first, expectation)
Example 11
    def test_whole_seq(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        appender = ivec_appenders.HistoryIvecAppender(ts, self.ivec_eetor)

        # cannot access ts._tokens, it's an implementation detail
        tokens = [self.vocab[w] for w in self.test_words_short]

        expectation = [
            self.ivec_eetor(" ".join(self.test_words_short[:0])),
            self.ivec_eetor(" ".join(self.test_words_short[:1])),
            self.ivec_eetor(" ".join(self.test_words_short[:2])),
        ]
        seqs = [x[2] for x in appender]

        self.assertEqual(seqs, expectation)
Example 12
    def test_whole_seq_with_next(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)
        appender = iter(appender)

        # cannot access ts._tokens, it's an implementation detail
        tokens = [self.vocab[w] for w in self.test_words_short]
        expectation = [
            self.ivec_eetor(" ".join(self.test_words_short[:-1])),
            self.ivec_eetor(" ".join(self.test_words_short[:-1])),
            self.ivec_eetor(" ".join(self.test_words_short[:-1]))
        ]

        seq0 = next(appender)[2]
        self.assertEqual(seq0, expectation[0])

        seq1 = next(appender)[2]
        self.assertEqual(seq1, expectation[1])

        seq2 = next(appender)[2]
        self.assertEqual(seq2, expectation[2])
Example 13
    def test_single_word_len(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplitSingleTarget(
            data_source, self.vocab, 1)
        self.assertEqual(len(ts), len(self.test_words_short) - 1)
Example 14
    def test_two_word_retrieval(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 2)
        words = list(ts.input_words())
        self.assertEqual(words, ['a b'])  # we expect the input words
Example 15
    def test_len_no_output(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 5)
        self.assertEqual(len(ts), 0)
Example 16
    def test_two_word_retrieval(self):
        data_source = getStream(self.test_words_long)
        ts = split_corpus_dataset.DomainAdaptationSplitFFMultiTarget(
            data_source, self.vocab, 2, 1, end_portion=0.5)
        words = list(ts.input_words())
        self.assertEqual(words, ['a a'])  # we expect the input words
Example 17
    def test_single_word_len(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.DomainAdaptationSplit(
            data_source, self.vocab, 1, 0.5)
        self.assertEqual(len(ts), 2)
Example 18
    def test_len_no_output(self):
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.DomainAdaptationSplitFFMultiTarget(
            data_source, self.vocab, 5, 1, end_portion=0.5)
        self.assertEqual(len(ts), 0)