def test_characters(self):
    """Check character-tokenized generation from a short training text.

    The first generated item must be a ``str``. The output must contain
    "worm", which never appears verbatim in the training text, and must
    not contain "mals ", because "mals" is only ever followed by "!"
    in the training text.
    """
    training_text = (
        "What a piece of work is man! how noble in reason! how infinite in faculty! in form and moving how express and admirable! "
        "in action how like an angel! in apprehension how like a god! the beauty of the world, the paragon of animals!"
    )
    writer = final.RandomWriter(2, final.Tokenization.character)
    writer.train_iterable(training_text)

    first_item = next(iter(writer.generate()))
    self.assertIsInstance(first_item, str)

    # Each sequence check consumes its own, freshly created generator.
    self.assertContainsSequence(writer.generate(), "worm")
    self.assertNotContainsSequence(writer.generate(), "mals ")
def test_numeric_sequence_notin(self):
    """Check that certain runs never show up for an integer-trained writer.

    The writer is built with only the first constructor argument (2) and
    trained on a tuple of ints. The last forbidden run uses the string
    "5" rather than the int 5.
    """
    writer = final.RandomWriter(2)
    writer.train_iterable((1, 2, 3, 4, 5, 5, 5, 4, 3, 2, 1, 2, 4, 5))

    forbidden_runs = (
        [5, 5, 3],
        [1, 2, 5],
        [4, 2],
        ["5"],
    )
    # One fresh generator per check, in the same order as listed above.
    for run in forbidden_runs:
        self.assertNotContainsSequence(writer.generate(), run)
def test_words(self):
    """Check word-tokenized generation against expected word sequences.

    Two word pairs must never be generated and two sequences must be
    generated; the ``times`` keyword is forwarded unchanged to the
    ``assertContainsSequence`` helper, which is defined outside this block.
    """
    writer = final.RandomWriter(1, final.Tokenization.word)
    writer.train_iterable("the given iterable must contain the sequence the")

    never_the_the = "the the".split(" ")
    never_the_iterable = "the iterable".split(" ")
    self.assertNotContainsSequence(writer.generate(), never_the_the)
    self.assertNotContainsSequence(writer.generate(), never_the_iterable)

    middle_run = "iterable must contain".split(" ")
    tail_run = "the sequence".split(" ")
    self.assertContainsSequence(writer.generate(), middle_run, times=10)
    self.assertContainsSequence(writer.generate(), tail_run, times=200)
def test_numeric_sequence(self):
    """Check forbidden and expected runs for an integer-trained writer.

    The writer is trained on the tuple 1..5,5..1; three runs must not be
    generated and one longer run from the training data must be.
    """
    writer = final.RandomWriter(2)
    writer.train_iterable((1, 2, 3, 4, 5, 5, 4, 3, 2, 1))

    # Negative checks first, each against a fresh generator.
    for absent_run in ([5, 5, 3], [1, 2, 5], [2, 4]):
        self.assertNotContainsSequence(writer.generate(), absent_run)

    expected_run = [3, 4, 5, 5, 4, 3, 2]
    self.assertContainsSequence(writer.generate(), expected_run, times=10)
def test_bytes_nonutf8(self):
    """Check byte-tokenized generation on data containing non-UTF-8 bytes.

    The training bytes embed ``\\xff\\xfe`` after "a god!". The first
    generated item may be an ``int`` or a ``bytes`` object. The output must
    not contain ``b"mals "``, and must contain ``b"worm"`` and the embedded
    ``b"!\\xff\\xfe"`` run.
    """
    rw = final.RandomWriter(2, final.Tokenization.byte)
    rw.train_iterable(
        b"What a piece of work is man! how noble in reason! how infinite in faculty! in form and moving how express and admirable! "
        b"in action how like an angel! in apprehension how like a god!\xff\xfe the beauty of the world, the paragon of animals!"
    )

    # assertIsInstance reports the offending object and expected types on
    # failure, unlike assertTrue(isinstance(...)) which only says
    # "False is not true".
    self.assertIsInstance(next(iter(rw.generate())), (int, bytes))

    self.assertNotContainsSequence(rw.generate(), b"mals ")
    self.assertContainsSequence(rw.generate(), b"worm")
    self.assertContainsSequence(rw.generate(), b"!\xff\xfe")
def test_generate_count(self):
    """Check that ``generate()`` can supply at least 10000 items.

    Exactly 10000 items are requested through ``itertools.islice`` and the
    number actually produced is compared with that figure.
    """
    training_text = (
        "What a piece of work is man! how noble in reason! how infinite in faculty! in form and moving how express and admirable! "
        "in action how like an angel! in apprehension how like a god! the beauty of the world, the paragon of animals!"
    )
    writer = final.RandomWriter(2, final.Tokenization.character)
    writer.train_iterable(training_text)

    wanted = 10000
    limited_stream = itertools.islice(writer.generate(), wanted)
    # Count the items as they are pulled from the bounded stream.
    produced = sum(1 for _ in limited_stream)
    self.assertEqual(produced, wanted)
def test_train_iterator(self):
    """Check training from a one-shot iterator rather than a sequence.

    The training data is wrapped in ``iter(...)`` so it can only be
    traversed once. Generated items must be ``int`` and several runs must
    appear in the output.
    """
    training_numbers = (1, 2, 3, 4, 5, 5, 5, 4, 3, 2, 1, 2, 4, 5)
    writer = final.RandomWriter(1)
    writer.train_iterable(iter(training_numbers))

    first_item = next(iter(writer.generate()))
    self.assertIsInstance(first_item, int)

    self.assertContainsSequence(writer.generate(), [3, 4, 5, 5, 4, 3, 2], times=10)

    # Remaining runs are checked with the helper's default arguments.
    expected_runs = (
        [3, 4, 5, 5, 5, 5, 4, 3, 2],
        [5, 5, 5, 5, 5],
        [3, 2, 1, 2, 4, 5, 5, 4],
        [3, 2, 1, 2, 3, 4, 5, 5, 4],
    )
    for run in expected_runs:
        self.assertContainsSequence(writer.generate(), run)
def test_generate_file_size(self):
    """Check the size of the text file written by ``generate_file``.

    The file content, read back in text mode, must be at least
    ``DEFAULT_LENGTH`` characters long and at most ``DEFAULT_LENGTH + 2``.
    """
    writer = final.RandomWriter(1, final.Tokenization.character)
    writer.train_iterable("abcaea")

    with nonexistant_filename() as path:
        writer.generate_file(path, self.DEFAULT_LENGTH)
        with open(path, "rt") as handle:
            written = handle.read()

        lower_bound = self.DEFAULT_LENGTH
        self.assertGreaterEqual(len(written), lower_bound)
        upper_bound = self.DEFAULT_LENGTH + 2
        self.assertLessEqual(len(written), upper_bound)
def test_generate_file2(self):
    """Check the text file produced by a word-tokenized writer.

    The file content is read back as one string and searched for the
    substrings "the a" (must be absent) and "the word" (must be present;
    ``times=100`` is forwarded to the helper unchanged).
    """
    writer = final.RandomWriter(1, final.Tokenization.word)
    writer.train_iterable("a the word the")

    with nonexistant_filename() as path:
        writer.generate_file(path, self.DEFAULT_LENGTH)
        with open(path, "rt") as handle:
            written = handle.read()

        self.assertNotContainsSequence(written, "the a")
        self.assertContainsSequence(written, "the word", times=100)
def test_generate_file3(self):
    """Check the text file produced by a writer using ``Tokenization.none``.

    The writer is trained on ints; the file is read back as text and
    compared against space-separated renderings of the number runs.
    """
    writer = final.RandomWriter(2, final.Tokenization.none)
    writer.train_iterable((1, 2, 3, 4, 5, 5, 4, 3, 2, 1))

    with nonexistant_filename() as path:
        writer.generate_file(path, self.DEFAULT_LENGTH)
        with open(path, "rt") as handle:
            written = handle.read()

        for absent_text in ("5 5 3", "1 2 5"):
            self.assertNotContainsSequence(written, absent_text)
        self.assertContainsSequence(written, "3 4 5 5 4 3 2", times=100)
def test_multiple_generators(self):
    """Check that two generators from one writer can be used interleaved.

    Items are pulled alternately from two ``generate()`` results; each
    generator's collected output must independently contain "worm" and
    must not contain "mals ".
    """
    training_text = (
        "What a piece of work is man! how noble in reason! how infinite in faculty! in form and moving how express and admirable! "
        "in action how like an angel! in apprehension how like a god! the beauty of the world, the paragon of animals!"
    )
    writer = final.RandomWriter(2, final.Tokenization.character)
    writer.train_iterable(training_text)
    self.assertIsInstance(next(iter(writer.generate())), str)

    first_gen = writer.generate()
    second_gen = writer.generate()

    # Alternate between the generators: one item from each per step.
    interleaved_pairs = []
    for _ in range(self.DEFAULT_LENGTH):
        from_first = next(first_gen)
        from_second = next(second_gen)
        interleaved_pairs.append((from_first, from_second))

    # Transpose the pairs into one tuple of items per generator.
    for per_generator_output in zip(*interleaved_pairs):
        self.assertContainsSequence(per_generator_output, "worm")
        self.assertNotContainsSequence(per_generator_output, "mals ")
def test_save_load_pickle(self):
    """Check that a writer survives a ``save_pickle``/``load_pickle`` round trip.

    A writer is trained, pickled to a temporary file and loaded back. All
    sequence checks run against the *loaded* writer, so a broken save or
    load is detected instead of silently re-testing the original object.
    """
    rw = final.RandomWriter(1, final.Tokenization.character)
    rw.train_iterable("abcaea")

    with nonexistant_filename() as fn:
        rw.save_pickle(fn)
        rw2 = final.RandomWriter.load_pickle(fn)

        # Previously these assertions used ``rw`` and left ``rw2`` unused,
        # so the loaded model was never exercised.
        self.assertNotContainsSequence(rw2.generate(), "ac")
        self.assertNotContainsSequence(rw2.generate(), "aa")
        self.assertNotContainsSequence(rw2.generate(), "ce")
        self.assertContainsSequence(rw2.generate(), "abc", times=100)
        self.assertContainsSequence(rw2.generate(), "aeaeab", times=100)
def test_generate_file1(self):
    """Check the text file produced by a character-tokenized writer.

    After training on "abcaea", the file content must not contain "ac",
    "aa" or "ce", and must contain "abc" and "aeaeab" (``times=100`` is
    forwarded to the helper unchanged).
    """
    writer = final.RandomWriter(1, final.Tokenization.character)
    writer.train_iterable("abcaea")

    with nonexistant_filename() as path:
        writer.generate_file(path, self.DEFAULT_LENGTH)
        with open(path, "rt") as handle:
            written = handle.read()

        for absent_text in ("ac", "aa", "ce"):
            self.assertNotContainsSequence(written, absent_text)
        for present_text in ("abc", "aeaeab"):
            self.assertContainsSequence(written, present_text, times=100)
def test_generate_file4(self):
    """Check the binary file produced by a byte-tokenized writer.

    The training bytes use the same pattern as the text "abcaea", with
    0xfe, 0xff, 0x02 and 0x03 standing in for a, b, c and e. The file is
    read back in binary mode and checked for the corresponding byte runs.
    """
    writer = final.RandomWriter(1, final.Tokenization.byte)
    # Byte-for-letter mapping of the data below:  a   b   c   a   e   a
    training_bytes = b"\xfe\xff\x02\xfe\x03\xfe"
    writer.train_iterable(training_bytes)

    with nonexistant_filename() as path:
        writer.generate_file(path, self.DEFAULT_LENGTH)
        with open(path, "rb") as handle:
            written = handle.read()

        for absent_bytes in (b"\xfe\x02", b"\xfe\xfe", b"\x02\x03"):
            self.assertNotContainsSequence(written, absent_bytes)
        for present_bytes in (b"\xfe\xff\x02", b"\xfe\x03\xfe\x03\xfe\xff"):
            self.assertContainsSequence(written, present_bytes, times=100)
def test_train_url_utf8(self):
    """Check character-tokenized training from a URL.

    Trains via ``train_url`` and expects "ajtób" in the generated output;
    ``length=100000`` is forwarded to the helper unchanged.
    NOTE(review): requires network access to the URL below.
    """
    source_url = "http://www.singingwizard.org/stuff/utf8test.txt"
    writer = final.RandomWriter(5, final.Tokenization.character)
    writer.train_url(source_url)

    generated = writer.generate()
    self.assertContainsSequence(generated, "ajtób", length=100000)
def test_train_url_word(self):
    """Check word-tokenized training from a URL.

    Trains via ``train_url`` and expects the word pair "she", "had" in the
    generated output; ``length=100000`` is forwarded to the helper unchanged.
    NOTE(review): requires network access to the URL below.
    """
    source_url = "http://www.singingwizard.org/stuff/pg24132.txt"
    writer = final.RandomWriter(1, final.Tokenization.word)
    writer.train_url(source_url)

    expected_words = "she had".split()
    self.assertContainsSequence(writer.generate(), expected_words, length=100000)
def test_train_url_bytes(self):
    """Check byte-tokenized training from a URL.

    Trains via ``train_url`` and expects ``b"ad di"`` in the generated
    output; ``length=300000`` is forwarded to the helper unchanged.
    NOTE(review): requires network access to the URL below.
    """
    source_url = "http://www.singingwizard.org/stuff/pg24132.txt"
    writer = final.RandomWriter(4, final.Tokenization.byte)
    writer.train_url(source_url)

    generated = writer.generate()
    self.assertContainsSequence(generated, b"ad di", length=300000)
def test_bytes_nonutf8_file(self):
    """Check byte-tokenized training from a URL serving non-UTF-8 data.

    The first generated item may be an ``int`` or a ``bytes`` object. The
    output must contain ``b"\\xfe\\xff\\xfe"`` and must not contain
    ``b"\\x02\\xfe"``.
    NOTE(review): requires network access to the URL below.
    """
    rw = final.RandomWriter(1, final.Tokenization.byte)
    rw.train_url("http://www.singingwizard.org/stuff/nonutf8.txt")

    # assertIsInstance reports the offending object and expected types on
    # failure, unlike assertTrue(isinstance(...)) which only says
    # "False is not true".
    self.assertIsInstance(next(iter(rw.generate())), (int, bytes))

    self.assertContainsSequence(rw.generate(), b"\xfe\xff\xfe")
    self.assertNotContainsSequence(rw.generate(), b"\x02\xfe")