def test_23bit_file_offset_too_small(self):
    # Serializing with 3-byte file offsets has to raise OverflowError for a
    # trie this large, while 4-byte file offsets have to succeed.
    import re

    trie = Trie()

    # The high bit of the child offset stores a lookahead barrier, so the
    # file has to be smaller than 8M, not 16. Python has a recursion limit
    # of 1000, so we can't really insert a 8M character long string.
    # Instead, insert 130 keys (zero up to 129 'a' characters), each of them
    # carrying 32k 16bit result IDs. 129 isn't enough to overflow the
    # offsets.
    results_32k = list(range(32767))
    for i in range(130):
        trie.insert('a' * i, results_32k)

    # assertRaisesRegex() interprets its argument as a regular expression;
    # escape the message so the dots in it are matched literally instead of
    # as "any character"
    expected_message = re.escape(
        "Trie child offset too large to store in 23 bits, set "
        "SEARCH_FILE_OFFSET_BYTES = 4 in your conf.py.")
    with self.assertRaisesRegex(OverflowError, expected_message):
        trie.serialize(Serializer(
            file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1))

    # This should work
    trie.serialize(Serializer(
        file_offset_bytes=4, result_id_bytes=2, name_size_bytes=1))
def test_16bit_result_count(self):
    # A node with 128 results and a child subtree is serialized with every
    # configuration in trie_type_sizes and compared against the same
    # expected dump; the serialized length is bounds-checked per
    # configuration.
    trie = Trie()

    for result_id in range(128):
        trie.insert("__init__", result_id)
    # It's __init_subclass__ (one underscore, not two), but here I want to
    # trigger the case of both a high amount of results and some children
    # as well.
    for result_id in [203, 215, 267]:
        trie.insert("__init__subclass__", result_id)

    # NOTE(review): the expected dump is restored to one node per line with
    # the child indented under its parent -- confirm the exact indentation
    # against the pretty-printer output.
    expected = """
__init__ [{}]
        subclass__ [203, 215, 267]
""".format(', '.join(str(n) for n in range(128)))

    for sizes in trie_type_sizes:
        with self.subTest(**sizes):
            serialized = trie.serialize(Serializer(**sizes))
            self.compare(Deserializer(**sizes), serialized, expected)

            # Verify just the smallest and largest size, everything else
            # should fit in between
            if sizes['file_offset_bytes'] == 3 and sizes['result_id_bytes'] == 2:
                self.assertEqual(len(serialized), 377)
            elif sizes['file_offset_bytes'] == 4 and sizes['result_id_bytes'] == 4:
                self.assertEqual(len(serialized), 657)
            else:
                self.assertGreater(len(serialized), 377)
                self.assertLess(len(serialized), 657)
def test_unicode(self):
    # Two keys with non-ASCII characters are serialized with 3-byte file
    # offsets, 2-byte result IDs and 1-byte name sizes. The expected dump
    # lists bytes above 127 as hex values, each on its own line.
    trie = Trie()
    trie.insert("hýždě", 0)
    trie.insert("hárá", 1)

    serialized = trie.serialize(Serializer(
        file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1))

    # NOTE(review): the expected dump is restored to one entry per line with
    # its tree-guide indentation -- confirm the exact spacing against the
    # pretty-printer output.
    self.compare(
        Deserializer(file_offset_bytes=3, result_id_bytes=2, name_size_bytes=1),
        serialized,
        """
h0xc3
  0xbd
   0xc5
  | 0xbe
  |  d0xc4
  |    0x9b
  |      [0]
  0xa1
   r0xc3
  |  0xa1
  |    [1]
""")
    self.assertEqual(len(serialized), 82)
def test_empty(self):
    # With nothing inserted, every configuration in trie_type_sizes has to
    # produce a 6-byte blob that compares equal to an empty expected string
    empty_trie = Trie()

    for sizes in trie_type_sizes:
        with self.subTest(**sizes):
            blob = empty_trie.serialize(Serializer(**sizes))
            reader = Deserializer(**sizes)

            self.compare(reader, blob, "")
            self.assertEqual(len(blob), 6)
def test_24bit_result_id_too_small(self):
    # A result ID of 16 * 1024 * 1024 (one past the largest 24-bit value)
    # has to raise OverflowError with 3-byte result IDs and serialize fine
    # with 4-byte ones.
    import re

    trie = Trie()
    trie.insert("a", 16 * 1024 * 1024)

    # assertRaisesRegex() interprets its argument as a regular expression;
    # escape the message so the dots in it are matched literally instead of
    # as "any character"
    expected_message = re.escape(
        "Trie result ID too large to store in 24 bits, set "
        "SEARCH_RESULT_ID_BYTES = 4 in your conf.py.")
    with self.assertRaisesRegex(OverflowError, expected_message):
        trie.serialize(Serializer(
            file_offset_bytes=3, result_id_bytes=3, name_size_bytes=1))

    # This should work
    trie.serialize(Serializer(
        file_offset_bytes=3, result_id_bytes=4, name_size_bytes=1))
def test_single(self):
    # One key holding two results: the dump has to list both results in
    # insertion order and the serialized blob has to be 46 bytes long
    single_key_trie = Trie()
    for result_id in (1337, 21):
        single_key_trie.insert("magnum", result_id)

    blob = single_key_trie.serialize()

    self.compare(blob, """ magnum [1337, 21] """)
    self.assertEqual(len(blob), 46)
def test_single(self):
    # One key holding two results, serialized with every configuration in
    # trie_type_sizes and compared against the same expected dump
    single_key_trie = Trie()
    for result_id in (1337, 21):
        single_key_trie.insert("magnum", result_id)

    # Only the smallest and the largest configuration get an exact size
    # check, everything else has to land strictly between the two
    smallest_size, largest_size = 46, 56

    for sizes in trie_type_sizes:
        with self.subTest(**sizes):
            blob = single_key_trie.serialize(Serializer(**sizes))
            self.compare(Deserializer(**sizes), blob, """ magnum [1337, 21] """)

            widths = (sizes['file_offset_bytes'], sizes['result_id_bytes'])
            blob_size = len(blob)
            if widths == (3, 2):
                self.assertEqual(blob_size, smallest_size)
            elif widths == (4, 4):
                self.assertEqual(blob_size, largest_size)
            else:
                self.assertGreater(blob_size, smallest_size)
                self.assertLess(blob_size, largest_size)
def test_unicode(self):
    # Two keys with non-ASCII characters are serialized with the default
    # serialize() arguments. The expected dump lists bytes above 127 as hex
    # values, each on its own line.
    trie = Trie()
    trie.insert("hýždě", 0)
    trie.insert("hárá", 1)

    serialized = trie.serialize()

    # NOTE(review): the expected dump is restored to one entry per line with
    # its tree-guide indentation -- confirm the exact spacing against the
    # pretty-printer output.
    self.compare(serialized, """
h0xc3
  0xbd
   0xc5
  | 0xbe
  |  d0xc4
  |    0x9b
  |      [0]
  0xa1
   r0xc3
  |  0xa1
  |    [1]
""")
    self.assertEqual(len(serialized), 82)
def test_multiple(self):
    # Inserts a set of nested "math::..." names together with their shorter
    # suffixes, some with lookahead barriers, then checks the dump of the
    # serialized trie and its total size.
    trie = Trie()

    # Top-level name and its direct members
    trie.insert("math", 0)
    trie.insert("math::vector", 1, lookahead_barriers=[4])
    trie.insert("vector", 1)
    trie.insert("math::range", 2)
    trie.insert("range", 2)

    trie.insert("math::min", 3)
    trie.insert("min", 3)
    trie.insert("math::max", 4)
    trie.insert("max", 4)
    trie.insert("math::minmax", 5)
    trie.insert("minmax", 5)

    # Members nested under math::vector
    trie.insert("math::vector::minmax", 6, lookahead_barriers=[4, 12])
    trie.insert("vector::minmax", 6, lookahead_barriers=[6])
    trie.insert("minmax", 6)
    trie.insert("math::vector::min", 7)
    trie.insert("vector::min", 7)
    trie.insert("min", 7)
    trie.insert("math::vector::max", 8)
    trie.insert("vector::max", 8)
    trie.insert("max", 8)

    # Members nested under math::range
    trie.insert("math::range::min", 9, lookahead_barriers=[4, 11])
    trie.insert("range::min", 9, lookahead_barriers=[5])
    trie.insert("min", 9)
    trie.insert("math::range::max", 10)
    trie.insert("range::max", 10)
    trie.insert("max", 10)

    serialized = trie.serialize()

    # NOTE(review): the expected dump is restored to one entry per line with
    # its tree-guide indentation -- confirm the exact spacing against the
    # pretty-printer output.
    self.compare(serialized, """
math [0]
||| :$
|||  :vector [1]
|||   |     :$
|||   |      :min [7]
|||   |        | max [6]
|||   |        ax [8]
|||   range [2]
|||   |    :$
|||   |     :min [9]
|||   |       ax [10]
|||   min [3]
|||   || max [5]
|||   |ax [4]
||x [4, 8, 10]
|in [3, 7, 9]
|| max [5, 6]
vector [1]
|     :$
|      :min [7]
|        | max [6]
|        ax [8]
range [2]
|    :$
|     :min [9]
|       ax [10]
""")
    self.assertEqual(len(serialized), 340)
def test_empty(self):
    # Nothing is inserted: the serialized blob is compared against an empty
    # expected string and has to be exactly 6 bytes long
    empty_trie = Trie()
    blob = empty_trie.serialize()

    self.compare(blob, "")
    self.assertEqual(len(blob), 6)
def test_multiple(self):
    # Inserts a set of nested "math::..." names together with their shorter
    # suffixes, some with lookahead barriers, then serializes the trie with
    # every configuration in trie_type_sizes and checks both the dump and
    # the serialized size.
    trie = Trie()

    # Top-level name and its direct members
    trie.insert("math", 0)
    trie.insert("math::vector", 1, lookahead_barriers=[4])
    trie.insert("vector", 1)
    trie.insert("math::range", 2)
    trie.insert("range", 2)

    trie.insert("math::min", 3)
    trie.insert("min", 3)
    trie.insert("math::max", 4)
    trie.insert("max", 4)
    trie.insert("math::minmax", 5)
    trie.insert("minmax", 5)

    # Members nested under math::vector
    trie.insert("math::vector::minmax", 6, lookahead_barriers=[4, 12])
    trie.insert("vector::minmax", 6, lookahead_barriers=[6])
    trie.insert("minmax", 6)
    trie.insert("math::vector::min", 7)
    trie.insert("vector::min", 7)
    trie.insert("min", 7)
    trie.insert("math::vector::max", 8)
    trie.insert("vector::max", 8)
    trie.insert("max", 8)

    # Members nested under math::range
    trie.insert("math::range::min", 9, lookahead_barriers=[4, 11])
    trie.insert("range::min", 9, lookahead_barriers=[5])
    trie.insert("min", 9)
    trie.insert("math::range::max", 10)
    trie.insert("range::max", 10)
    trie.insert("max", 10)

    # NOTE(review): the expected dump is restored to one entry per line with
    # its tree-guide indentation -- confirm the exact spacing against the
    # pretty-printer output.
    expected = """
math [0]
||| :$
|||  :vector [1]
|||   |     :$
|||   |      :min [7]
|||   |        | max [6]
|||   |        ax [8]
|||   range [2]
|||   |    :$
|||   |     :min [9]
|||   |       ax [10]
|||   min [3]
|||   || max [5]
|||   |ax [4]
||x [4, 8, 10]
|in [3, 7, 9]
|| max [5, 6]
vector [1]
|     :$
|      :min [7]
|        | max [6]
|        ax [8]
range [2]
|    :$
|     :min [9]
|       ax [10]
"""

    for sizes in trie_type_sizes:
        with self.subTest(**sizes):
            serialized = trie.serialize(Serializer(**sizes))
            self.compare(Deserializer(**sizes), serialized, expected)

            # Verify just the smallest and largest size, everything else
            # should fit in between
            if sizes['file_offset_bytes'] == 3 and sizes['result_id_bytes'] == 2:
                self.assertEqual(len(serialized), 340)
            elif sizes['file_offset_bytes'] == 4 and sizes['result_id_bytes'] == 4:
                self.assertEqual(len(serialized), 428)
            else:
                self.assertGreater(len(serialized), 340)
                self.assertLess(len(serialized), 428)