Ejemplo n.º 1
0
    def test_iblt_trim_strings_above_max_length(self):
        """Checks that over-long strings come back trimmed, not dropped.

        The encoder is built with `drop_strings_above_max_length=False`, and
        the decoded keys are compared against truncated forms of the inputs.
        """
        sketch_params = dict(capacity=10, string_max_length=4, seed=0)
        raw_strings = [
            '2019', 'seattle', 'heavy', 'hitters', 'क', '☺️', 'has space',
            'has, comma', '新年快乐', '😇☺️'
        ]

        encoder = iblt_lib.IbltEncoder(
            sketch_params['capacity'],
            sketch_params['string_max_length'],
            seed=sketch_params['seed'],
            drop_strings_above_max_length=False)
        table = encoder.compute_iblt(
            tf.constant(raw_strings, dtype=tf.string))
        decoded = self._get_decoded_results(
            iblt_table=table, repetitions=3, **sketch_params)

        # The IBLT automatically chooses a larger string_max_length if some
        # space is being wasted in the field encoding. For example, if the
        # field is 2**31 - 1 (so 3 bytes fit in one int) and
        # string_max_length = 4 is requested, it is bumped to 2 * 3 = 6.
        # The expectations below are therefore 6-byte prefixes; the last two
        # entries stop at a character boundary within that limit.
        trimmed_strings = [
            '2019', 'seattl', 'heavy', 'hitter', 'क', '☺️', 'has sp', 'has, c',
            '新年', '😇'
        ]
        self.assertCountEqual(trimmed_strings, decoded.keys())
Ejemplo n.º 2
0
    def test_iblt_tensorflow(self,
                             capacity=10,
                             field_size=iblt_lib.DEFAULT_FIELD_SIZE):
        """Encodes a mixed ASCII/Unicode string set and decodes it back.

        Args:
            capacity: Capacity passed to both the encoder and the decoder
                helper.
            field_size: Field size passed to both the encoder and the decoder
                helper.
        """
        shared_kwargs = dict(seed=0, field_size=field_size)
        max_len = 12
        words = [
            '2019', 'seattle', 'heavy', 'hitters', 'क', '☺️', 'has space',
            'has, comma', '新年快乐', '☺️😇'
        ]

        encoder = iblt_lib.IbltEncoder(capacity, max_len, **shared_kwargs)
        table = encoder.compute_iblt(tf.constant(words, dtype=tf.string))
        decoded = self._get_decoded_results(
            iblt_table=table,
            capacity=capacity,
            string_max_length=max_len,
            repetitions=3,
            **shared_kwargs)

        # Every input must be recovered exactly once as a key.
        self.assertCountEqual(words, decoded.keys())
Ejemplo n.º 3
0
    def test_iblt_with_coupled_hash_edges(self):
        """Round-trips strings using the 'coupled' hash family.

        The same hash family name and parameters are handed to the encoder
        and to the decoding helper.
        """
        hashing_kwargs = dict(
            hash_family='coupled',
            hash_family_params={'rescale_factor': 4},
            seed=0)
        capacity, max_len = 10, 12
        words = [
            '2019', 'seattle', 'heavy', 'hitters', 'क', '☺️', 'has space',
            'has, comma', '新年快乐', '☺️😇'
        ]

        encoder = iblt_lib.IbltEncoder(capacity, max_len, **hashing_kwargs)
        table = encoder.compute_iblt(tf.constant(words, dtype=tf.string))
        decoded = self._get_decoded_results(
            iblt_table=table,
            capacity=capacity,
            string_max_length=max_len,
            repetitions=3,
            **hashing_kwargs)

        # Every input must be recovered exactly once as a key.
        self.assertCountEqual(words, decoded.keys())
Ejemplo n.º 4
0
 def test_decode_string_from_chunks(self):
     """Checks that decoding encoder chunks reproduces the input bytes.

     Each input string is turned into chunks by the encoder, and the
     decoder's `decode_string_from_chunks` is applied to each chunk row.
     The decoded bytes are compared with the UTF-8 bytes of the input over
     their common prefix.
     """
     capacity = 10
     string_max_length = 7
     repetitions = 3
     seed = 0
     iblt_encoder = iblt_lib.IbltEncoder(capacity,
                                         string_max_length,
                                         seed=seed)
     input_strings_list = [
         '2019', 'seattle', 'mtv', 'heavy', 'hitters', '新年快乐', '☺️😇'
     ]
     input_strings = tf.constant(input_strings_list, dtype=tf.string)
     chunks, _ = iblt_encoder.compute_chunks(input_strings)
     # An all-zero table shaped to match the encoder is enough to build a
     # decoder here: this test only calls `decode_string_from_chunks`, which
     # is handed the chunks directly.
     iblt_table = np.zeros([
         repetitions, iblt_encoder.table_size, iblt_encoder.num_chunks + 2
     ])
     iblt_decoder = iblt_lib.IbltDecoder(
         iblt=iblt_table,
         capacity=capacity,
         string_max_length=string_max_length,
         repetitions=repetitions,
         seed=seed)
     for i in range(chunks.shape[0]):
         expected_bytes = bytes(input_strings_list[i], 'utf-8')
         # Decode once per string; the result does not depend on the byte
         # index, so it must not be recomputed (or re-evaluated in graph
         # mode) for every byte.
         decoded_tensor = iblt_decoder.decode_string_from_chunks(chunks[i])
         if not tf.executing_eagerly():
             decoded_string = self.evaluate(decoded_tensor)
         else:
             decoded_string = decoded_tensor.numpy()
         # Only the common prefix is compared: the decoded value may be
         # shorter than the input (presumably because inputs longer than
         # the maximum length are cut -- TODO confirm).
         common_prefix = expected_bytes[:len(decoded_string)]
         for j, expected_byte in enumerate(common_prefix):
             self.assertEqual(expected_byte, decoded_string[j])
Ejemplo n.º 5
0
 def test_iblt_input_checks(self,
                            input_strings_list,
                            input_values_list,
                            exception_raised,
                            input_values_dtype=tf.int64):
     """Checks that `compute_iblt` rejects the given inputs.

     Args:
         input_strings_list: Strings converted to a `tf.string` constant.
         input_values_list: Values converted to a constant of
             `input_values_dtype`, or `None` to pass `None` through.
         exception_raised: Exception type `compute_iblt` must raise.
         input_values_dtype: dtype used for the values constant.
     """
     encoder = iblt_lib.IbltEncoder(
         10,  # capacity
         12,  # string_max_length
         seed=0,
         drop_strings_above_max_length=False)
     strings_tensor = tf.constant(input_strings_list, dtype=tf.string)

     # `None` is forwarded unchanged; anything else becomes a tensor.
     values_tensor = None
     if input_values_list is not None:
         values_tensor = tf.constant(input_values_list,
                                     dtype=input_values_dtype)

     with self.assertRaises(exception_raised):
         encoder.compute_iblt(strings_tensor, values_tensor)
Ejemplo n.º 6
0
 def test_iblt_with_counts(self):
     """Checks decoded counts when explicit per-string counts are given.

     The expected result is each input count reduced modulo
     `iblt_lib.DEFAULT_FIELD_SIZE`, with strings whose reduced count is zero
     left out.
     """
     sketch_params = dict(capacity=10, string_max_length=12, seed=0)
     counts_by_string = {
         '201': 10,
         'seattle': -1,
         'heavy': iblt_lib.DEFAULT_FIELD_SIZE,
         'hitters': 2**16,
         'क': iblt_lib.DEFAULT_FIELD_SIZE - 1,
         '☺️': 0,
         '新年快乐': 100
     }

     encoder = iblt_lib.IbltEncoder(
         sketch_params['capacity'],
         sketch_params['string_max_length'],
         seed=sketch_params['seed'],
         drop_strings_above_max_length=False)
     strings_tensor = tf.constant(tuple(counts_by_string), dtype=tf.string)
     counts_tensor = tf.constant(
         tuple(counts_by_string.values()), dtype=tf.int64)
     table = encoder.compute_iblt(strings_tensor,
                                  input_counts=counts_tensor)
     decoded = self._get_decoded_results(
         iblt_table=table, repetitions=3, **sketch_params)

     # Reduce each count into the field and keep only non-zero residues.
     expected = {}
     for string, count in counts_by_string.items():
         residue = count % iblt_lib.DEFAULT_FIELD_SIZE
         if residue != 0:
             expected[string] = residue

     self.assertCountEqual(expected.items(), decoded.items())
Ejemplo n.º 7
0
 def test_iblt_drop_strings_above_max_length(self):
     """Checks that over-long strings are dropped when dropping is enabled.

     The encoder is built with `drop_strings_above_max_length=True` and
     `string_max_length=3`; only '201' and 'क' are expected back.
     """
     sketch_params = dict(capacity=10, string_max_length=3, seed=0)
     raw_strings = [
         '201', 'seattle', 'heavy', 'hitters', 'क', '☺️', 'has space',
         'has, comma', '新年快乐', '☺️😇'
     ]

     encoder = iblt_lib.IbltEncoder(
         sketch_params['capacity'],
         sketch_params['string_max_length'],
         seed=sketch_params['seed'],
         drop_strings_above_max_length=True)
     table = encoder.compute_iblt(
         tf.constant(raw_strings, dtype=tf.string))
     decoded = self._get_decoded_results(
         iblt_table=table, repetitions=3, **sketch_params)

     # ☺️, '新年快乐', '☺️😇' are filtered out as they take more than 3 bytes
     # to encode; the longer ASCII inputs are likewise absent from the
     # expected keys.
     self.assertCountEqual(['201', 'क'], decoded.keys())