Example #1
0
    def test_encode_and_decode_python_small_bytes_per_chunk_as_expected(
            self, string_max_length, dtype):
        """Round-trips strings via TF encode / Python decode with a chunk cap."""
        max_chunk_value = 2**31 - 1
        input_strings = [
            '', 'some', 'unicodes', 'अ', 'क', 'æ', '☺️', '☺️', '☺️', '😇',
            ' has space ', 'has, comma'
        ]

        chunker = chunkers.UTF8Chunker(string_max_length,
                                       max_chunk_value=max_chunk_value,
                                       dtype=dtype)
        encoded_chunks, trimmed_input_strings = chunker.encode_tensorflow(
            tf.constant(input_strings))
        # Every emitted chunk must stay strictly below the configured cap.
        self.assertAllLess(encoded_chunks, max_chunk_value)

        decoded_strings = chunker.decode_python(encoded_chunks.numpy())

        # Use 'ignore' in `.decode()` because encoding trims the strings, and
        # the trim may fall in the middle of a multi-byte utf-8 character,
        # which would otherwise raise a decoding error.
        decoded_strings_list = [
            decoded_strings[idx].decode('utf-8', 'ignore')
            for idx in range(decoded_strings.shape[0])
        ]

        trimmed_input_strings = trimmed_input_strings.numpy()

        # Order-insensitive comparison against both the raw inputs and the
        # trimmed byte strings the encoder reported.
        self.assertCountEqual(input_strings, decoded_strings_list)
        self.assertCountEqual(trimmed_input_strings, decoded_strings)
Example #2
0
    def __init__(self,
                 capacity,
                 string_max_length,
                 *,
                 drop_strings_above_max_length=False,
                 seed=0,
                 repetitions=DEFAULT_REPETITIONS,
                 hash_family=None,
                 hash_family_params=None,
                 field_size=DEFAULT_FIELD_SIZE):
        """Initializes internal IBLT parameters.

    Args:
      capacity: Number of distinct strings that we expect to be inserted.
      string_max_length: Maximum length of a string that can be inserted.
      drop_strings_above_max_length: If True, strings above string_max_length
        will be dropped when constructing the IBLT. Defaults to False.
      seed: Integer seed for hash functions. Defaults to 0.
      repetitions: Number of repetitions in IBLT data structure (must be >= 3).
        Defaults to 3.
      hash_family: String specifying the hash family to use to construct IBLT.
        (options include coupled or random, default is chosen based on capacity)
      hash_family_params: A dict of parameters that the hash family hasher
        expects. (defaults are chosen based on capacity.)
      field_size: The field size for all values in IBLT. Defaults to 2**31 - 1.
    """
        # Resolve the table size plus any defaulted hash-family configuration.
        self.table_size, hash_family, hash_family_params = _internal_parameters(
            capacity, repetitions, hash_family, hash_family_params)
        self.string_max_length = string_max_length
        self.repetitions = repetitions
        self.seed = seed
        self.field_size = field_size
        self.drop_strings_above_max_length = drop_strings_above_max_length
        self._dtype = tf.int64
        # Chunk values are kept within the field so IBLT arithmetic stays valid.
        self.chunker = chunkers.UTF8Chunker(string_max_length,
                                            max_chunk_value=self.field_size,
                                            dtype=self._dtype)
        self.num_chunks = self.chunker.get_num_chunks()
        # Each bucket stores num_chunks value slots plus count and check slots.
        self.iblt_shape = (self.repetitions, self.table_size,
                           self.num_chunks + 2)
        if hash_family == _HASH_FAMILY_RANDOM:
            hasher_cls = hyperedge_hashers.RandomHyperEdgeHasher
        elif hash_family == _HASH_FAMILY_COUPLED:
            hasher_cls = hyperedge_hashers.CoupledHyperEdgeHasher
        else:
            raise NotImplementedError(
                "Hash family {} not supported in IBLTs.".format(hash_family))
        self.hyperedge_hasher = hasher_cls(seed, self.table_size, repetitions,
                                           **hash_family_params)
Example #3
0
    def test_encode_and_decode_tensorflow_trim_strings_as_expected(
            self, string_max_length, input_strings, expected_decoded_strings):
        """TF-side decode of trimmed encodings matches the expected strings."""
        dtype = tf.int64

        chunker = chunkers.UTF8Chunker(string_max_length, dtype=dtype)
        encoded_chunks, trimmed_input_strings = chunker.encode_tensorflow(
            tf.constant(input_strings))
        decoded_strings = chunker.decode_tensorflow(encoded_chunks).numpy()
        # 'ignore' drops partial utf-8 sequences left behind when trimming
        # splits a multi-byte character.
        decoded_strings_list = [
            raw.decode('utf-8', 'ignore') for raw in decoded_strings
        ]

        trimmed_input_strings = trimmed_input_strings.numpy()

        # Compare as multisets: ordering of decoded entries is not asserted.
        self.assertCountEqual(expected_decoded_strings, decoded_strings_list)
        self.assertCountEqual(trimmed_input_strings, decoded_strings)
Example #4
0
    def test_encode_and_decode_python_trim_strings_as_expected(
            self, string_max_length, input_strings, expected_decoded_strings):
        """Python-side decode of trimmed encodings matches the expected strings."""
        dtype = tf.int64

        chunker = chunkers.UTF8Chunker(string_max_length, dtype=dtype)
        encoded_chunks, trimmed_input_strings = chunker.encode_tensorflow(
            tf.constant(input_strings))
        decoded_strings = chunker.decode_python(encoded_chunks.numpy())

        # Use 'ignore' in `.decode()` because encoding trims the strings, and
        # the trim may fall in the middle of a multi-byte utf-8 character,
        # which would otherwise raise a decoding error.
        decoded_strings_list = [
            raw.decode('utf-8', 'ignore') for raw in decoded_strings
        ]

        trimmed_input_strings = trimmed_input_strings.numpy()

        # Compare as multisets: ordering of decoded entries is not asserted.
        self.assertCountEqual(expected_decoded_strings, decoded_strings_list)
        self.assertCountEqual(trimmed_input_strings, decoded_strings)
Example #5
0
 def test_arguments_dtype_value_error(self, dtype):
     """An unsupported dtype is rejected at chunker construction time."""
     with self.assertRaises(ValueError):
         chunkers.UTF8Chunker(string_max_length=10, dtype=dtype)
Example #6
0
 def test_max_chunk_value_too_large_error(self):
     """A max_chunk_value that cannot fit in the dtype raises ValueError."""
     with self.assertRaises(ValueError):
         # 2**33 does not fit in a signed 32-bit integer.
         chunkers.UTF8Chunker(
             string_max_length=10, max_chunk_value=2**33, dtype=tf.int32)
Example #7
0
 def test_arguments_max_chunk_neg_value_error(self):
     """A negative max_chunk_value raises ValueError."""
     with self.assertRaises(ValueError):
         chunkers.UTF8Chunker(
             string_max_length=10, max_chunk_value=-1, dtype=tf.int64)
Example #8
0
    def __init__(
        self,
        iblt: tf.Tensor,
        capacity: int,
        string_max_length: int,
        *,
        seed: int = 0,
        repetitions: int = DEFAULT_REPETITIONS,
        hash_family: Optional[str] = None,
        hash_family_params: Optional[Dict[str, Union[int, float]]] = None,
        field_size: int = DEFAULT_FIELD_SIZE,
    ):
        """Initializes the IBLT Decoder.

    The IBLT is a tensor of shape [repetitions, table_size, num_chunks + 2].
    Its value at index `(r, h, c)` corresponds to:

      - sum of chunk `c` of keys hashing to `h` in repetition `r` if
        `c < num_chunks`
      - sum of counts of keys hashing to `h` in repetition `r` if
        `c = num_chunks`
      - sum of checks of keys hashing to `h` in repetition `r` if
        `c = num_chunks + 1`.

    NOTE(review): decoding is destructive, so a copy of `iblt` should be made
    before it is consumed; no copy is visible in this constructor — confirm it
    happens in the decode path.

    Args:
      iblt: Tensor representing the IBLT computed by the IbltEncoder.
      capacity: Number of distinct strings that we expect to be inserted.
      string_max_length: Maximum length of a string that can be inserted.
      seed: Integer seed for hash functions. Defaults to 0.
      repetitions: Number of repetitions in IBLT data structure (must be >= 3).
        Defaults to 3.
      hash_family: A `str` specifying the hash family to use to construct IBLT.
        Options include coupled or random, default is chosen based on capacity.
      hash_family_params: An optional `dict` of parameters that the hash family
        hasher expects. Defaults are chosen based on capacity.
      field_size: The field size for all values in IBLT. Defaults to 2**31 - 1.
    """
        self._dtype = tf.int64
        self.iblt = iblt
        # Resolve the table size plus any defaulted hash-family configuration.
        (self.table_size, self.hash_family,
         self.hash_family_params) = _internal_parameters(capacity, repetitions,
                                                         hash_family,
                                                         hash_family_params)
        self.field_size = field_size
        self.repetitions = repetitions
        self.seed = seed
        self.chunker = chunkers.UTF8Chunker(string_max_length,
                                            max_chunk_value=self.field_size,
                                            dtype=self._dtype)
        self.num_chunks = self.chunker.get_num_chunks()
        # Column indices of the count and check slots within each bucket.
        self.count = self.num_chunks
        self.check = self.num_chunks + 1
        self.iblt_shape = (self.repetitions, self.table_size,
                           self.num_chunks + 2)
        # Queue sized to hold one entry per (repetition, bucket) pair.
        self.q = tf.queue.RandomShuffleQueue(
            capacity=self.table_size * self.repetitions,
            min_after_dequeue=0,
            dtypes=(self._dtype, self._dtype))
        if self.hash_family == _HASH_FAMILY_RANDOM:
            hasher_cls = hyperedge_hashers.RandomHyperEdgeHasher
        elif self.hash_family == _HASH_FAMILY_COUPLED:
            hasher_cls = hyperedge_hashers.CoupledHyperEdgeHasher
        else:
            # Note: reports the caller-supplied `hash_family` argument (which
            # may be None), matching the original behavior.
            raise NotImplementedError(
                f"Hash family {hash_family} not supported in IBLTs.")
        self.hyperedge_hasher = hasher_cls(seed, self.table_size, repetitions,
                                           **self.hash_family_params)