def test_invalid_arg(self):
    """An unrecognised byte-order name must raise ``ValueError``."""
    payload = b64decode('abIAAAAAAAAAAA=='.encode('ascii'))
    expectation = self.assertRaises(
        ValueError,
        msg='Expected raise ValueError on invalid endianness.')
    with expectation:
        # 'lobster' is neither 'little' nor 'big'.
        int_from_bytes(payload, 'lobster')
def test_little_endian(self):
    """Decode base64 fixtures as little-endian and compare to known ints."""
    mismatch = "Int from bytes doesn't match expected value."
    # (expected integer, base64 text of the little-endian byte string)
    fixtures = (
        (34, 'Ig=='),
        (45673, 'abIAAAAAAAAAAA=='),
        (56789876545678987654678987654567898765456789765456787654,
         'xvgpRbqRBXKjithyITo2aTz9FFn66VAC'),
    )
    for expected, b64_text in fixtures:
        raw = b64decode(b64_text.encode('ascii'))
        self.assertEqual(
            expected, int_from_bytes(raw, 'little'), msg=mismatch)
def test_big_endian(self):
    """Decode base64 fixtures as big-endian and compare to known ints."""
    mismatch = "Int from bytes doesn't match expected value."
    # (expected integer, base64 text of the big-endian byte string)
    fixtures = (
        (34, 'Ig=='),
        (45673, 'AAAAAAAAAACyaQ=='),
        (56789876545678987654678987654567898765456789765456787654,
         'AlDp+lkU/TxpNjohctiKo3IFkbpFKfjG'),
    )
    for expected, b64_text in fixtures:
        raw = b64decode(b64_text.encode('ascii'))
        self.assertEqual(
            expected, int_from_bytes(raw, 'big'), msg=mismatch)
def double_hash_encode_ngrams_non_singular(ngrams,  # type: Iterable[str]
                                           keys,  # type: Sequence[bytes]
                                           ks,  # type: Sequence[int]
                                           l,  # type: int
                                           encoding  # type: str
                                           ):
    # type: (...) -> bitarray.bitarray
    """
    Compute the double hash encoding of the n-grams with the given keys,
    avoiding the case where the second hash is zero modulo ``l``.

    The original construction of [Schnell2011]_ displays an abnormality for
    certain inputs:

      An n-gram can be encoded into just one bit irrespective of the number
      of k.

    Their construction goes as follows: the :math:`k` different indices
    :math:`g_i` of the Bloom filter for an n-gram :math:`x` are defined as:

    .. math::
      g_{i}(x) = (h_1(x) + i h_2(x)) \\mod l

    with :math:`0 \\leq i < k` and :math:`l` is the length of the Bloom
    filter. If the value of the hash of :math:`x` of the second hash
    function is a multiple of :math:`l`, then

    .. math::
      h_2(x) = 0 \\mod l

    and thus

    .. math::
      g_i(x) = h_1(x) \\mod l,

    irrespective of the value :math:`i`. A discussion of this potential flaw
    can be found
    `here <https://github.com/n1analytics/clkhash/issues/33>`_.

    This function works around the flaw: whenever :math:`h_2(x) = 0 \\mod l`
    it recomputes :math:`h_2` as the md5 HMAC of the n-gram bytes followed by
    an increasing counter character (``chr(0)``, ``chr(1)``, ...) until the
    result is non-zero modulo :math:`l`.

    :param ngrams: list of n-grams to be encoded
    :param keys: tuple with (key_sha1, key_md5).
                 That is, (hmac secret keys for sha1 as bytes, hmac secret
                 keys for md5 as bytes)
    :param ks: ks[i] is k value to use for ngram[i]. ``ngrams`` and ``ks``
               are consumed in lockstep with ``zip``, so surplus elements of
               the longer one are ignored.
    :param l: length of the output bitarray
    :param encoding: the encoding to use when turning the ngrams to bytes

    :return: bitarray of length l with the bits set which correspond to the
             encoding of the ngrams
    :raises ValueError: if an n-gram has to be encoded while ``l == 1``;
                        every value is 0 modulo 1, so a non-zero second
                        hash cannot exist.
    """
    key_sha1, key_md5 = keys
    bf = bitarray(l)
    bf.setall(False)

    for m, k in zip(ngrams, ks):
        if l == 1:
            # Without this guard the retry loop below can never terminate
            # normally: it would spin until chr() / encode() fails on an
            # unencodable counter value and surface an unrelated error.
            raise ValueError(
                'Cannot compute a non-singular double hash encoding for a '
                'Bloom filter of length 1.')

        m_bytes = m.encode(encoding=encoding)

        sha1hm_bytes = hmac.new(key_sha1, m_bytes, sha1).digest()
        md5hm_bytes = hmac.new(key_md5, m_bytes, md5).digest()

        sha1hm = int_from_bytes(sha1hm_bytes, 'big') % l
        md5hm = int_from_bytes(md5hm_bytes, 'big') % l

        # Re-hash with a counter suffix until the step size is non-zero
        # modulo l, so the k indices do not all collapse onto sha1hm.
        # NOTE: the counter is encoded with the default codec rather than
        # `encoding`; this is kept as-is because it is part of the HMAC
        # input and changing it could change the produced bit positions.
        salt = 0
        while md5hm == 0:
            md5hm_bytes = hmac.new(
                key_md5, m_bytes + chr(salt).encode(), md5).digest()
            md5hm = int_from_bytes(md5hm_bytes, 'big') % l
            salt += 1

        for i in range(k):
            gi = (sha1hm + i * md5hm) % l
            bf[gi] = True

    return bf