Exemple #1
0
 def test_invalid_arg(self):
     """An unrecognised byte-order name makes int_from_bytes raise ValueError."""
     failure_msg = 'Expected raise ValueError on invalid endianness.'
     payload = b64decode('abIAAAAAAAAAAA=='.encode('ascii'))
     with self.assertRaises(ValueError, msg=failure_msg):
         int_from_bytes(payload, 'lobster')
Exemple #2
0
    def test_little_endian(self):
        """Check int_from_bytes on known little-endian inputs.

        Each case pairs the expected integer with the base64 text of its
        little-endian byte representation.
        """
        mismatch = "Int from bytes doesn't match expected value."
        cases = (
            (34, 'Ig=='),
            (45673, 'abIAAAAAAAAAAA=='),
            (56789876545678987654678987654567898765456789765456787654,
             'xvgpRbqRBXKjithyITo2aTz9FFn66VAC'),
        )
        for expected, encoded in cases:
            raw = b64decode(encoded.encode('ascii'))
            actual = int_from_bytes(raw, 'little')
            self.assertEqual(expected, actual, msg=mismatch)
Exemple #3
0
    def test_big_endian(self):
        """Check int_from_bytes on known big-endian inputs.

        Each case pairs the expected integer with the base64 text of its
        big-endian byte representation.
        """
        mismatch = "Int from bytes doesn't match expected value."
        cases = (
            (34, 'Ig=='),
            (45673, 'AAAAAAAAAACyaQ=='),
            (56789876545678987654678987654567898765456789765456787654,
             'AlDp+lkU/TxpNjohctiKo3IFkbpFKfjG'),
        )
        for expected, encoded in cases:
            raw = b64decode(encoded.encode('ascii'))
            actual = int_from_bytes(raw, 'big')
            self.assertEqual(expected, actual, msg=mismatch)
Exemple #4
0
def double_hash_encode_ngrams_non_singular(
        ngrams,  # type: Iterable[str]
        keys,  # type: Sequence[bytes]
        ks,  # type: Sequence[int]
        l,  # type: int
        encoding  # type: str
):
    # type: (...) -> bitarray.bitarray
    """ computes the double hash encoding of the n-grams with the given keys.

        The original construction of [Schnell2011]_ displays an abnormality for
        certain inputs:

            An n-gram can be encoded into just one bit irrespective of the
            value of k.

        Their construction goes as follows: the :math:`k` different indices
        :math:`g_i` of the Bloom filter for an n-gram
        :math:`x` are defined as:

        .. math:: g_{i}(x) = (h_1(x) + i h_2(x)) \\mod l

        with :math:`0 \\leq i < k` and :math:`l` is the length of the Bloom
        filter. If the value of the hash of :math:`x` of
        the second hash function is a multiple of :math:`l`, then

        .. math:: h_2(x) = 0 \\mod l

        and thus

        .. math:: g_i(x) = h_1(x) \\mod l,

        irrespective of the value :math:`i`. A discussion of this potential flaw
        can be found
        `here <https://github.com/n1analytics/clkhash/issues/33>`_.

        This function works around the flaw: whenever :math:`h_2(x)` is
        :math:`0 \\mod l`, the n-gram is re-hashed with the md5 key after
        appending the encoded character ``chr(0)``, ``chr(1)``, ... to its
        bytes, until a non-zero value is found.

        For :math:`l = 1` no non-zero value exists (everything is
        :math:`0 \\mod 1`), so the re-hashing is skipped and the single
        available bit is used.

        :param ngrams: list of n-grams to be encoded
        :param keys: tuple with (key_sha1, key_md5).
               That is, (hmac secret keys for sha1 as bytes, hmac secret keys for
               md5 as bytes)
        :param ks: ks[i] is k value to use for ngram[i]; n-grams and k values
               are paired with ``zip``, so surplus entries of the longer
               sequence are ignored
        :param l: length of the output bitarray (must be non-zero if there is
               at least one n-gram to encode, because hashes are reduced
               modulo l)
        :param encoding: the encoding to use when turning the ngrams to bytes

        :return: bitarray of length l with the bits set which correspond to the
                 encoding of the ngrams
    """
    key_sha1, key_md5 = keys
    bf = bitarray(l)
    bf.setall(False)

    # A non-zero residue modulo l only exists when l > 1. Without this guard
    # the retry loop below could never succeed for l == 1 and would keep
    # hashing until chr(...).encode() fails.
    non_zero_step_exists = l > 1

    for m, k in zip(ngrams, ks):
        m_bytes = m.encode(encoding=encoding)

        sha1hm_bytes = hmac.new(key_sha1, m_bytes, sha1).digest()
        md5hm_bytes = hmac.new(key_md5, m_bytes, md5).digest()

        sha1hm = int_from_bytes(sha1hm_bytes, 'big') % l
        md5hm = int_from_bytes(md5hm_bytes, 'big') % l

        # Re-hash with an appended counter character until the step size is
        # non-zero. The appended bytes must stay exactly chr(n).encode() so
        # that existing encodings remain reproducible.
        retry = 0
        while non_zero_step_exists and md5hm == 0:
            md5hm_bytes = hmac.new(key_md5, m_bytes + chr(retry).encode(),
                                   md5).digest()
            md5hm = int_from_bytes(md5hm_bytes, 'big') % l
            retry += 1

        for i in range(k):
            gi = (sha1hm + i * md5hm) % l
            bf[gi] = True
    return bf