def _get_bucket(self, key):
     """
     Selects the bucket that a key falls into.
     The key is hashed with shash and the hash is reduced modulo self.bucket_count to obtain an index into
     self.buckets.
     :param key: a key string
     :return: the bucket (list) stored at the computed index
     """
     index = shash(key) % self.bucket_count
     return self.buckets[index]
 def test_shash(self):
     """
     Checks how evenly shash distributes the lines of a word list over a fixed number of buckets.
     Every line of '../words.txt' is hashed and counted in the bucket given by its hash modulo the bucket count.
     The test fails if the population standard deviation of the bucket counts is bigger than the product of the
     tolerance and the mean of the bucket counts.
     """
     # How many buckets we will use.
     bucket_count = 100
     # Maximum allowed ratio between the population standard deviation and the mean of the bucket counts.
     tolerance = 0.05
     buckets = [0] * bucket_count
     # Load a lot of words.
     # NOTE(review): the path is relative to the current working directory, so the test presumably has to be run
     # from a directory whose parent contains words.txt - confirm against the project layout.
     with open('../words.txt', 'r') as words:
         # Iterate the file directly instead of building the full list of lines with readlines() first.
         # Each line is hashed exactly as read, i.e. including any trailing newline.
         for line in words:
             buckets[shash(line) % bucket_count] += 1
     # assertLessEqual has the same pass/fail condition as the former failIf(a > b), but failIf is a deprecated
     # alias that was removed in Python 3.12, and this assertion also reports both values when it fails.
     self.assertLessEqual(pstdev(buckets), tolerance * mean(buckets))
def find(text, substr):
    """
    Searches a string of text for the first occurrence of a substring by comparing hashes.
    The hash of the substring is compared with the hash of each window of the same length in the text. The first
    window is hashed with shash; each following window's hash is obtained from shash_roll, which is given the
    previous hash, the window length, the character leaving the window and the character entering it. A hash match
    is always confirmed by comparing the actual characters before an index is returned.
    :param text: the string of text
    :param substr: the substring
    :return: the index of the first character of the first appearance of the substring in the string or -1
    """
    window = len(substr)
    final_start = len(text) - window
    # The substring is longer than the text, so it cannot occur in it.
    if final_start < 0:
        return -1
    window_hash = shash(text[:window])
    target_hash = shash(substr)
    start = 0
    while True:
        # Only compare characters when the hashes agree.
        if target_hash == window_hash and text[start:start + window] == substr:
            return start
        # The window already sits at the end of the text: nothing left to examine.
        if start == final_start:
            return -1
        # Slide the window one position to the right: text[start] leaves, text[start + window] enters.
        window_hash = shash_roll(window_hash, window, text[start], text[start + window])
        start += 1