class BloomFilter:
    """
    Probabilistic membership set stored in a BitSet.

    Every hash function maps a value to one bit index (hash modulo the
    number of available bits). Adding a value sets those bits; a
    membership test succeeds only when every one of them is set, so
    unrelated values may collide and be reported as present.
    """

    def __init__(self, *hash_functions, **kwds):
        """
        @param hash_functions: Callables taking a value and returning an
            integer. When none are given, object_hash, object_repr_hash,
            object_str_hash and object_id are used.
        @param max_size: In bytes (keyword only, default 1024). Rounded up
            to a whole multiple of the bit set's itemsize.
        """
        self.__bitset = BitSet()
        if not hash_functions:
            hash_functions = (object_hash, object_repr_hash,
                              object_str_hash, object_id)
        self.__hash_functions = hash_functions
        max_size = kwds.get("max_size", 1024)
        itemsize = self.__bitset.itemsize
        # Round the byte budget up to a multiple of itemsize, then convert
        # bytes to bits (<< 3 multiplies by eight).
        self.__max_bits = int(itemsize * ceil(float(max_size) / itemsize)) << 3

    def extend(self, values):
        """
        Add every element of the iterable C{values} to the filter.
        """
        # An explicit loop is required: map() is lazy on Python 3, so
        # calling it only for its side effect would add nothing.
        for value in values:
            self.add(value)

    def __get_bit_indexes_of(self, value):
        """
        Return a generator with one bit index per hash function for
        C{value}.
        """
        return (hf(value) % self.__max_bits for hf in self.__hash_functions)

    def add(self, value):
        """
        Set the bits that the hash functions select for C{value}.
        """
        indexes_to_set = self.__get_bit_indexes_of(value)
        self.__bitset.set_indexes(indexes_to_set)

    def __contains__(self, value):
        """
        Return True when every bit selected for C{value} is present in the
        bit set, False as soon as one of them is missing.
        """
        indexes_to_get = self.__get_bit_indexes_of(value)
        return all(index in self.__bitset for index in indexes_to_get)

    def __len__(self):
        """
        Return len() of the underlying bit set divided by eight.
        """
        # NOTE(review): presumably len(BitSet) counts bits, making this a
        # size in bytes (__repr__ labels it that way) -- confirm against
        # BitSet.
        return len(self.__bitset) >> 3

    def __repr__(self):
        return "BloomFilter (%s bytes): %s" % (len(self), self.__bitset)
def find_matches(self, inverted_index):
    """
    Build a BitSet holding the document ids that C{inverted_index}
    returns for this object's field id and value.

    @param inverted_index: Object providing
        C{get_doc_ids_with(field_id, value)}.
    @return: A new BitSet with the returned ids set as indexes.
    """
    matches = BitSet()
    doc_ids = inverted_index.get_doc_ids_with(self._field_id, self._value)
    matches.set_indexes(doc_ids)
    return matches