def __init__(self, capacity, error_rate=0.001):
    """Create an empty Bloom filter sized for *capacity* elements.

    A Bloom filter is a space-efficient probabilistic set: membership
    tests may yield false positives but the structure stays compact.

    capacity
        Number of elements the filter must be able to hold while the
        false-positive probability stays at or below *error_rate*.
        Inserting more than *capacity* elements greatly increases the
        chance of false positives. Must be greater than 0.
    error_rate
        Acceptable probability of a false positive, strictly between
        0 and 1. Together with *capacity* it determines how many bits
        the filter allocates.

    Raises ValueError when either argument is out of range.

    >>> b = BloomFilter(capacity=100000, error_rate=0.001)
    >>> b.add("test")
    False
    >>> "test" in b
    True
    """
    # Validate in the same order as the error messages are documented:
    # error_rate first, then capacity.
    if not (0 < error_rate < 1):
        raise ValueError("Error_Rate must be between 0 and 1.")
    if not capacity > 0:
        raise ValueError("Capacity must be > 0")

    # Sizing, with M = total bits, k = number of slices, m = bits per
    # slice (M = k * m), P = error_rate and n = capacity:
    #   k  = log2(1 / P)
    #   n ~= M * (ln(2)**2 / |ln(P)|)
    #      = (k * m) * (ln(2)**2 / |ln(P)|)
    # and solving for m:
    #   m ~= n * |ln(P)| / (k * ln(2)**2)
    slice_count = int(math.ceil(math.log(1.0 / error_rate, 2)))
    numerator = capacity * abs(math.log(error_rate))
    denominator = slice_count * (math.log(2) ** 2)
    slice_bits = int(math.ceil(numerator / denominator))

    # _setup records the parameters; the trailing 0 is passed as its
    # last positional argument for a freshly created filter.
    self._setup(error_rate, slice_count, slice_bits, capacity, 0)

    # Allocate the bit storage (self.num_bits is read after _setup) and
    # clear every bit.
    bits = bitarray.bitarray(self.num_bits, endian="little")
    bits.setall(False)
    self.bitarray = bits
def fromfile(cls, f, n=-1):
    """Read a bloom filter from file-object `f` serialized with
    ``BloomFilter.tofile``.

    f
        File-like object positioned at the start of a serialized
        filter: a fixed-size header described by ``cls.FILE_FMT``
        followed by the raw bit payload.
    n
        If `n` > 0, read only that many bytes in total (header
        included). Otherwise read everything up to the end of `f`.

    Returns the reconstructed filter instance.

    Raises ValueError if `n` is positive but smaller than the header,
    or if the number of bits read does not match the bit count
    announced by the header (allowing for padding up to the next whole
    byte).
    """
    headerlen = calcsize(cls.FILE_FMT)
    if 0 < n < headerlen:
        raise ValueError("n too small!")

    # Bogus instantiation: the real parameters come from the header and
    # are installed by `_setup` right away.
    bloom = cls(1)
    bloom._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
    bloom.bitarray = bitarray.bitarray(endian="little")

    # NOTE(review): is_string_io is defined elsewhere; presumably it
    # flags in-memory streams that bitarray.fromfile cannot consume, so
    # their bytes are read here and appended with frombytes instead.
    in_memory = is_string_io(f)
    if n > 0:
        payload_len = n - headerlen
        if in_memory:
            bloom.bitarray.frombytes(f.read(payload_len))
        else:
            bloom.bitarray.fromfile(f, payload_len)
    elif in_memory:
        bloom.bitarray.frombytes(f.read())
    else:
        bloom.bitarray.fromfile(f)

    # The payload is stored in whole bytes, so the bits read are either
    # exactly num_bits or num_bits rounded up to the next multiple of 8.
    # `-x % 8` is 0 when x is already byte-aligned, so an aligned filter
    # followed by a stray extra byte is rejected rather than accepted.
    padded_bits = bloom.num_bits + (-bloom.num_bits % 8)
    # len() replaces the deprecated bitarray.length() method.
    if len(bloom.bitarray) not in (bloom.num_bits, padded_bits):
        raise ValueError("Bit length mismatch!")

    return bloom