class BloomFilter():
    """Probabilistic membership filter stored in a ``BitSet``.

    Every configured hash function maps a value to one bit index (the hash
    taken modulo the filter's bit capacity). Adding a value sets those bits;
    a membership test succeeds only if all of them are set, so unrelated
    values that happen to share bit indexes can yield false positives.
    """

    def __init__(self, *hash_functions, **kwds):
        """
        @param hash_functions: Callables taking a value and returning an
            integer. When none are supplied, object_hash, object_repr_hash,
            object_str_hash and object_id are used.
        @param max_size: In bytes (passed by keyword, default 1024). It is
            rounded up to a whole multiple of the bitset's ``itemsize``.
        """
        self.__bitset = BitSet()
        if not hash_functions:
            hash_functions = (object_hash, object_repr_hash,
                              object_str_hash, object_id)
        self.__hash_functions = hash_functions
        max_size = kwds.get("max_size", 1024)
        itemsize = self.__bitset.itemsize
        # Round the byte budget up to whole bitset items, then convert bytes
        # to bits (<< 3 multiplies by 8).
        # NOTE(review): assumes ``itemsize`` is expressed in bytes -- confirm
        # against BitSet.
        self.__max_bits = int(itemsize * ceil(float(max_size) / itemsize)) << 3

    def extend(self, values):
        """Add every element of the iterable ``values`` to the filter."""
        # An explicit loop is required: on Python 3 ``map`` returns a lazy
        # iterator, so ``map(self.add, values)`` with its result discarded
        # never calls ``add`` at all.
        for value in values:
            self.add(value)

    def __get_bit_indexes_of(self, value):
        """Return a generator of one bit index per hash function."""
        return (hf(value) % self.__max_bits for hf in self.__hash_functions)

    def add(self, value):
        """Set the bits that ``value`` hashes to."""
        indexes_to_set = self.__get_bit_indexes_of(value)
        self.__bitset.set_indexes(indexes_to_set)

    def __contains__(self, value):
        """Return True when every bit ``value`` hashes to is set."""
        indexes_to_get = self.__get_bit_indexes_of(value)
        # Lazy generator: ``all`` stops at the first bit that is not set.
        bits = (index in self.__bitset for index in indexes_to_get)
        return all(bits)

    def __len__(self):
        """Return ``len`` of the underlying bitset divided by 8.

        ``__repr__`` labels this value as bytes, so ``len(bitset)`` is
        presumably a bit count -- confirm against BitSet.
        """
        return len(self.__bitset) >> 3

    def __repr__(self):
        return "BloomFilter (%s bytes): %s" % (len(self), self.__bitset)
def __init__(self, *hash_functions, **kwds):
    """
    Set up an empty bitset, the hash functions and the bit capacity.

    @param hash_functions: Callables applied to values; when none are given
        object_hash, object_repr_hash, object_str_hash and object_id are
        used instead.
    @param max_size: In bytes (passed by keyword, default 1024); rounded up
        to a whole multiple of the bitset's ``itemsize``.
    """
    self.__bitset = BitSet()
    # An empty tuple is falsy, so ``or`` selects the default hash functions
    # exactly when the caller passed none.
    self.__hash_functions = hash_functions or (
        object_hash,
        object_repr_hash,
        object_str_hash,
        object_id,
    )
    requested_bytes = kwds.get("max_size", 1024)
    unit = self.__bitset.itemsize
    # Number of bitset items needed to hold the requested size, rounded up.
    unit_count = ceil(float(requested_bytes) / unit)
    # << 3 multiplies by 8.
    # NOTE(review): reads as bytes -> bits, assuming ``itemsize`` is in
    # bytes -- confirm against BitSet.
    self.__max_bits = int(unit * unit_count) << 3
def do_assignment(repeats=10, size=5000, m=16):
    """Sample BitSets, report their non-dominated counts and log the median.

    For each of ``repeats`` rounds a ``BitSet(size, m)`` is built,
    ``find_dominated()`` is called on it, and ``len(no_dom)`` is recorded and
    printed. The median of the recorded counts is then printed and a summary
    is appended to ``generated.txt``.

    The summary labels ``m`` as "Bits in a string", ``size`` as "Set size"
    and ``repeats`` as "Number of sets".
    """
    print("Number of non-dominated BitStrings: ")
    counts = []
    for _ in range(repeats):
        sample = BitSet(size, m)
        sample.find_dominated()
        counts.append(len(sample.no_dom))
        # Flush after every value so progress is visible while sampling.
        print(counts[-1], sep=' ', end=' ', flush=True)
    print()

    estimate = median(counts)
    print("Approximation of E[M_n]: ", estimate)

    template = (
        "Bits in a string: {}\n"
        "Set size: {}\n"
        "Number of sets: {}\n"
        "Approximation of E[M_n]: {}\n"
        "----\n"
    )
    # Opened in append mode: results from earlier runs are kept.
    with open("generated.txt", "a+") as report:
        report.write(template.format(m, size, repeats, estimate))
def load_name_index(cls, bits):
    """Read a name index from the reader ``bits``.

    Layout consumed, in order: version, signature, age and GUID; a
    length-prefixed string buffer (skipped, its start remembered); an entry
    count and a maximum index; a "present" bitset and a "deleted" bitset;
    then one (string offset, index) pair of int32s per set bit of "present".
    Each string offset is resolved to a C string inside the string buffer.

    Returns a tuple ``(names, version, signature, age, guid)`` where
    ``names`` maps each upper-cased name to its integer index.

    Raises PdbDebugException if the "deleted" bitset is not empty or if the
    number of entries read differs from the stored count.
    """
    version = bits.read_int32()        # 0..3   Version
    signature = bits.read_int32()      # 4..7   Signature
    age = bits.read_int32()            # 8..11  Age
    guid = bits.read_guid()            # 12..27 GUID
    # NOTE: the version is not validated; a check against 20000404 existed
    # here only as commented-out code.

    # Skip the string buffer, remembering where it starts.
    strings_size = bits.read_int32()   # 28..31 Bytes of Strings
    strings_start = bits.position
    bits.position += strings_size

    # Read map index.
    expected = bits.read_int32()       # n+0..3 hash size.
    limit = bits.read_int32()          # n+4..7 maximum ni.
    present = BitSet(bits)
    deleted = BitSet(bits)
    if not deleted.is_empty():
        raise PdbDebugException('Unsupported PDB deleted bitset is not empty.')

    names = {}  # string -> int
    found = 0
    for slot in range(limit):
        if not present.is_set(slot):
            continue
        name_offset = bits.read_int32()
        name_index = bits.read_int32()
        # Jump into the string buffer for the name, then resume the entries.
        resume_at = bits.position
        bits.position = strings_start + name_offset
        name = bits.read_cstring()
        bits.position = resume_at
        names[name.upper()] = name_index
        found += 1

    if found != expected:
        raise PdbDebugException('Count mismatch. (%u != %u)' % (found, expected))
    return (names, version, signature, age, guid)
def load_name_index(cls, bits):
    """Parse the name index stored in ``bits``.

    Reads the header (version, signature, age, GUID), steps over the string
    buffer, reads the entry count, the maximum index and the "present" and
    "deleted" bitsets, and then reads one (string offset, index) pair for
    every bit set in "present".

    Returns ``(table, ver, sig, age, guid)``; ``table`` maps upper-cased
    names to their integer index.

    Raises PdbDebugException when the "deleted" bitset is not empty or when
    the number of entries read does not equal the stored count.
    """
    table = {}  # string -> int

    header_version = bits.read_int32()    # 0..3   Version
    header_signature = bits.read_int32()  # 4..7   Signature
    header_age = bits.read_int32()        # 8..11  Age
    header_guid = bits.read_guid()        # 12..27 GUID
    # NOTE: no version validation is performed; the check against 20000404
    # was present only as commented-out code.

    # Read string buffer.
    buffer_length = bits.read_int32()     # 28..31 Bytes of Strings
    buffer_origin = bits.position
    bits.position += buffer_length

    def name_at(offset):
        # Read the C string at ``offset`` inside the string buffer, leaving
        # the reader positioned where it was before the call.
        bookmark = bits.position
        bits.position = buffer_origin + offset
        text = bits.read_cstring()
        bits.position = bookmark
        return text

    # Read map index.
    stored_count = bits.read_int32()      # n+0..3 hash size.
    max_index = bits.read_int32()         # n+4..7 maximum ni.
    present = BitSet(bits)
    deleted = BitSet(bits)
    if not deleted.is_empty():
        raise PdbDebugException(
            'Unsupported PDB deleted bitset is not empty.')

    seen = 0
    for position in range(max_index):
        if present.is_set(position):
            string_offset = bits.read_int32()
            index_value = bits.read_int32()
            table[name_at(string_offset).upper()] = index_value
            seen += 1

    if seen != stored_count:
        raise PdbDebugException(
            'Count mismatch. (%u != %u)' % (seen, stored_count))
    return (table, header_version, header_signature, header_age, header_guid)
def one_set():
    """Build a default BitSet, call find_dominated() on it and print it."""
    candidate = BitSet()
    candidate.find_dominated()
    rendered = str(candidate)
    print(rendered)
def dotest(l):
    """Check that a BitSet reports back the indexes that were set on it.

    ``l`` is sorted in place, each of its elements is used as an index that
    is assigned 1 in a fresh BitSet, and ``get_bits()`` is asserted equal to
    the sorted list. ``self`` is taken from the enclosing scope.
    """
    # In-place sort: the caller's list is reordered too.
    l.sort()
    bits = BitSet()
    for position in l:
        bits[position] = 1
    reported = bits.get_bits()
    self.assertEqual(reported, l)
def find_matches(self, inverted_index):
    """Return a new BitSet with the matching document ids set.

    The ids come from
    ``inverted_index.get_doc_ids_with(self._field_id, self._value)``.
    """
    matches = BitSet()
    doc_ids = inverted_index.get_doc_ids_with(self._field_id, self._value)
    matches.set_indexes(doc_ids)
    return matches