def __init__(self, threshold=0.9, num_perm=128, weights=(0.5, 0.5),
             params=None, storage_config=None, prepickle=None):
    """Validate the arguments, choose the band layout and create the storage.

    Args:
        threshold (float): Value in [0.0, 1.0]. Forwarded to
            ``_optimal_param`` when ``params`` is not given.
        num_perm (int): Number of permutation functions; must be >= 2.
        weights (tuple): ``(false_positive_weight, false_negative_weight)``.
            Each weight must lie in [0.0, 1.0] and together they must sum
            to 1.0 (within floating-point tolerance). Only forwarded to
            ``_optimal_param`` when ``params`` is not given.
        params (tuple, optional): Explicit ``(b, r)`` pair. ``b * r`` must
            not exceed ``num_perm``.
        storage_config (dict, optional): Configuration handed to
            ``unordered_storage`` / ``ordered_storage``; its ``'type'``
            entry is also read here. Defaults to ``{'type': 'dict'}``.
        prepickle (bool, optional): Stored as ``self.prepickle``. When
            None, it is set to True only if ``storage_config['type']`` is
            ``'redis'``.

    Raises:
        ValueError: If ``threshold``, ``num_perm``, ``weights`` or
            ``params`` fail the checks described above.
    """
    # Function-scope imports keep this block self-contained.
    import math
    import struct

    # A fresh dict per call instead of one mutable default shared by all
    # instances.
    if storage_config is None:
        storage_config = {'type': 'dict'}

    if threshold > 1.0 or threshold < 0.0:
        raise ValueError("threshold must be in [0.0, 1.0]")
    if num_perm < 2:
        raise ValueError("Too few permutation functions")
    if any(w < 0.0 or w > 1.0 for w in weights):
        raise ValueError("Weight must be in [0.0, 1.0]")
    # Exact float equality rejects legitimate weights whose sum is off by
    # one rounding step (e.g. (0.3 * 3, 0.1)); compare with a tolerance.
    if not math.isclose(sum(weights), 1.0):
        raise ValueError("Weights must sum to 1.0")

    self.h = num_perm
    if params is not None:
        self.b, self.r = params
        if self.b * self.r > num_perm:
            raise ValueError("The product of b and r must be less than num_perm")
    else:
        false_positive_weight, false_negative_weight = weights
        self.b, self.r = _optimal_param(threshold, num_perm,
                                        false_positive_weight,
                                        false_negative_weight)

    if prepickle is None:
        self.prepickle = storage_config['type'] == 'redis'
    else:
        self.prepickle = prepickle

    basename = _random_name(11)
    # The band index is packed as an unsigned 16-bit big-endian value:
    # bytes([i]) raises ValueError as soon as i reaches 256.
    self.hashtables = [
        unordered_storage(
            storage_config,
            name=basename + b'_bucket_' + struct.pack('>H', i))
        for i in range(self.b)]
    # Half-open (start, end) index pair for each of the b bands.
    self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
    self.keys = ordered_storage(storage_config, name=basename + b'_keys')
def __init__(self, threshold=0.9, num_perm=128, weights=(0.5, 0.5),
             params=None, storage_config=None, prepickle=None):
    """Validate the arguments, choose the band layout and create the storage.

    Args:
        threshold (float): Value in [0.0, 1.0]. Forwarded to
            ``_optimal_param`` when ``params`` is not given.
        num_perm (int): Number of permutation functions; must be >= 2.
        weights (tuple): ``(false_positive_weight, false_negative_weight)``.
            Each weight must lie in [0.0, 1.0] and together they must sum
            to 1.0 (within floating-point tolerance). Only forwarded to
            ``_optimal_param`` when ``params`` is not given.
        params (tuple, optional): Explicit ``(b, r)`` pair. ``b * r`` must
            not exceed ``num_perm``.
        storage_config (dict, optional): Configuration handed to
            ``unordered_storage`` / ``ordered_storage``. Its ``'type'``
            entry and optional ``'basename'`` entry are read here. A falsy
            value is replaced by ``{'type': 'dict'}``.
        prepickle (bool, optional): Stored as ``self.prepickle``. When
            None, it is set to True only if ``storage_config['type']`` is
            ``'redis'``.

    Raises:
        ValueError: If ``threshold``, ``num_perm``, ``weights`` or
            ``params`` fail the checks described above.
    """
    # Function-scope import keeps this block self-contained.
    import math

    storage_config = {'type': 'dict'} if not storage_config else storage_config
    self._buffer_size = 50000

    if threshold > 1.0 or threshold < 0.0:
        raise ValueError("threshold must be in [0.0, 1.0]")
    if num_perm < 2:
        raise ValueError("Too few permutation functions")
    if any(w < 0.0 or w > 1.0 for w in weights):
        raise ValueError("Weight must be in [0.0, 1.0]")
    # Exact float equality rejects legitimate weights whose sum is off by
    # one rounding step (e.g. (0.3 * 3, 0.1)); compare with a tolerance.
    if not math.isclose(sum(weights), 1.0):
        raise ValueError("Weights must sum to 1.0")

    self.h = num_perm
    if params is not None:
        self.b, self.r = params
        if self.b * self.r > num_perm:
            raise ValueError("The product of b and r in params is "
                             "{} * {} = {} -- it must be less than num_perm {}. "
                             "Did you forget to specify num_perm?".format(
                                 self.b, self.r, self.b*self.r, num_perm))
    else:
        false_positive_weight, false_negative_weight = weights
        self.b, self.r = _optimal_param(threshold, num_perm,
                                        false_positive_weight,
                                        false_negative_weight)

    self.prepickle = storage_config['type'] == 'redis' if prepickle is None else prepickle

    # A caller-supplied basename wins; otherwise a random one is used.
    basename = storage_config.get('basename', _random_name(11))
    # One unordered storage per band; the band index is packed as an
    # unsigned 16-bit big-endian value to form the storage name.
    self.hashtables = [
        unordered_storage(
            storage_config,
            name=b''.join([basename, b'_bucket_', struct.pack('>H', i)]))
        for i in range(self.b)]
    # Half-open (start, end) index pair for each of the b bands.
    self.hashranges = [(i*self.r, (i+1)*self.r) for i in range(self.b)]
    self.keys = ordered_storage(storage_config, name=b''.join([basename, b'_keys']))
def _init_hashtables(self):
    """
    Populate ``self.hashtables`` with a list of ``self.num_hashtables``
    unordered storages, each created from the ``{'type': 'dict'}`` config,
    i.e. "[storage1, storage2, ...]".
    """
    tables = []
    for _ in range(self.num_hashtables):
        tables.append(unordered_storage({'type': 'dict'}))
    self.hashtables = tables
async def get_subset_counts(self, *keys):
    """
    Return per-hashtable item counts restricted to the given keys.

    See :class:`datasketch.MinHashLSH`.

    The keys are de-duplicated, their stored entries are fetched from
    ``self.keys`` and re-inserted into ``self.b`` temporary storages; the
    result is the list of ``itemcounts()`` of those storages.
    """
    unique_keys = list(set(keys))

    # Scratch storages, one per band, used only for counting.
    scratch_tables = []
    for _ in range(self.b):
        scratch_tables.append(unordered_storage({'type': 'dict'}))

    stored_entries = await self.keys.getmany(*unique_keys)
    for key, entry in zip(unique_keys, stored_entries):
        for value, table in zip(entry, scratch_tables):
            table.insert(value, key)

    counts = []
    for table in scratch_tables:
        counts.append(table.itemcounts())
    return counts
def get_subset_counts(self, *keys):
    '''
    Returns the bucket allocation counts restricted to the given keys.

    The keys are de-duplicated (and pickled first when ``self.prepickle``
    is set), their stored entries are fetched from ``self.keys`` and
    re-inserted into ``self.b`` temporary storages.

    Args:
        keys (hashable) : the keys for which to get the bucket allocation
            counts

    Returns:
        list: the ``itemcounts()`` result of each temporary storage.
    '''
    unique_keys = set(keys)
    if self.prepickle:
        lookup_keys = [pickle.dumps(k) for k in unique_keys]
    else:
        lookup_keys = list(unique_keys)

    # Scratch storages, one per band, used only for counting.
    scratch_tables = []
    for _ in range(self.b):
        scratch_tables.append(unordered_storage({'type': 'dict'}))

    stored_entries = self.keys.getmany(*lookup_keys)
    for key, entry in zip(lookup_keys, stored_entries):
        for value, table in zip(entry, scratch_tables):
            table.insert(value, key)

    counts = []
    for table in scratch_tables:
        counts.append(table.itemcounts())
    return counts
def __init__(self, threshold=0.9, num_perm=128, params=None):
    """Validate the arguments, choose the band layout and create the storage.

    Args:
        threshold (float): Value in [0.0, 1.0]. Used to derive the band
            layout when ``params`` is not given.
        num_perm (int): Number of permutation functions; must be >= 2.
        params (tuple, optional): Explicit ``(b, r)`` pair. ``b * r`` must
            not exceed ``num_perm``.

    Raises:
        ValueError: If ``threshold``, ``num_perm`` or ``params`` fail the
            checks described above.
    """
    self._buffer_size = 50000
    storage_config = {'type': 'dict'}

    if threshold > 1.0 or threshold < 0.0:
        raise ValueError("threshold must be in [0.0, 1.0]")
    if num_perm < 2:
        raise ValueError("Too few permutation functions")

    # Band and rows.
    self.h = num_perm
    if params is not None:
        self.b, self.r = params
        if self.b * self.r > num_perm:
            raise ValueError("Band and Row must less than num_perm")
    else:
        # Previously self.b / self.r were never assigned on this path, so
        # the default params=None failed with AttributeError below and the
        # validated threshold was never used.
        # NOTE(review): assumes the module-level helper
        # _optimal_param(threshold, num_perm, fp_weight, fn_weight) is
        # available in this file; equal weights are used -- confirm.
        self.b, self.r = _optimal_param(threshold, num_perm, 0.5, 0.5)

    # Hashtables
    self.hashtables = [unordered_storage(storage_config) for _ in range(self.b)]
    # Half-open (start, end) index pair for each of the b bands.
    self.hashranges = [(i*self.r, (i+1)*self.r) for i in range(self.b)]
    self.keys = ordered_storage(storage_config)