Beispiel #1
0
 def __init__(self, threshold=0.9, num_perm=128, weights=(0.5,0.5),
              params=None, storage_config=None, prepickle=None):
     """Validate the arguments, choose the band layout and create storage.

     Args:
         threshold: Threshold in [0.0, 1.0]; forwarded to
             ``_optimal_param`` when ``params`` is not given.
         num_perm: Number of permutation functions; must be at least 2.
         weights: ``(false_positive_weight, false_negative_weight)``.
             Each weight must be in [0.0, 1.0] and they must sum to 1.0.
             Forwarded to ``_optimal_param`` when ``params`` is not given.
         params: Optional explicit ``(b, r)`` pair (number of bands, rows
             per band). ``b * r`` must not exceed ``num_perm``.
         storage_config: Configuration dict handed to
             ``unordered_storage`` and ``ordered_storage``. Defaults to a
             fresh ``{'type': 'dict'}`` for every call.
         prepickle: Value stored as ``self.prepickle``. When None it is
             True only if the storage type is ``'redis'``.

     Raises:
         ValueError: If any argument fails the checks described above.
     """
     # Function-scope imports keep this block self-contained.
     import math
     import struct

     # A dict literal as default would be one object shared by all calls.
     if storage_config is None:
         storage_config = {'type': 'dict'}

     if threshold > 1.0 or threshold < 0.0:
         raise ValueError("threshold must be in [0.0, 1.0]")
     if num_perm < 2:
         raise ValueError("Too few permutation functions")
     if any(w < 0.0 or w > 1.0 for w in weights):
         raise ValueError("Weight must be in [0.0, 1.0]")
     # Tolerant comparison: exact float equality can reject weights whose
     # sum differs from 1.0 only by rounding error.
     if not math.isclose(sum(weights), 1.0):
         raise ValueError("Weights must sum to 1.0")

     self.h = num_perm
     if params is not None:
         self.b, self.r = params
         if self.b * self.r > num_perm:
             raise ValueError(
                 "The product of b and r must be less than or equal to num_perm")
     else:
         false_positive_weight, false_negative_weight = weights
         self.b, self.r = _optimal_param(threshold, num_perm,
                 false_positive_weight, false_negative_weight)

     if prepickle is None:
         self.prepickle = storage_config['type'] == 'redis'
     else:
         self.prepickle = prepickle

     basename = _random_name(11)
     # The band index is packed as an unsigned 16-bit big-endian integer;
     # a single byte (bytes([i])) fails as soon as there are more than
     # 256 bands.
     self.hashtables = [
         unordered_storage(storage_config,
                           name=basename + b'_bucket_' + struct.pack('>H', i))
         for i in range(self.b)]
     # Band i covers the half-open slice [i*r, (i+1)*r).
     self.hashranges = [(i*self.r, (i+1)*self.r) for i in range(self.b)]
     self.keys = ordered_storage(storage_config, name=basename + b'_keys')
Beispiel #2
0
    def __init__(self, threshold=0.9, num_perm=128, weights=(0.5, 0.5),
                 params=None, storage_config=None, prepickle=None):
        """Validate the arguments, choose the band layout and create storage.

        Args:
            threshold: Threshold in [0.0, 1.0]; forwarded to
                ``_optimal_param`` when ``params`` is not given.
            num_perm: Number of permutation functions; must be at least 2.
            weights: ``(false_positive_weight, false_negative_weight)``.
                Each weight must be in [0.0, 1.0] and they must sum to 1.0.
                Forwarded to ``_optimal_param`` when ``params`` is not given.
            params: Optional explicit ``(b, r)`` pair (number of bands, rows
                per band). ``b * r`` must not exceed ``num_perm``.
            storage_config: Configuration dict handed to
                ``unordered_storage`` and ``ordered_storage``. A falsy value
                selects ``{'type': 'dict'}``. An optional ``'basename'``
                entry (bytes) replaces the random storage name prefix.
            prepickle: Value stored as ``self.prepickle``. When None it is
                True only if the storage type is ``'redis'``.

        Raises:
            ValueError: If any argument fails the checks described above.
        """
        # Function-scope import keeps this block self-contained.
        import math

        storage_config = {'type': 'dict'} if not storage_config else storage_config
        self._buffer_size = 50000
        if threshold > 1.0 or threshold < 0.0:
            raise ValueError("threshold must be in [0.0, 1.0]")
        if num_perm < 2:
            raise ValueError("Too few permutation functions")
        if any(w < 0.0 or w > 1.0 for w in weights):
            raise ValueError("Weight must be in [0.0, 1.0]")
        # Tolerant comparison: exact float equality can reject weights whose
        # sum differs from 1.0 only by rounding error.
        if not math.isclose(sum(weights), 1.0):
            raise ValueError("Weights must sum to 1.0")
        self.h = num_perm
        if params is not None:
            self.b, self.r = params
            if self.b * self.r > num_perm:
                # Equality is allowed by the check, so the message says
                # "must not exceed" rather than "less than".
                raise ValueError("The product of b and r in params is "
                        "{} * {} = {} -- it must not exceed num_perm {}. "
                        "Did you forget to specify num_perm?".format(
                            self.b, self.r, self.b*self.r, num_perm))
        else:
            false_positive_weight, false_negative_weight = weights
            self.b, self.r = _optimal_param(threshold, num_perm,
                    false_positive_weight, false_negative_weight)

        self.prepickle = storage_config['type'] == 'redis' if prepickle is None else prepickle

        basename = storage_config.get('basename', _random_name(11))
        # One unordered storage per band; the band index is appended to the
        # name as an unsigned 16-bit big-endian integer.
        self.hashtables = [
            unordered_storage(storage_config, name=b''.join([basename, b'_bucket_', struct.pack('>H', i)]))
            for i in range(self.b)]
        # Band i covers the half-open slice [i*r, (i+1)*r).
        self.hashranges = [(i*self.r, (i+1)*self.r) for i in range(self.b)]
        self.keys = ordered_storage(storage_config, name=b''.join([basename, b'_keys']))
Beispiel #3
0
    def _init_hashtables(self):
        """Create ``self.num_hashtables`` dict-backed unordered storages.

        The result is stored on ``self.hashtables`` as a list of the form
        ``[storage1, storage2, ...]``.
        """
        tables = []
        for _ in range(self.num_hashtables):
            # Every table gets its own configuration dict and storage object.
            tables.append(unordered_storage({'type': 'dict'}))
        self.hashtables = tables
Beispiel #4
0
 async def get_subset_counts(self, *keys):
     """
     Build temporary per-band tables for the given keys and return the
     ``itemcounts()`` of each table.

     see :class:`datasketch.MinHashLSH`.
     """
     # Duplicate keys are collapsed before the lookup.
     unique_keys = list(set(keys))
     buckets = []
     for _ in range(self.b):
         buckets.append(unordered_storage({'type': 'dict'}))
     stored = await self.keys.getmany(*unique_keys)
     for key, band_hashes in zip(unique_keys, stored):
         # Pair the hashes stored for this key with the temporary tables
         # position by position.
         for band_hash, bucket in zip(band_hashes, buckets):
             bucket.insert(band_hash, key)
     counts = []
     for bucket in buckets:
         counts.append(bucket.itemcounts())
     return counts
Beispiel #5
0
    def get_subset_counts(self, *keys):
        """
        Return the bucket allocation counts (see :ref:`get_counts`)
        restricted to the given keys.

        Args:
            keys (hashable) : the keys for which to get the bucket allocation
                counts. Duplicates are collapsed before the lookup.

        Returns:
            A list with one ``itemcounts()`` result per band.
        """
        # With prepickle enabled the keys are pickled before the lookup.
        lookup_keys = (
            [pickle.dumps(key) for key in set(keys)]
            if self.prepickle
            else list(set(keys))
        )
        buckets = []
        for _ in range(self.b):
            buckets.append(unordered_storage({'type': 'dict'}))
        stored = self.keys.getmany(*lookup_keys)
        for key, band_hashes in zip(lookup_keys, stored):
            # Pair the hashes stored for this key with the temporary tables
            # position by position.
            for band_hash, bucket in zip(band_hashes, buckets):
                bucket.insert(band_hash, key)
        counts = []
        for bucket in buckets:
            counts.append(bucket.itemcounts())
        return counts
    def __init__(self, threshold=0.9, num_perm=128, params=None):
        """Validate the arguments and create dict-backed storage per band.

        Args:
            threshold: Must be in [0.0, 1.0]. It is only validated here.
            num_perm: Number of permutation functions; must be at least 2.
            params: ``(b, r)`` pair (number of bands, rows per band).
                ``b * r`` must not exceed ``num_perm``. This method has no
                fallback for None: unless ``b`` and ``r`` are already
                available on the instance, omitting ``params`` is an error.

        Raises:
            ValueError: If ``threshold`` or ``num_perm`` is out of range, if
                ``b * r`` exceeds ``num_perm``, or if no band layout is
                available.
        """
        self._buffer_size = 50000
        storage_config = {'type': 'dict'}

        if threshold > 1.0 or threshold < 0.0:
            raise ValueError("threshold must be in [0.0, 1.0]")
        if num_perm < 2:
            raise ValueError("Too few permutation functions")

        # Band and rows.
        self.h = num_perm
        if params is not None:
            self.b, self.r = params
            if self.b * self.r > num_perm:
                raise ValueError(
                    "The product of b and r must not exceed num_perm")
        elif not (hasattr(self, 'b') and hasattr(self, 'r')):
            # Without this check the code below fails with an opaque
            # AttributeError on self.b. The hasattr test keeps working for
            # classes that define b and r elsewhere.
            raise ValueError(
                "params=(b, r) must be provided: no band layout is available")

        # Hashtables: one unordered storage per band; band i covers the
        # half-open slice [i*r, (i+1)*r).
        self.hashtables = [unordered_storage(storage_config) for _ in range(self.b)]
        self.hashranges = [(i*self.r, (i+1)*self.r) for i in range(self.b)]
        self.keys = ordered_storage(storage_config)