Beispiel #1
0
 def __init__(self, threshold=0.9, num_perm=128, weights=(0.5,0.5),
              params=None, storage_config=None, prepickle=None):
     """Validate the arguments and create the backing storage.

     Args:
         threshold: Value in [0.0, 1.0]; passed to ``_optimal_param`` when
             ``params`` is not given.
         num_perm: Number of permutation functions, at least 2.
         weights: Pair ``(false_positive_weight, false_negative_weight)``;
             each in [0.0, 1.0] and summing to 1.0.
         params: Optional ``(b, r)`` pair used instead of calling
             ``_optimal_param``; ``b * r`` must not exceed ``num_perm``.
         storage_config: Dict with at least a ``'type'`` key, handed to
             ``unordered_storage`` / ``ordered_storage``. ``None`` means
             ``{'type': 'dict'}``.
         prepickle: When ``None``, defaults to ``True`` only for the
             ``'redis'`` storage type; otherwise stored as given.

     Raises:
         ValueError: If any argument fails the checks above.
     """
     # Local import keeps this block self-contained; struct is only needed
     # to build the per-table name suffix below.
     import struct

     # A fresh dict per call instead of a shared mutable default argument.
     if storage_config is None:
         storage_config = {'type': 'dict'}

     if threshold > 1.0 or threshold < 0.0:
         raise ValueError("threshold must be in [0.0, 1.0]")
     if num_perm < 2:
         raise ValueError("Too few permutation functions")
     if any(w < 0.0 or w > 1.0 for w in weights):
         raise ValueError("Weight must be in [0.0, 1.0]")
     if sum(weights) != 1.0:
         raise ValueError("Weights must sum to 1.0")
     self.h = num_perm
     if params is not None:
         self.b, self.r = params
         if self.b * self.r > num_perm:
             raise ValueError("The product of b and r must be less than num_perm")
     else:
         false_positive_weight, false_negative_weight = weights
         self.b, self.r = _optimal_param(threshold, num_perm,
                 false_positive_weight, false_negative_weight)
     if prepickle is None:
         self.prepickle = storage_config['type'] == 'redis'
     else:
         self.prepickle = prepickle
     basename = _random_name(11)
     # One storage per index in range(b). The suffix is a 2-byte big-endian
     # integer: bytes([i]) raises ValueError as soon as i reaches 256. The
     # basename is freshly random here, so the suffix format only has to be
     # unique within this instance.
     self.hashtables = [
         unordered_storage(storage_config,
                           name=basename + b'_bucket_' + struct.pack('>H', i))
         for i in range(self.b)]
     # Table i covers the half-open slice [i*r, (i+1)*r).
     self.hashranges = [(i*self.r, (i+1)*self.r) for i in range(self.b)]
     self.keys = ordered_storage(storage_config, name=basename + b'_keys')
Beispiel #2
0
    def __init__(self, threshold=0.9, num_perm=128, weights=(0.5, 0.5),
                 params=None, storage_config=None, prepickle=None):
        """Check the arguments, pick ``(b, r)`` and create the storages.

        Args:
            threshold: Value in [0.0, 1.0]; forwarded to ``_optimal_param``
                when ``params`` is not supplied.
            num_perm: Number of permutation functions, at least 2.
            weights: Pair ``(false_positive_weight, false_negative_weight)``,
                each in [0.0, 1.0], summing to 1.0.
            params: Optional ``(b, r)`` pair; ``b * r`` may not exceed
                ``num_perm``.
            storage_config: Storage settings with a ``'type'`` key and an
                optional ``'basename'``. A falsy value selects
                ``{'type': 'dict'}``.
            prepickle: When ``None``, enabled only for the ``'redis'``
                storage type; otherwise stored as given.

        Raises:
            ValueError: If any argument fails validation.
        """
        # Falsy configs (None, {}) fall back to the in-memory dict storage.
        storage_config = storage_config or {'type': 'dict'}
        self._buffer_size = 50000

        if threshold < 0.0 or threshold > 1.0:
            raise ValueError("threshold must be in [0.0, 1.0]")
        if num_perm < 2:
            raise ValueError("Too few permutation functions")
        for weight in weights:
            if weight < 0.0 or weight > 1.0:
                raise ValueError("Weight must be in [0.0, 1.0]")
        if sum(weights) != 1.0:
            raise ValueError("Weights must sum to 1.0")

        self.h = num_perm
        if params is None:
            fp_weight, fn_weight = weights
            self.b, self.r = _optimal_param(threshold, num_perm,
                                            fp_weight, fn_weight)
        else:
            self.b, self.r = params
            product = self.b * self.r
            if product > num_perm:
                template = ("The product of b and r in params is "
                            "{} * {} = {} -- it must be less than num_perm {}. "
                            "Did you forget to specify num_perm?")
                raise ValueError(
                    template.format(self.b, self.r, product, num_perm))

        if prepickle is None:
            self.prepickle = storage_config['type'] == 'redis'
        else:
            self.prepickle = prepickle

        # The random name is generated even when a basename is configured.
        basename = storage_config.get('basename', _random_name(11))

        # One unordered storage per index in range(b); the name suffix is
        # the index packed as a 2-byte big-endian unsigned integer.
        tables = []
        for idx in range(self.b):
            table_name = b''.join(
                [basename, b'_bucket_', struct.pack('>H', idx)])
            tables.append(unordered_storage(storage_config, name=table_name))
        self.hashtables = tables

        # Table idx covers the half-open slice [idx*r, (idx+1)*r).
        self.hashranges = [(idx * self.r, (idx + 1) * self.r)
                           for idx in range(self.b)]

        keys_name = b''.join([basename, b'_keys'])
        self.keys = ordered_storage(storage_config, name=keys_name)
Beispiel #3
0
        def __init__(self, config, name=None):
            """Build the Mongo client, collection handle and write buffer.

            Args:
                config: Mapping whose ``'type'`` must be ``'aiomongo'`` and
                    whose ``'mongo'`` entry is passed to ``_parse_config``.
                name: Optional storage name; a falsy value is replaced by a
                    random 11-character name decoded as UTF-8.
            """
            assert config['type'] == 'aiomongo', \
                'Storage type <{}> not supported'.format(config['type'])
            self._config = config
            self._mongo_param = self._parse_config(self._config['mongo'])

            if not name:
                name = _random_name(11).decode('utf-8')
            self._name = name
            self._collection_name = 'lsh_' + self._name

            # NOTE(review): the parsed settings are stored in
            # ``self._mongo_param`` but read back via ``self.mongo_param`` --
            # presumably a property defined elsewhere on the class; confirm.
            mongo_param = self.mongo_param

            if 'db' in mongo_param:
                db_lsh = mongo_param['db']
            else:
                db_lsh = 'db_0'

            # Pick the connection-string template from the keys present.
            if 'replica_set' in mongo_param:
                dsn_template = (
                    'mongodb://{replica_set_nodes}/?replicaSet={replica_set}')
            elif 'username' in mongo_param or 'password' in mongo_param:
                dsn_template = 'mongodb://{username}:{password}@{host}:{port}'
            else:
                dsn_template = 'mongodb://{host}:{port}'
            dsn = dsn_template.format(**mongo_param)

            self._batch_size = 1000
            self._mongo_client = motor.motor_asyncio.AsyncIOMotorClient(dsn)
            database = self._mongo_client[db_lsh]
            self._collection = database[self._collection_name]
            self._initialized = True
            self._buffer = AsyncMongoBuffer(self._collection, self._batch_size)
Beispiel #4
0
    def __init__(self,
                 threshold=0.9,
                 num_perm=128,
                 weights=(0.5, 0.5),
                 params=None,
                 storage_config=None,
                 prepickle=None):
        """Record the settings and derive ``(b, r)``; storages stay unset.

        Args:
            threshold: Value in [0.0, 1.0]; forwarded to ``_optimal_param``
                when ``params`` is not supplied.
            num_perm: Number of permutation functions, at least 2.
            weights: Pair ``(false_positive_weight, false_negative_weight)``,
                each in [0.0, 1.0], summing to 1.0.
            params: Optional ``(b, r)`` pair; ``b * r`` may not exceed
                ``num_perm``.
            storage_config: Storage settings with a ``'type'`` key. ``None``
                selects an ``'aioredis'`` config for ``localhost:6379``. A
                shallow copy is kept and given a ``'basename'`` entry if it
                lacks one.
            prepickle: When ``None``, enabled only for the ``'aioredis'``
                storage type; otherwise stored as given.

        Raises:
            ValueError: If any argument fails validation.
        """
        if storage_config is None:
            storage_config = {
                'type': 'aioredis',
                'redis': {'host': 'localhost', 'port': 6379},
            }

        # Work on a shallow copy so the caller's mapping does not gain a
        # 'basename' key. The random name is generated even if one is set.
        config_copy = storage_config.copy()
        self._storage_config = config_copy
        self._basename = config_copy.setdefault('basename', _random_name(11))

        self._batch_size = 10000
        self._threshold = threshold
        self._num_perm = num_perm
        self._weights = weights
        self._params = params
        if prepickle is None:
            self._prepickle = storage_config['type'] == 'aioredis'
        else:
            self._prepickle = prepickle

        if threshold < 0.0 or threshold > 1.0:
            raise ValueError("threshold must be in [0.0, 1.0]")
        if num_perm < 2:
            raise ValueError("Too few permutation functions")
        for weight in weights:
            if weight < 0.0 or weight > 1.0:
                raise ValueError("Weight must be in [0.0, 1.0]")
        if sum(weights) != 1.0:
            raise ValueError("Weights must sum to 1.0")

        self.h = num_perm
        if params is None:
            fp_weight, fn_weight = weights
            self.b, self.r = _optimal_param(threshold, num_perm,
                                            fp_weight, fn_weight)
        else:
            self.b, self.r = params
            if self.b * self.r > num_perm:
                raise ValueError(
                    "The product of b and r must be less than num_perm")

        # Slice idx covers the half-open range [idx*r, (idx+1)*r).
        self.hashranges = [(idx * self.r, (idx + 1) * self.r)
                           for idx in range(self.b)]

        # Storages are not created here; only placeholders are set.
        self.hashtables = None
        self.keys = None

        self._lock = asyncio.Lock()
        self._initialized = False
Beispiel #5
0
        def __init__(self, config, name=None):
            """Store the settings; no Redis connection is opened here.

            Args:
                config: Mapping whose ``'redis'`` entry is passed to
                    ``_parse_config``.
                name: Optional storage name; ``None`` is replaced by a
                    random 11-character name.
            """
            self.config = config
            self._batch_size = 50000
            self._redis_param = self._parse_config(self.config['redis'])

            self._name = _random_name(11) if name is None else name

            self._lock = asyncio.Lock()
            # Connection and buffer start out unset.
            self._redis = None
            self._buffer = None
            self._initialized = False
Beispiel #6
0
        def __init__(self, config, name=None):
            """Build the Mongo client, collection handle, index and buffer.

            Args:
                config: Mapping whose ``'type'`` must be ``'aiomongo'`` and
                    whose ``'mongo'`` entry is passed to ``_parse_config``.
                name: Optional storage name; a falsy value is replaced by a
                    random 11-character name decoded as UTF-8.
            """
            assert config['type'] == 'aiomongo', \
                'Storage type <{}> not supported'.format(config['type'])
            self._config = config
            self._mongo_param = self._parse_config(self._config['mongo'])

            if not name:
                name = _random_name(11).decode('utf-8')
            self._name = name

            # NOTE(review): the parsed settings are stored in
            # ``self._mongo_param`` but read back via ``self.mongo_param`` --
            # presumably a property defined elsewhere on the class; confirm.
            mongo_param = self.mongo_param

            # Collection name: explicit name, then prefix + storage name,
            # then the 'lsh_' default prefix.
            if 'collection_name' in mongo_param:
                collection_name = mongo_param['collection_name']
            elif 'collection_prefix' in mongo_param:
                collection_name = mongo_param['collection_prefix'] + self._name
            else:
                collection_name = 'lsh_' + self._name
            self._collection_name = collection_name

            db_lsh = mongo_param.get('db', 'db_0')

            # An explicit 'url' is used as-is; otherwise the connection
            # string is built from a template chosen by the keys present.
            if 'url' in mongo_param:
                dsn = mongo_param['url']
            else:
                if 'replica_set' in mongo_param:
                    dsn_template = (
                        'mongodb://{replica_set_nodes}/'
                        '?replicaSet={replica_set}')
                elif 'username' in mongo_param or 'password' in mongo_param:
                    dsn_template = (
                        'mongodb://{username}:{password}@{host}:{port}')
                else:
                    dsn_template = 'mongodb://{host}:{port}'
                dsn = dsn_template.format(**mongo_param)

            # Extra keyword arguments forwarded to the client constructor.
            additional_args = mongo_param.get('args', {})

            self._batch_size = 1000
            client = motor.motor_asyncio.AsyncIOMotorClient(
                dsn, **additional_args)
            self._mongo_client = client
            database = client.get_default_database(db_lsh)
            self._collection = database.get_collection(self._collection_name)
            # NOTE(review): the return value of create_index is discarded
            # here -- confirm that is intended for this client.
            self._collection.create_index("key", background=True)

            self._initialized = True
            self._buffer = AsyncMongoBuffer(self._collection, self._batch_size)
Beispiel #7
0
 def __init__(self,
              threshold=0.9,
              num_perm=128,
              num_part=16,
              m=8,
              weights=(0.5, 0.5),
              storage_config=None,
              prepickle=None):
     """Validate the arguments and build one set of indexes per partition.

     Args:
         threshold: Value in [0.0, 1.0]; stored as ``self.threshold``.
         num_perm: Number of permutation functions, at least 2.
         num_part: Number of partitions, at least 1.
         m: Value in ``[2, num_perm]``; stored as ``self.m``.
         weights: Sequence of values in [0.0, 1.0] summing to 1.0; passed
             to ``_init_optimal_params``.
         storage_config: Storage settings with an optional ``'basename'``.
             A falsy value selects ``{'type': 'dict'}``.
         prepickle: Forwarded unchanged to every ``MinHashLSH``.

     Raises:
         ValueError: If any argument fails validation.
     """
     if threshold < 0.0 or threshold > 1.0:
         raise ValueError("threshold must be in [0.0, 1.0]")
     if num_perm < 2:
         raise ValueError("Too few permutation functions")
     if num_part < 1:
         raise ValueError("num_part must be at least 1")
     if m < 2 or m > num_perm:
         raise ValueError("m must be in the range of [2, num_perm]")
     for weight in weights:
         if weight < 0.0 or weight > 1.0:
             raise ValueError("Weight must be in [0.0, 1.0]")
     if sum(weights) != 1.0:
         raise ValueError("Weights must sum to 1.0")

     self.threshold = threshold
     self.h = num_perm
     self.m = m
     rs = self._init_optimal_params(weights)

     # Falsy configs (None, {}) fall back to the in-memory dict storage.
     storage_config = storage_config or {'type': 'dict'}
     # The random name is generated even when a basename is configured.
     basename = storage_config.get('basename', _random_name(11))

     # For each partition, map every r in rs to its own MinHashLSH built
     # with params (int(h / r), r) and a per-(partition, r) storage config.
     indexes = []
     for partition in range(num_part):
         by_r = {
             r: MinHashLSH(num_perm=self.h,
                           params=(int(self.h / r), r),
                           storage_config=self._get_storage_config(
                               basename, storage_config, partition, r),
                           prepickle=prepickle)
             for r in rs
         }
         indexes.append(by_r)
     self.indexes = indexes

     # One lower and one upper slot per partition, initially unset.
     self.lowers = [None] * len(indexes)
     self.uppers = [None] * len(indexes)