def __init__(self, threshold=0.9, num_perm=128, weights=(0.5, 0.5),
             params=None, storage_config=None, prepickle=None):
    """Validate the settings, derive ``(b, r)`` and create the storage.

    Args:
        threshold: Must lie in [0.0, 1.0]; forwarded to
            ``_optimal_param`` when ``params`` is not given.
        num_perm: Number of permutation functions, at least 2. Stored as
            ``self.h``.
        weights: ``(false_positive_weight, false_negative_weight)``. Each
            value must lie in [0.0, 1.0] and the two must sum to 1.0.
        params: Optional explicit ``(b, r)`` pair; ``b * r`` may not
            exceed ``num_perm``. When omitted, the pair is computed by
            ``_optimal_param``.
        storage_config: Mapping with at least a ``'type'`` key; it is
            handed to ``unordered_storage`` / ``ordered_storage``.
            ``None`` means ``{'type': 'dict'}``.
        prepickle: Value stored as ``self.prepickle``. ``None`` selects
            ``True`` when the storage type is ``'redis'`` and ``False``
            otherwise.

    Raises:
        ValueError: If any of the range checks described above fails.
    """
    # Build the default per call: a dict literal in the signature is one
    # object shared by every instance created without an explicit config.
    if storage_config is None:
        storage_config = {'type': 'dict'}

    if threshold > 1.0 or threshold < 0.0:
        raise ValueError("threshold must be in [0.0, 1.0]")
    if num_perm < 2:
        raise ValueError("Too few permutation functions")
    if any(w < 0.0 or w > 1.0 for w in weights):
        raise ValueError("Weight must be in [0.0, 1.0]")
    if sum(weights) != 1.0:
        raise ValueError("Weights must sum to 1.0")

    self.h = num_perm
    if params is not None:
        self.b, self.r = params
        if self.b * self.r > num_perm:
            raise ValueError("The product of b and r must be less than num_perm")
    else:
        false_positive_weight, false_negative_weight = weights
        self.b, self.r = _optimal_param(threshold, num_perm,
                                        false_positive_weight,
                                        false_negative_weight)

    if prepickle is None:
        self.prepickle = storage_config['type'] == 'redis'
    else:
        self.prepickle = prepickle

    basename = _random_name(11)
    # A one-byte index reproduces the previous names exactly for up to 256
    # tables. Larger layouts used to fail inside ``bytes([i])``; they now
    # get a fixed-width two-byte big-endian index for every table.
    index_width = 1 if self.b <= 256 else 2
    self.hashtables = [
        unordered_storage(
            storage_config,
            name=basename + b'_bucket_' + i.to_bytes(index_width, 'big'))
        for i in range(self.b)]
    # Table i covers the half-open index range [i * r, (i + 1) * r).
    self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
    self.keys = ordered_storage(storage_config, name=basename + b'_keys')
def __init__(self, threshold=0.9, num_perm=128, weights=(0.5, 0.5),
             params=None, storage_config=None, prepickle=None):
    """Check the arguments, settle on ``(b, r)`` and open the storage.

    Args:
        threshold: Value in [0.0, 1.0]; given to ``_optimal_param`` when
            ``params`` is absent.
        num_perm: Number of permutation functions (minimum 2); kept as
            ``self.h``.
        weights: ``(false_positive_weight, false_negative_weight)``; each
            within [0.0, 1.0], summing to 1.0.
        params: Optional ``(b, r)`` pair whose product may not exceed
            ``num_perm``.
        storage_config: Mapping with a ``'type'`` key and an optional
            ``'basename'``; a falsy value is replaced by
            ``{'type': 'dict'}``.
        prepickle: Stored as ``self.prepickle``; ``None`` means "true only
            for the ``'redis'`` storage type".

    Raises:
        ValueError: When one of the checks fails.
    """
    if not storage_config:
        storage_config = {'type': 'dict'}
    self._buffer_size = 50000

    # Argument checks; both comparisons are kept separate so the outcome
    # for unordered values (e.g. NaN) stays what it always was.
    if threshold < 0.0 or threshold > 1.0:
        raise ValueError("threshold must be in [0.0, 1.0]")
    if num_perm < 2:
        raise ValueError("Too few permutation functions")
    for weight in weights:
        if weight < 0.0 or weight > 1.0:
            raise ValueError("Weight must be in [0.0, 1.0]")
    if sum(weights) != 1.0:
        raise ValueError("Weights must sum to 1.0")

    self.h = num_perm
    if params is None:
        fp_weight, fn_weight = weights
        self.b, self.r = _optimal_param(threshold, num_perm,
                                        fp_weight, fn_weight)
    else:
        self.b, self.r = params
        product = self.b * self.r
        if product > num_perm:
            message = ("The product of b and r in params is "
                       "{} * {} = {} -- it must be less than num_perm {}. "
                       "Did you forget to specify num_perm?")
            raise ValueError(
                message.format(self.b, self.r, self.b*self.r, num_perm))

    if prepickle is None:
        self.prepickle = storage_config['type'] == 'redis'
    else:
        self.prepickle = prepickle

    # The fallback name is generated unconditionally, exactly as before,
    # even when the configuration already carries a 'basename'.
    generated_name = _random_name(11)
    basename = storage_config.get('basename', generated_name)

    tables = []
    for index in range(self.b):
        # Table names end in the index packed as a big-endian unsigned short.
        table_name = b''.join(
            (basename, b'_bucket_', struct.pack('>H', index)))
        tables.append(unordered_storage(storage_config, name=table_name))
    self.hashtables = tables

    # Table i is paired with the half-open index range [i * r, (i + 1) * r).
    self.hashranges = [(index * self.r, (index + 1) * self.r)
                       for index in range(self.b)]
    self.keys = ordered_storage(storage_config,
                                name=b''.join((basename, b'_keys')))
def __init__(self, config, name=None):
    """Create the Mongo client, collection handle and write buffer.

    Args:
        config: Mapping whose ``'type'`` must equal ``'aiomongo'`` and
            whose ``'mongo'`` entry is run through ``self._parse_config``.
        name: Optional storage name; a falsy value is replaced by
            ``_random_name(11)`` decoded as UTF-8. The collection is
            called ``'lsh_' + name``.

    Raises:
        AssertionError: If ``config['type']`` is not ``'aiomongo'``.
    """
    storage_type = config['type']
    assert storage_type == 'aiomongo', \
        'Storage type <{}> not supported'.format(config['type'])

    self._config = config
    self._mongo_param = self._parse_config(self._config['mongo'])
    self._name = name or _random_name(11).decode('utf-8')
    self._collection_name = 'lsh_' + self._name

    # NOTE(review): ``mongo_param`` (no underscore) is presumably a property
    # defined elsewhere on the class that exposes ``_mongo_param`` - confirm.
    settings = self.mongo_param

    if 'db' in settings:
        database_name = settings['db']
    else:
        database_name = 'db_0'

    # Choose the connection-string template; the first matching rule wins.
    if 'replica_set' in settings:
        template = 'mongodb://{replica_set_nodes}/?replicaSet={replica_set}'
    elif 'username' in settings or 'password' in settings:
        template = 'mongodb://{username}:{password}@{host}:{port}'
    else:
        template = 'mongodb://{host}:{port}'
    dsn = template.format(**settings)

    self._batch_size = 1000
    self._mongo_client = motor.motor_asyncio.AsyncIOMotorClient(dsn)
    database = self._mongo_client[database_name]
    self._collection = database[self._collection_name]
    self._initialized = True
    self._buffer = AsyncMongoBuffer(self._collection, self._batch_size)
def __init__(self, threshold=0.9, num_perm=128, weights=(0.5, 0.5),
             params=None, storage_config=None, prepickle=None):
    """Record the settings and compute ``(b, r)``; storage is not opened.

    ``self.hashtables`` and ``self.keys`` are left as ``None`` and
    ``self._initialized`` as ``False`` by this constructor.

    Args:
        threshold: Value in [0.0, 1.0].
        num_perm: Number of permutation functions (minimum 2); also
            stored as ``self.h``.
        weights: ``(false_positive_weight, false_negative_weight)``; each
            within [0.0, 1.0], summing to 1.0.
        params: Optional ``(b, r)`` pair whose product may not exceed
            ``num_perm``.
        storage_config: Mapping with a ``'type'`` key. ``None`` selects an
            ``'aioredis'`` configuration for ``localhost:6379``. A shallow
            copy is kept, with a ``'basename'`` entry added if missing.
        prepickle: Stored as ``self._prepickle``; ``None`` means "true
            only for the ``'aioredis'`` storage type".

    Raises:
        ValueError: When one of the checks fails.
    """
    if storage_config is None:
        redis_target = {'host': 'localhost', 'port': 6379}
        storage_config = {'type': 'aioredis', 'redis': redis_target}

    # Work on a shallow copy so the caller's mapping does not gain a
    # 'basename' key. The fallback name is generated unconditionally.
    own_config = storage_config.copy()
    fallback_name = _random_name(11)
    own_config.setdefault('basename', fallback_name)
    self._storage_config = own_config
    self._basename = own_config['basename']

    self._batch_size = 10000
    self._threshold = threshold
    self._num_perm = num_perm
    self._weights = weights
    self._params = params
    if prepickle is None:
        self._prepickle = storage_config['type'] == 'aioredis'
    else:
        self._prepickle = prepickle

    # Validation runs after the raw values have been stored on the instance.
    if self._threshold < 0.0 or self._threshold > 1.0:
        raise ValueError("threshold must be in [0.0, 1.0]")
    if self._num_perm < 2:
        raise ValueError("Too few permutation functions")
    for weight in self._weights:
        if weight < 0.0 or weight > 1.0:
            raise ValueError("Weight must be in [0.0, 1.0]")
    if sum(self._weights) != 1.0:
        raise ValueError("Weights must sum to 1.0")

    self.h = self._num_perm
    if self._params is None:
        fp_weight, fn_weight = self._weights
        self.b, self.r = _optimal_param(self._threshold, self._num_perm,
                                        fp_weight, fn_weight)
    else:
        self.b, self.r = self._params
        if self.b * self.r > self._num_perm:
            raise ValueError(
                "The product of b and r must be less than num_perm")

    # Table i is paired with the half-open index range [i * r, (i + 1) * r).
    self.hashranges = [(index * self.r, (index + 1) * self.r)
                       for index in range(self.b)]

    self.hashtables = None
    self.keys = None
    self._lock = asyncio.Lock()
    self._initialized = False
def __init__(self, config, name=None):
    """Store the configuration and prepare an unconnected storage object.

    No connection is made here: ``self._redis`` and ``self._buffer`` start
    as ``None`` and ``self._initialized`` as ``False``.

    Args:
        config: Mapping whose ``'redis'`` entry is run through
            ``self._parse_config``.
        name: Storage name; ``None`` is replaced by ``_random_name(11)``.
    """
    self.config = config
    self._batch_size = 50000
    self._redis_param = self._parse_config(self.config['redis'])
    self._name = _random_name(11) if name is None else name
    self._lock = asyncio.Lock()
    self._initialized = False
    # Connection-dependent members stay empty until they are set elsewhere.
    self._redis = None
    self._buffer = None
def __init__(self, config, name=None):
    """Create the Mongo client, collection handle, index and write buffer.

    Args:
        config: Mapping whose ``'type'`` must equal ``'aiomongo'`` and
            whose ``'mongo'`` entry is run through ``self._parse_config``.
            Keys read from the parsed settings: ``collection_name``,
            ``collection_prefix``, ``db``, ``url``, ``replica_set``,
            ``replica_set_nodes``, ``username``, ``password``, ``host``,
            ``port`` and ``args``.
        name: Optional storage name; a falsy value is replaced by
            ``_random_name(11)`` decoded as UTF-8.

    Raises:
        AssertionError: If ``config['type']`` is not ``'aiomongo'``.
    """
    storage_type = config['type']
    assert storage_type == 'aiomongo', \
        'Storage type <{}> not supported'.format(config['type'])

    self._config = config
    self._mongo_param = self._parse_config(self._config['mongo'])
    self._name = name or _random_name(11).decode('utf-8')

    # NOTE(review): ``mongo_param`` (no underscore) is presumably a property
    # defined elsewhere on the class that exposes ``_mongo_param`` - confirm.
    settings = self.mongo_param

    # Collection name: an explicit name wins, then prefix + storage name,
    # then the 'lsh_' default prefix.
    if 'collection_name' in settings:
        collection_name = settings['collection_name']
    elif 'collection_prefix' in settings:
        collection_name = settings['collection_prefix'] + self._name
    else:
        collection_name = 'lsh_' + self._name
    self._collection_name = collection_name

    database_name = settings['db'] if 'db' in settings else 'db_0'

    # Connection string: a ready-made URL wins over every template.
    if 'url' in settings:
        dsn = settings['url']
    else:
        if 'replica_set' in settings:
            template = ('mongodb://{replica_set_nodes}/'
                        '?replicaSet={replica_set}')
        elif 'username' in settings or 'password' in settings:
            template = 'mongodb://{username}:{password}@{host}:{port}'
        else:
            template = 'mongodb://{host}:{port}'
        dsn = template.format(**settings)

    client_kwargs = settings.get('args', {})
    self._batch_size = 1000
    self._mongo_client = motor.motor_asyncio.AsyncIOMotorClient(
        dsn, **client_kwargs)
    database = self._mongo_client.get_default_database(database_name)
    self._collection = database.get_collection(self._collection_name)
    # NOTE(review): the result of create_index is discarded; with Motor this
    # is presumably an awaitable - confirm the index build is actually
    # scheduled and that failures are surfaced somewhere.
    self._collection.create_index("key", background=True)
    self._initialized = True
    self._buffer = AsyncMongoBuffer(self._collection, self._batch_size)
def __init__(self, threshold=0.9, num_perm=128, num_part=16, m=8,
             weights=(0.5, 0.5), storage_config=None, prepickle=None):
    """Validate the settings and build one group of LSH indexes per partition.

    Args:
        threshold: Value in [0.0, 1.0]; stored as ``self.threshold``.
        num_perm: Number of permutation functions (minimum 2); stored as
            ``self.h``.
        num_part: Number of partitions, at least 1.
        m: Value in [2, num_perm]; stored as ``self.m``.
        weights: Sequence of values in [0.0, 1.0] summing to 1.0; passed
            to ``self._init_optimal_params``.
        storage_config: Mapping with an optional ``'basename'``; a falsy
            value is replaced by ``{'type': 'dict'}``.
        prepickle: Forwarded unchanged to every ``MinHashLSH``.

    Raises:
        ValueError: When one of the checks fails.
    """
    if threshold < 0.0 or threshold > 1.0:
        raise ValueError("threshold must be in [0.0, 1.0]")
    if num_perm < 2:
        raise ValueError("Too few permutation functions")
    if num_part < 1:
        raise ValueError("num_part must be at least 1")
    if m < 2 or m > num_perm:
        raise ValueError("m must be in the range of [2, num_perm]")
    for weight in weights:
        if weight < 0.0 or weight > 1.0:
            raise ValueError("Weight must be in [0.0, 1.0]")
    if sum(weights) != 1.0:
        raise ValueError("Weights must sum to 1.0")

    self.threshold = threshold
    self.h = num_perm
    self.m = m
    rs = self._init_optimal_params(weights)

    if not storage_config:
        storage_config = {'type': 'dict'}
    # The fallback name is generated unconditionally, exactly as before.
    generated_name = _random_name(11)
    basename = storage_config.get('basename', generated_name)

    # Each partition gets a dict that maps every r in ``rs`` to its own
    # MinHashLSH configured with params (int(h / r), r).
    partitions = []
    for partition in range(num_part):
        by_r = {}
        for r in rs:
            lsh_params = (int(self.h / r), r)
            lsh_storage = self._get_storage_config(
                basename, storage_config, partition, r)
            by_r[r] = MinHashLSH(num_perm=self.h,
                                 params=lsh_params,
                                 storage_config=lsh_storage,
                                 prepickle=prepickle)
        partitions.append(by_r)
    self.indexes = partitions

    # One placeholder per partition in each list.
    self.lowers = [None] * len(self.indexes)
    self.uppers = [None] * len(self.indexes)