def fit(self, X):
    """Register every row of X with a new nmslib index, then load or build it.

    Each row of X is added under its row number.  When the file named by
    ``self._index_name`` exists, the index is loaded from it; otherwise the
    index is built from ``self._index_param`` and, if ``self._save_index``
    is set, written to ``self._index_name``.  Query-time parameters from
    ``self._query_param`` are applied in both cases.

    Args:
        X: dataset exposing ``shape[0]`` and yielding rows that provide
            ``tolist()`` when iterated.
    """
    if self._method_name == 'vptree':
        # To avoid this issue:
        # terminate called after throwing an instance of 'std::runtime_error'
        # what():  The data size is too small or the bucket size is too big.
        # Select the parameters so that <total # of records> is NOT less
        # than <bucket size> * 1000
        # Aborted (core dumped)
        bucket_size = min(int(X.shape[0] * 0.0005), 1000)
        self._index_param.append('bucketSize=%d' % bucket_size)

    self._index = nmslib.init(self._nmslib_metric, [], self._method_name,
                              nmslib.DataType.DENSE_VECTOR,
                              nmslib.DistType.FLOAT)

    for point_id, row in enumerate(X):
        nmslib.addDataPoint(self._index, point_id, row.tolist())

    if os.path.exists(self._index_name):
        # Single-argument print() parses and prints identically on
        # Python 2 and Python 3 (the bare print statement is Python 2 only).
        print("Loading index from file")
        nmslib.loadIndex(self._index, self._index_name)
    else:
        nmslib.createIndex(self._index, self._index_param)
        if self._save_index:
            nmslib.saveIndex(self._index, self._index_name)

    nmslib.setQueryTimeParams(self._index, self._query_param)
def create(self):
    """Build the index the first time this is called.

    Returns:
        True when this call built the index and applied the query-time
        parameters; False when ``self.created`` was already set, in which
        case nothing is done.
    """
    if not self.created:
        nmslib.createIndex(self.index, self.conf.index_param)
        nmslib.setQueryTimeParams(self.index, self.conf.query_time_param)
        self.created = True
        return True
    return False
def test_string_fresh(batch=True):
    """Build, query, save and free a Levenshtein index over short strings.

    Args:
        batch: If true, the data is added with one addDataPointBatch call
            and one knnQueryBatch call is issued as well; otherwise the
            strings are added one at a time.

    The index is saved to '<method_name>.index' before it is freed.
    All output goes through single-argument print() calls, which parse and
    print the same text on Python 2 and Python 3.
    """
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb",
                 "d", "c", "bdaf", "ddcd",
                 "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.INT)

    if batch:
        print('DATA_STRS %s' % (DATA_STRS,))
        nmslib.addDataPointBatch(
            index, np.arange(len(DATA_STRS), dtype=np.int32), DATA_STRS)
    else:
        for point_id, text in enumerate(DATA_STRS):
            nmslib.addDataPoint(index, point_id, text)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for pos in range(min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, pos))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)
    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 2
    if batch:
        num_threads = 10
        # The batch call is only exercised; its result is not inspected,
        # the per-query loop below prints the neighbours.
        nmslib.knnQueryBatch(index, num_threads, k, QUERY_STRS)

    for idx, query in enumerate(QUERY_STRS):
        neighbours = nmslib.knnQuery(index, k, query)
        print('%s %s %s %s' % (idx, query, neighbours,
                               [DATA_STRS[i] for i in neighbours]))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def fit(self, X):
    """Create an nmslib index for X and build it.

    X is handed to the index in a single addDataPointBatch call and the
    index is then built with ``self._method_param``.

    Args:
        X: dataset exposing ``shape[0]``; passed unchanged to
            addDataPointBatch.
    """
    using_vptree = self._method_name == 'vptree'
    if using_vptree:
        # To avoid this issue:
        # terminate called after throwing an instance of 'std::runtime_error'
        # what():  The data size is too small or the bucket size is too big.
        # Select the parameters so that <total # of records> is NOT less
        # than <bucket size> * 1000
        # Aborted (core dumped)
        bucket_size = min(int(X.shape[0] * 0.0005), 1000)
        self._method_param.append('bucketSize=%d' % bucket_size)

    self._index = nmslib.init(space=self._nmslib_metric,
                              method=self._method_name)
    self._index.addDataPointBatch(X)
    nmslib.createIndex(self._index, self._method_param)
def test_string_fresh(batch=True):
    """Run the string (Levenshtein) index round trip on a fresh index.

    Adds sixteen short strings, builds a 'small_world_rand' index, prints
    the two nearest neighbours of each query string, saves the index to
    '<method_name>.index' and frees it.

    Args:
        batch: If true, use addDataPointBatch for loading and also issue a
            knnQueryBatch call; otherwise add points individually.

    Messages are printed with single-argument print() calls so the function
    parses and produces the same text on Python 2 and Python 3.
    """
    DATA_STRS = [
        "xyz", "beagcfa", "cea", "cb", "d", "c", "bdaf", "ddcd", "egbfa", "a",
        "fba", "bcccfe", "ab", "bfgbfdc", "bcbbgf", "bfbb"
    ]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(space_type, space_param, method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.INT)

    if batch:
        print('DATA_STRS %s' % (DATA_STRS,))
        ids = np.arange(len(DATA_STRS), dtype=np.int32)
        nmslib.addDataPointBatch(index, ids, DATA_STRS)
    else:
        for point_id, text in enumerate(DATA_STRS):
            nmslib.addDataPoint(index, point_id, text)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for pos in range(min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, pos))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)
    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 2
    if batch:
        num_threads = 10
        # Exercised for its own sake; the printed results come from the
        # individual queries below.
        nmslib.knnQueryBatch(index, num_threads, k, QUERY_STRS)

    for idx, query in enumerate(QUERY_STRS):
        found = nmslib.knnQuery(index, k, query)
        matches = [DATA_STRS[i] for i in found]
        print('%s %s %s %s' % (idx, query, found, matches))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def fit(self, X):
    """Build an 'l2' dense-vector nmslib index over the rows of X.

    Each row is added under its row number, then the index is built with
    ``self._method_param``.

    Args:
        X: dataset exposing ``shape[0]`` and yielding rows that provide
            ``tolist()`` when iterated.
    """
    if self._method_name == 'vptree':
        # Bucket size is derived from the number of rows, capped at 1000.
        bucket_size = min(int(X.shape[0] * 0.0005), 1000)
        self._method_param.append('bucketSize=%d' % bucket_size)

    self._index = nmslib.init('l2', [], self._method_name,
                              nmslib.DataType.DENSE_VECTOR,
                              nmslib.DistType.FLOAT)

    for point_id, row in enumerate(X):
        nmslib.addDataPoint(self._index, point_id, row.tolist())

    nmslib.createIndex(self._index, self._method_param)
def test_object_as_string_fresh(batch=True):
    """Build, query, save and free a cosine index fed with string objects.

    Any existing '<method_name>.index' file is removed first.  Data comes
    from 'sample_dataset.txt' and queries from 'sample_queryset.txt', both
    read through read_data_as_string.

    Args:
        batch: If true, add all data with one addDataPointBatch call;
            otherwise add the points one at a time.

    Messages are printed with single-argument print() calls so the function
    parses and produces the same text on Python 2 and Python 3.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.FLOAT)

    if batch:
        data = list(read_data_as_string('sample_dataset.txt'))
        nmslib.addDataPointBatch(
            index, np.arange(len(data), dtype=np.int32), data)
    else:
        for point_id, entry in enumerate(
                read_data_as_string('sample_dataset.txt')):
            nmslib.addDataPoint(index, point_id, entry)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for pos in range(min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, pos))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3

    for idx, query in enumerate(read_data_as_string('sample_queryset.txt')):
        print('%s %s' % (idx, nmslib.knnQuery(index, k, query)))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_object_as_string_fresh(batch=True):
    """Round-trip a fresh cosine index whose objects are passed as strings.

    Removes a stale '<method_name>.index' file, loads 'sample_dataset.txt'
    through read_data_as_string (in one batch when ``batch`` is true,
    point by point otherwise), builds the index, prints the three nearest
    neighbours for every entry of 'sample_queryset.txt', then saves and
    frees the index.
    """
    method = 'small_world_rand'
    index_file = method + '.index'
    if os.path.isfile(index_file):
        os.remove(index_file)

    index = nmslib.init('cosinesimil', [], method,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.FLOAT)

    if not batch:
        for point_id, entry in enumerate(
                read_data_as_string('sample_dataset.txt')):
            nmslib.addDataPoint(index, point_id, entry)
    else:
        entries = list(read_data_as_string('sample_dataset.txt'))
        point_ids = np.arange(len(entries), dtype=np.int32)
        nmslib.addDataPointBatch(index, point_ids, entries)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    shown = min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))
    for pos in range(shown):
        print(nmslib.getDataPoint(index, pos))

    print('Let\'s invoke the index-build process')

    build_params = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    search_params = ['efSearch=50']

    nmslib.createIndex(index, build_params)
    print('The index is created')

    nmslib.setQueryTimeParams(index, search_params)
    print('Query time parameters are set')

    print("Results for the freshly created index:")

    neighbour_qty = 3
    queries = read_data_as_string('sample_queryset.txt')
    for query_no, query in enumerate(queries):
        print(query_no, nmslib.knnQuery(index, neighbour_qty, query))

    nmslib.saveIndex(index, index_file)
    print("The index %s is saved" % index_file)

    nmslib.freeIndex(index)
def fit(self, X):
    """Build a dense-vector nmslib index over the rows of X.

    The space is ``self._nmslib_metric`` and the method is
    ``self._method_name``.  Rows are added under their row numbers and the
    index is built with ``self._method_param``.

    Args:
        X: dataset exposing ``shape[0]`` and yielding rows that provide
            ``tolist()`` when iterated.
    """
    if self._method_name == 'vptree':
        # To avoid this issue:
        # terminate called after throwing an instance of 'std::runtime_error'
        # what():  The data size is too small or the bucket size is too big.
        # Select the parameters so that <total # of records> is NOT less
        # than <bucket size> * 1000
        # Aborted (core dumped)
        row_qty = X.shape[0]
        bucket_param = 'bucketSize=%d' % min(int(row_qty * 0.0005), 1000)
        self._method_param.append(bucket_param)

    self._index = nmslib.init(self._nmslib_metric, [], self._method_name,
                              nmslib.DataType.DENSE_VECTOR,
                              nmslib.DistType.FLOAT)

    for point_id, row in enumerate(X):
        nmslib.addDataPoint(self._index, point_id, row.tolist())

    nmslib.createIndex(self._index, self._method_param)
def test_sparse_vector_fresh():
    """Round-trip a fresh sparse-vector index ('cosinesimil_sparse_fast').

    Removes a stale '<method_name>_sparse.index' file, adds every entry of
    'sample_sparse_dataset.txt' (via read_sparse_data), builds the index,
    prints the three nearest neighbours for every entry of
    'sample_sparse_queryset.txt', then saves and frees the index.
    """
    method = 'small_world_rand'
    index_file = method + '_sparse.index'
    if os.path.isfile(index_file):
        os.remove(index_file)

    index = nmslib.init('cosinesimil_sparse_fast', [], method,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)

    entries = read_sparse_data('sample_sparse_dataset.txt')
    for point_id, entry in enumerate(entries):
        nmslib.addDataPoint(index, point_id, entry)

    print('We have added %d data points' % nmslib.getDataPointQty(index))

    shown = min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))
    for pos in range(shown):
        print(nmslib.getDataPoint(index, pos))

    print('Let\'s invoke the index-build process')

    build_params = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    search_params = ['efSearch=50']

    nmslib.createIndex(index, build_params)
    print('The index is created')

    nmslib.setQueryTimeParams(index, search_params)
    print('Query time parameters are set')

    print("Results for the freshly created index:")

    neighbour_qty = 3
    queries = read_sparse_data('sample_sparse_queryset.txt')
    for query_no, query in enumerate(queries):
        print(query_no, nmslib.knnQuery(index, neighbour_qty, query))

    nmslib.saveIndex(index, index_file)
    print("The index %s is saved" % index_file)

    nmslib.freeIndex(index)
def test_sparse_vector_fresh():
    """Build, query, save and free a fresh 'cosinesimil_sparse' index.

    Any existing '<method_name>_sparse.index' file is removed first.  Data
    comes from 'sample_sparse_dataset.txt' and queries from
    'sample_sparse_queryset.txt', both read through read_sparse_data.

    Messages are printed with single-argument print() calls so the function
    parses and produces the same text on Python 2 and Python 3.
    """
    space_type = 'cosinesimil_sparse'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)

    for point_id, entry in enumerate(
            read_sparse_data('sample_sparse_dataset.txt')):
        nmslib.addDataPoint(index, point_id, entry)

    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for pos in range(min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, pos))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3

    for idx, query in enumerate(
            read_sparse_data('sample_sparse_queryset.txt')):
        print('%s %s' % (idx, nmslib.knnQuery(index, k, query)))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def fit(self, X):
    """Load the "hnsw" index from disk, or build and save it from X.

    When the file named by ``self._index_name`` exists, the index is
    loaded from it and X is not read.  Otherwise every row of X is added
    under its row number, the index is built with ``self._index_param``
    and saved to ``self._index_name``.  Query-time parameters from
    ``self._query_param`` are applied in both cases.
    """
    self._index = nmslib.init(self._metric, [], "hnsw",
                              nmslib.DataType.DENSE_VECTOR,
                              nmslib.DistType.FLOAT)

    index_on_disk = os.path.exists(self._index_name)
    if not index_on_disk:
        logging.debug("Create Index")
        for point_id, row in enumerate(X):
            self._index.addDataPoint(point_id, row)
        nmslib.createIndex(self._index, self._index_param)
        nmslib.saveIndex(self._index, self._index_name)
    else:
        logging.debug("Loading index from file")
        nmslib.loadIndex(self._index, self._index_name)

    nmslib.setQueryTimeParams(self._index, self._query_param)
def fit(self, X):
    """Register the rows of X with a new nmslib index, then load or build it.

    nmslib is imported inside the function.  Each row of X is added under
    its row number.  When the file named by ``self._index_name`` exists the
    index is loaded from it; otherwise it is built with
    ``self._index_param`` and, if ``self._save_index`` is set, saved to
    ``self._index_name``.  Query-time parameters are applied in both cases.
    """
    import nmslib

    self._index = nmslib.init(self._nmslib_metric, [], self._method_name,
                              nmslib.DataType.DENSE_VECTOR,
                              nmslib.DistType.FLOAT)

    for point_id, row in enumerate(X):
        nmslib.addDataPoint(self._index, point_id, row.tolist())

    index_on_disk = os.path.exists(self._index_name)
    if not index_on_disk:
        logging.debug("Create Index")
        nmslib.createIndex(self._index, self._index_param)
        if self._save_index:
            nmslib.saveIndex(self._index, self._index_name)
    else:
        logging.debug("Loading index from file")
        nmslib.loadIndex(self._index, self._index_name)

    nmslib.setQueryTimeParams(self._index, self._query_param)
def bench_sparse_vector(batch=True):
    """Time sparse cosine k-NN search with snn.MultiClusterIndex and nmslib.

    A random binary dataset (40000 x 20000) and query set (1000 x 20000)
    are generated.  The nearest neighbours of the first query are computed
    by brute force for reference, then the same search is timed with
    snn.MultiClusterIndex and with an nmslib 'small_world_rand' index over
    the 'cosinesimil_sparse_fast' space.  The nmslib index is saved to
    '<method_name>_sparse.index' and freed.

    Args:
        batch: If true, add data and run queries through the nmslib batch
            calls using the CSR matrices; otherwise convert each row to a
            list of [column, value] pairs and add/query them one by one.

    Messages are printed with single-argument print() calls so the function
    parses and produces the same text on Python 2 and Python 3.
    """
    dim = 20000
    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    print('dataset[0]: %s'
          % ([[i, v] for i, v in enumerate(dataset[0]) if v > 0],))

    k = 3

    # Brute-force reference: cosine distance from the first query to every
    # data row, sorted ascending.
    q0 = queryset[0]
    res = [[i, distance.cosine(q0, row)] for i, row in enumerate(dataset)]
    res.sort(key=lambda pair: pair[1])
    print('q0 res %s' % (res[:k],))

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    # An explicit list, so the argument is a list on Python 3 as well.
    data_to_return = list(range(dataset.shape[0]))

    with TimeIt('building MultiClusterIndex'):
        cp = snn.MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print(res[:5])
    for i in res[0]:
        print('%s %s' % (int(i), distance.cosine(q0, dataset[int(i), :])))

    # The alternative space is 'cosinesimil_sparse'.
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)

    if batch:
        with TimeIt('batch add'):
            positions = nmslib.addDataPointBatch(
                index, np.arange(len(dataset), dtype=np.int32), data_matrix)
        print('positions %s' % (positions,))
    else:
        with TimeIt('preparing'):
            d = [[[i, v] for i, v in enumerate(row) if v > 0]
                 for row in dataset]
            q = [[[i, v] for i, v in enumerate(row) if v > 0]
                 for row in queryset]
        with TimeIt('adding points'):
            for point_id, entry in enumerate(d):
                nmslib.addDataPoint(index, point_id, entry)

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    with TimeIt('building index'):
        nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    with TimeIt('knn query'):
        if batch:
            num_threads = 10
            res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
            for idx, v in enumerate(res):
                if idx < 5:
                    print('%s %s' % (idx, v))
                if idx == 0:
                    # Distances for the first query, for comparison with
                    # the brute-force reference printed earlier.
                    for i in v:
                        print('q0 %s %s'
                              % (i, distance.cosine(q0, dataset[i, :])))
        else:
            for idx, query in enumerate(q):
                res = nmslib.knnQuery(index, k, query)
                if idx < 5:
                    print('%s %s' % (idx, res))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_vector_fresh(fast=True):
    """Round-trip a fresh dense-vector cosine index.

    Removes a stale '<method_name>.index' file, loads 'sample_dataset.txt'
    (in one batch via read_data_fast when ``fast`` is true, otherwise
    point by point via read_data, exiting with status 1 if a returned
    position differs from the id), prints a few distances and entries,
    builds the index, runs the queries from 'sample_queryset.txt' with
    k = 3, then saves and frees the index.  Load and query times are
    printed.
    """
    method = 'small_world_rand'
    index_file = method + '.index'
    if os.path.isfile(index_file):
        os.remove(index_file)
    index = nmslib.init('cosinesimil', [], method,
                        nmslib.DataType.DENSE_VECTOR, nmslib.DistType.FLOAT)

    load_started = time.time()
    if not fast:
        for point_id, vector in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, point_id, vector)
            if point_id != pos:
                print('id %s != pos %s' % (point_id, pos))
                sys.exit(1)
    else:
        matrix = read_data_fast('sample_dataset.txt')
        print('data.shape', matrix.shape)
        nmslib.addDataPointBatch(
            index, np.arange(len(matrix), dtype=np.int32), matrix)
    load_finished = time.time()
    print('added data in %s secs' % (load_finished - load_started))

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    labelled_pairs = (
        ("Distance between points (0,0) ", 0, 0),
        ("Distance between points (1,1) ", 1, 1),
        ("Distance between points (0,1) ", 0, 1),
        ("Distance between points (1,0) ", 1, 0),
    )
    for label, first, second in labelled_pairs:
        print(label + str(nmslib.getDistance(index, first, second)))

    shown = min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))
    for pos in range(shown):
        print(nmslib.getDataPoint(index, pos))

    print('Let\'s invoke the index-build process')

    build_params = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    search_params = ['initSearchAttempts=3']

    nmslib.createIndex(index, build_params)
    print('The index is created')

    nmslib.setQueryTimeParams(index, search_params)
    print('Query time parameters are set')

    print("Results for the freshly created index:")

    neighbour_qty = 3

    query_started = time.time()
    if not fast:
        for query_no, query in enumerate(read_data('sample_queryset.txt')):
            print(query_no, nmslib.knnQuery(index, neighbour_qty, query))
    else:
        thread_qty = 10
        queries = read_data_fast('sample_queryset.txt')
        results = nmslib.knnQueryBatch(
            index, thread_qty, neighbour_qty, queries)
        for query_no, neighbours in enumerate(results):
            print(query_no, neighbours)
    query_finished = time.time()
    print('querying done in %s secs' % (query_finished - query_started))

    nmslib.saveIndex(index, index_file)
    print("The index %s is saved" % index_file)

    nmslib.freeIndex(index)
def test_vector_fresh(fast=True):
    """Build, query, save and free a fresh dense-vector cosine index.

    Any existing '<method_name>.index' file is removed first.  Data comes
    from 'sample_dataset.txt' and queries from 'sample_queryset.txt'.

    Args:
        fast: If true, read both files with read_data_fast and use the
            nmslib batch calls; otherwise read them with read_data and
            add/query one vector at a time.  In the slow path the process
            exits with status 1 if addDataPoint returns a position that
            differs from the id.

    Messages are printed with single-argument print() calls so the function
    parses and produces the same text on Python 2 and Python 3.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.DENSE_VECTOR,
                        nmslib.DistType.FLOAT)

    start = time.time()
    if fast:
        data = read_data_fast('sample_dataset.txt')
        # data.shape is a tuple, so it is wrapped to format as one value.
        print('data.shape %s' % (data.shape,))
        nmslib.addDataPointBatch(
            index, np.arange(len(data), dtype=np.int32), data)
    else:
        for point_id, vector in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, point_id, vector)
            if point_id != pos:
                print('id %s != pos %s' % (point_id, pos))
                sys.exit(1)
    end = time.time()
    print('added data in %s secs' % (end - start))

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for pos in range(min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, pos))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3

    start = time.time()
    if fast:
        num_threads = 10
        query = read_data_fast('sample_queryset.txt')
        res = nmslib.knnQueryBatch(index, num_threads, k, query)
        for idx, v in enumerate(res):
            print('%s %s' % (idx, v))
    else:
        for idx, vector in enumerate(read_data('sample_queryset.txt')):
            print('%s %s' % (idx, nmslib.knnQuery(index, k, vector)))
    end = time.time()
    print('querying done in %s secs' % (end - start))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def bench_sparse_vector(batch=True):
    """Time sparse cosine k-NN search with MultiClusterIndex and nmslib.

    Generates a random binary dataset (40000 x 20000) and query set
    (1000 x 20000), prints a brute-force reference for the first query,
    times pysparnn's MultiClusterIndex, then times an nmslib
    'small_world_rand' index over 'cosinesimil_sparse_fast'.  With
    ``batch`` true the nmslib batch calls are fed the CSR matrices;
    otherwise rows are converted to [column, value] pairs and handled one
    at a time.  The nmslib index is saved to '<method_name>_sparse.index'
    and freed.
    """
    # delay importing these so CI can import module
    from scipy.sparse import csr_matrix
    from scipy.spatial import distance
    from pysparnn.cluster_index import MultiClusterIndex

    dim = 20000
    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    first_row_pairs = [[i, v] for i, v in enumerate(dataset[0]) if v > 0]
    print('dataset[0]:', first_row_pairs)

    k = 3

    # Brute-force reference for the first query.
    q0 = queryset[0]
    res = [[row_no, distance.cosine(q0, dataset[row_no, :])]
           for row_no in range(dataset.shape[0])]
    res.sort(key=lambda pair: pair[1])
    print('q0 res', res[:k])

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    data_to_return = range(dataset.shape[0])

    with TimeIt('building MultiClusterIndex'):
        cp = MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print(res[:5])
    for found in res[0]:
        print(int(found), distance.cosine(q0, dataset[int(found), :]))

    # The alternative space is 'cosinesimil_sparse'.
    method = 'small_world_rand'
    index_file = method + '_sparse.index'
    if os.path.isfile(index_file):
        os.remove(index_file)
    index = nmslib.init('cosinesimil_sparse_fast', [], method,
                        nmslib.DataType.SPARSE_VECTOR, nmslib.DistType.FLOAT)

    if not batch:
        d = []
        q = []
        with TimeIt('preparing'):
            for row in dataset:
                d.append([[i, v] for i, v in enumerate(row) if v > 0])
            for row in queryset:
                q.append([[i, v] for i, v in enumerate(row) if v > 0])
        with TimeIt('adding points'):
            for point_id, entry in enumerate(d):
                nmslib.addDataPoint(index, point_id, entry)
    else:
        with TimeIt('batch add'):
            point_ids = np.arange(len(dataset), dtype=np.int32)
            positions = nmslib.addDataPointBatch(index, point_ids, data_matrix)
        print('positions', positions)

    print('Let\'s invoke the index-build process')

    build_params = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    search_params = ['efSearch=50']

    with TimeIt('building index'):
        nmslib.createIndex(index, build_params)

    print('The index is created')

    nmslib.setQueryTimeParams(index, search_params)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    with TimeIt('knn query'):
        if not batch:
            for query_no, query in enumerate(q):
                res = nmslib.knnQuery(index, k, query)
                if query_no < 5:
                    print(query_no, res)
        else:
            thread_qty = 10
            res = nmslib.knnQueryBatch(index, thread_qty, k, query_matrix)
            for query_no, neighbours in enumerate(res):
                if query_no < 5:
                    print(query_no, neighbours)
                if query_no == 0:
                    for found in neighbours:
                        print('q0', found,
                              distance.cosine(q0, dataset[found, :]))

    nmslib.saveIndex(index, index_file)

    print("The index %s is saved" % index_file)

    nmslib.freeIndex(index)