def test_vector_loaded():
    """Load a saved index over the sample vectors and print 2-NN results.

    The data points from 'sample_dataset.txt' are added to a freshly
    initialised 'cosinesimil' / 'small_world_rand' index, the index
    structure is then read from 'small_world_rand.index', and every query
    from 'sample_queryset.txt' is answered with k = 2.

    The index handle is always released with ``freeIndex``, even when
    loading or querying raises.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib_vector.init(
        space_type,
        space_param,
        method_name,
        nmslib_vector.DataType.VECTOR,
        nmslib_vector.DistType.FLOAT)

    # Everything after init() is guarded so that a failure (missing index
    # file, unreadable data set, ...) cannot skip the explicit freeIndex().
    try:
        for point_id, data in enumerate(read_data('sample_dataset.txt')):
            nmslib_vector.addDataPoint(index, point_id, data)

        query_time_param = ['initSearchAttempts=3']

        nmslib_vector.loadIndex(index, index_name)
        # Single-argument, parenthesised print produces the same output
        # under the Python 2 print statement and the Python 3 function.
        print("The index %s is loaded" % index_name)

        nmslib_vector.setQueryTimeParams(index, query_time_param)
        print('Query time parameters are set')

        print("Results for the loaded index")
        k = 2
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print("%s %s" % (idx, nmslib_vector.knnQuery(index, k, data)))
    finally:
        nmslib_vector.freeIndex(index)
def test_string_fresh():
    """Build a fresh string index, print 2-NN results and save the index.

    Sixteen short strings are indexed in the 'leven' space with the
    'small_world_rand' method, three query strings are searched with
    k = 2, and the index is written to 'small_world_rand.index'.

    The index handle is always released with ``freeIndex``, even when
    building, querying or saving raises.
    """
    DATA_STRS = [
        "xyz", "beagcfa", "cea", "cb",
        "d", "c", "bdaf", "ddcd",
        "egbfa", "a", "fba", "bcccfe",
        "ab", "bfgbfdc", "bcbbgf", "bfbb",
    ]
    QUERY_STRS = ["abc", "def", "ghik"]

    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib_vector.init(
        space_type,
        space_param,
        method_name,
        nmslib_vector.DataType.STRING,
        nmslib_vector.DistType.INT)

    # Guard everything after init() so the handle is freed on any failure.
    try:
        for point_id, data in enumerate(DATA_STRS):
            nmslib_vector.addDataPoint(index, point_id, data)

        index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
        query_time_param = ['initSearchAttempts=3']

        nmslib_vector.createIndex(index, index_param)

        nmslib_vector.setQueryTimeParams(index, query_time_param)
        # Single-argument, parenthesised print produces the same output
        # under the Python 2 print statement and the Python 3 function.
        print('Query time parameters are set')

        print("Results for the freshly created index:")
        k = 2
        for idx, data in enumerate(QUERY_STRS):
            print("%s %s" % (idx, nmslib_vector.knnQuery(index, k, data)))

        nmslib_vector.saveIndex(index, index_name)
        print("The index %s is saved" % index_name)
    finally:
        nmslib_vector.freeIndex(index)
def test_vector_loaded():
    """Query a previously saved vector index and print the neighbours.

    Steps, in order:
      1. initialise a 'cosinesimil' / 'small_world_rand' float-vector index;
      2. add every record returned by read_data('sample_dataset.txt');
      3. load the index structure from 'small_world_rand.index';
      4. set the query-time parameters and print the 2 nearest
         neighbours of each record in 'sample_queryset.txt'.

    ``freeIndex`` runs in a ``finally`` clause so the handle is released
    whether or not any of the steps above raises.
    """
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    space_type = 'cosinesimil'
    space_param = []
    query_time_param = ['initSearchAttempts=3']
    k = 2

    index = nmslib_vector.init(
        space_type,
        space_param,
        method_name,
        nmslib_vector.DataType.VECTOR,
        nmslib_vector.DistType.FLOAT)
    try:
        for record_id, record in enumerate(read_data('sample_dataset.txt')):
            nmslib_vector.addDataPoint(index, record_id, record)

        nmslib_vector.loadIndex(index, index_name)
        # print(...) with one pre-formatted string behaves identically as a
        # Python 2 statement and as the Python 3 function.
        print("The index %s is loaded" % index_name)

        nmslib_vector.setQueryTimeParams(index, query_time_param)
        print('Query time parameters are set')

        print("Results for the loaded index")
        for query_no, query in enumerate(read_data('sample_queryset.txt')):
            neighbours = nmslib_vector.knnQuery(index, k, query)
            print("%s %s" % (query_no, neighbours))
    finally:
        # Release the handle even if loading or querying failed.
        nmslib_vector.freeIndex(index)
def fit(self, X):
    """Populate ``self._index`` with the rows of X and build or load it.

    X is iterated row by row; each row must offer ``tolist()`` and X must
    offer ``shape[0]`` (assumes a 2-D numpy array -- TODO confirm with
    callers).

    If a file already exists at ``self._index_name`` the index is loaded
    from it, otherwise it is created with ``self._index_param``. The
    created index is not saved here, so the load branch only triggers
    when that file was produced elsewhere.

    OMP_THREAD_LIMIT is set to '40' while this method runs and to '1'
    afterwards, including when the build fails.
    """
    os.environ['OMP_THREAD_LIMIT'] = '40'
    try:
        import nmslib_vector

        if self._method_name == 'vptree':
            # Without an explicit bucket size the library aborts with
            #   std::runtime_error: "The data size is too small or the
            #   bucket size is too big. Select the parameters so that the
            #   total number of records is NOT less than bucket size * 1000"
            # The lower bound of 1 keeps data sets with fewer than 2000
            # rows from producing 'bucketSize=0'.
            bucket_size = max(1, min(int(X.shape[0] * 0.0005), 1000))
            bucket_param = 'bucketSize=%d' % bucket_size
            # fit() may be called more than once on the same instance; do
            # not pile up identical entries in the shared parameter list.
            if bucket_param not in self._index_param:
                self._index_param.append(bucket_param)

        self._index = nmslib_vector.init(
            self._nmslib_metric,
            [],
            self._method_name,
            nmslib_vector.DataType.VECTOR,
            nmslib_vector.DistType.FLOAT)

        for i, x in enumerate(X):
            nmslib_vector.addDataPoint(self._index, i, x.tolist())

        if os.path.exists(self._index_name):
            print("Loading index from file")
            nmslib_vector.loadIndex(self._index, self._index_name)
        else:
            nmslib_vector.createIndex(self._index, self._index_param)

        nmslib_vector.setQueryTimeParams(self._index, self._query_param)
    finally:
        # Always leave the limit at '1', as the original did on success,
        # so a failed build does not leave the process at 40 threads.
        os.environ['OMP_THREAD_LIMIT'] = '1'
def fit(self, X):
    """Create ``self._index`` from the rows of X.

    X is iterated row by row; each row must offer ``tolist()`` and X must
    offer ``shape[0]`` (assumes a 2-D numpy array -- TODO confirm with
    callers). The index is built with ``self._method_param``.
    """
    import nmslib_vector

    if self._method_name == 'vptree':
        # Without an explicit bucket size the library aborts with
        #   std::runtime_error: "The data size is too small or the bucket
        #   size is too big. Select the parameters so that the total
        #   number of records is NOT less than bucket size * 1000"
        # The lower bound of 1 keeps data sets with fewer than 2000 rows
        # from producing 'bucketSize=0'.
        bucket_size = max(1, min(int(X.shape[0] * 0.0005), 1000))
        bucket_param = 'bucketSize=%d' % bucket_size
        # fit() may be called more than once on the same instance; do not
        # pile up identical entries in the shared parameter list.
        if bucket_param not in self._method_param:
            self._method_param.append(bucket_param)

    self._index = nmslib_vector.init(
        self._nmslib_metric,
        [],
        self._method_name,
        nmslib_vector.DataType.VECTOR,
        nmslib_vector.DistType.FLOAT)

    for i, x in enumerate(X):
        nmslib_vector.addDataPoint(self._index, i, x.tolist())

    nmslib_vector.createIndex(self._index, self._method_param)
def load_index(col_name, category, index_version, reindex=False):
    """Load the saved 'jsdivslow' index for one category of a collection.

    Args:
        col_name: name of the collection in ``db`` holding the items.
        category: value matched against each item's 'categories' field.
        index_version: suffix appended both to the index file name and to
            the 'nmslib_index' field name used to select items.
        reindex: when true, ``build_n_save`` is called before loading even
            if the index file already exists.

    Returns:
        A ``(index, nmslib_vector)`` tuple: the loaded index handle and
        the module it belongs to.
    """
    space_type = 'jsdivslow'
    space_param = []
    method_name = 'small_world_rand'
    index_name = 'indexes/' + col_name + '_' + category + index_version + '.index'

    if reindex or not isfile(index_name):
        build_n_save(col_name, category, index_version)

    index = nmslib_vector.init(
        space_type,
        space_param,
        method_name,
        nmslib_vector.DataType.VECTOR,
        nmslib_vector.DistType.FLOAT)

    nmslib_index = 'nmslib_index' + index_version
    all_items_in_category = db[col_name].find({
        'categories': category,
        nmslib_index: {'$exists': 1},
    })

    t1 = time()
    # NOTE(review): points are registered under their position in this
    # query result, not under the stored `nmslib_index` value -- confirm
    # this matches the ids used when the index file was built.
    for idx, item in enumerate(all_items_in_category):
        fp = item['fingerprint']
        if isinstance(fp, list):
            color = fp
        elif isinstance(fp, dict):
            color = fp['color']
        else:
            # Unsupported fingerprint layout: skip the item.
            print('else')
            continue
        nmslib_vector.addDataPoint(index, idx, color)
    t2 = time()
    print('addDataPoints took %s secs' % str(t2 - t1))

    query_time_param = ['initSearchAttempts=3']

    nmslib_vector.loadIndex(index, index_name)
    print("The index %s is loaded" % index_name)
    t3 = time()
    # The timed step is loadIndex; the label used to say createIndex.
    print('loadIndex took %s secs' % str(t3 - t2))

    nmslib_vector.setQueryTimeParams(index, query_time_param)
    return index, nmslib_vector
def create_index(col_name, category):
    """Build and save a 'jsdivslow' index for one category of a collection.

    Every item of ``db[col_name]`` whose 'categories' match ``category``
    is added to the index, and the id it was added under is written back
    to the item's 'nmslib_index' field. The finished index is saved to
    '<col_name>_<category>.index'.

    Args:
        col_name: name of the collection in ``db`` holding the items.
        category: value matched against each item's 'categories' field.

    Returns:
        A ``(index, nmslib_vector)`` tuple: the built index handle and
        the module it belongs to.
    """
    space_type = 'jsdivslow'
    space_param = []
    method_name = 'small_world_rand'
    index_name = col_name + '_' + category + '.index'

    index = nmslib_vector.init(
        space_type,
        space_param,
        method_name,
        nmslib_vector.DataType.VECTOR,
        nmslib_vector.DistType.FLOAT)

    all_items_in_category = db[col_name].find({'categories': category})

    t1 = time()
    for idx, item in enumerate(all_items_in_category):
        fp = item['fingerprint']
        if isinstance(fp, list):
            color = fp
        elif isinstance(fp, dict):
            color = fp['color']
        else:
            # Unsupported fingerprint layout: skip the item. Its
            # enumeration position is simply left unused as an id.
            print('else')
            continue
        nmslib_vector.addDataPoint(index, idx, color)
        # Record the id so the item can be mapped back from query results.
        db[col_name].update_one(
            {'_id': item['_id']},
            {'$set': {'nmslib_index': idx}})
    t2 = time()
    print('addDataPoints took %s secs' % str(t2 - t1))

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=32']
    query_time_param = ['initSearchAttempts=3']

    nmslib_vector.createIndex(index, index_param)
    t3 = time()
    print('createIndex took %s secs' % str(t3 - t2))

    nmslib_vector.setQueryTimeParams(index, query_time_param)
    nmslib_vector.saveIndex(index, index_name)
    return index, nmslib_vector
def test_string_fresh():
    """Index a handful of strings, print their 2-NN matches, save the index.

    Steps, in order:
      1. initialise a 'leven' / 'small_world_rand' string index with
         integer distances;
      2. add the sixteen strings of DATA_STRS;
      3. create the index and set the query-time parameters;
      4. print the 2 nearest neighbours of each string in QUERY_STRS;
      5. save the index to 'small_world_rand.index'.

    ``freeIndex`` runs in a ``finally`` clause so the handle is released
    whether or not any of the steps above raises.
    """
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb",
                 "d", "c", "bdaf", "ddcd",
                 "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]

    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    space_type = 'leven'
    space_param = []
    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']
    k = 2

    index = nmslib_vector.init(
        space_type,
        space_param,
        method_name,
        nmslib_vector.DataType.STRING,
        nmslib_vector.DistType.INT)
    try:
        for string_id, text in enumerate(DATA_STRS):
            nmslib_vector.addDataPoint(index, string_id, text)

        nmslib_vector.createIndex(index, index_param)

        nmslib_vector.setQueryTimeParams(index, query_time_param)
        # print(...) with one pre-formatted string behaves identically as a
        # Python 2 statement and as the Python 3 function.
        print('Query time parameters are set')

        print("Results for the freshly created index:")
        for query_no, query in enumerate(QUERY_STRS):
            neighbours = nmslib_vector.knnQuery(index, k, query)
            print("%s %s" % (query_no, neighbours))

        nmslib_vector.saveIndex(index, index_name)
        print("The index %s is saved" % index_name)
    finally:
        # Release the handle even if building, querying or saving failed.
        nmslib_vector.freeIndex(index)
def add_vector(self, index, _id, vector):
    """Add one vector to the index, checking its shape against earlier ones.

    ``index`` is a mutable two-item sequence: item 0 is the handle passed
    to nmslib_vector, item 1 is the shape every vector must have (None
    until the first vector is added, at which point it is recorded).

    Raises AssertionError when ``vector.shape`` differs from the recorded
    shape.
    """
    expected_shape = index[1]
    if expected_shape is None:
        # First vector: its shape becomes the reference for later calls.
        expected_shape = index[1] = vector.shape
    assert expected_shape == vector.shape

    handle = index[0]
    nmslib_vector.addDataPoint(handle, _id, list(vector))