def read_data(full_data_path):
    """Load the base and query vector sets from a data directory.

    Expects *full_data_path* to contain one file whose name includes
    "query" and one whose name includes "base", each in either .ivecs
    or .fvecs format.

    Parameters
    ----------
    full_data_path : str
        Directory holding the dataset files.

    Returns
    -------
    tuple
        ``(base_vectors, query_vectors)`` as produced by
        ``ivecs_read`` / ``fvecs_read``.

    Raises
    ------
    IndexError
        If no file matching "query" or "base" is found in the directory.
    """
    all_files = glob(full_data_path + "/*")
    query_file = [x for x in all_files if "query" in x]
    base_file = [x for x in all_files if "base" in x]

    # BUG FIX: the original tested `"ivecs" in query_file`, i.e. membership
    # of the exact string "ivecs" in the *list* of paths — always False for
    # the intended substring check — so .ivecs files were loaded with
    # fvecs_read. Test the selected filename itself instead.
    if "ivecs" in query_file[0]:
        query_vectors = ivecs_read(query_file[0])
    else:
        query_vectors = fvecs_read(query_file[0])
    if "ivecs" in base_file[0]:
        base_vectors = ivecs_read(base_file[0])
    else:
        base_vectors = fvecs_read(base_file[0])
    return base_vectors, query_vectors
#Ioannis Psarros # import time import utils as fr import numpy as np import bruteforce as bf from dolphinn import * num_of_probes = 20 ########################### M = 1 ########################## #READ FILES #D1: data dimension, P: dataset #D2: query dimension, Q: queryset (D1, P) = fr.fvecs_read("siftsmall/siftsmall_base.fvecs") (D2, Q) = fr.fvecs_read("siftsmall/siftsmall_query.fvecs") if D1 != D2: raise IOError("Data points and query points are of different dimension") D = D1 #CHANGE OF ORIGIN #find the mean of randomly sampled points m = fr.findmean(P, D, 10) #then consider this mean as the origin P = fr.isotropize(P, D, m) Q = fr.isotropize(Q, D, m) K = int(np.log2(len(P))) - 2 ########################## print "New dimension K=", K #PREPROCESSING tic = time.clock() dol = Dolphinn(P, D, K)
"""Query a pre-built FAISS IVF index over GIST and report 1-NN accuracy."""
import faiss
import numpy as np
from faiss.contrib.ondisk import merge_ondisk
from utils import fvecs_read, ivecs_read

# Search parameters: neighbours returned per query / inverted lists probed.
K_NEAREST = 5
N_PROBE = 80

print("loading query vectors...")
queries = fvecs_read("../gist/gist_query.fvecs")

# Load the populated index and widen the search to N_PROBE inverted lists.
searcher = faiss.read_index("../faiss/populated.index")
searcher.nprobe = N_PROBE

print(f"getting nearest neighbors for {queries.shape[0]} vectors...")
_, neighbor_ids = searcher.search(queries, K_NEAREST)

# Simple benchmark of the quality of the search: compare each query's top
# hit against the published ground truth.
ground_truth = ivecs_read("../gist/gist_groundtruth.ivecs")
print("Top1 accuracy on the 1-NN search: ",
      np.mean(neighbor_ids[:, 0] == ground_truth[:, 0]))
"""Brute-force exact k-NN over GIST as a ground-truth sanity benchmark."""
import numpy as np
from utils import fvecs_read, ivecs_read

# Number of nearest neighbours retrieved per query (was a duplicated
# magic 5 in the preallocation below).
K = 5

print("loading base vectors...")
xb = fvecs_read("../gist/gist_base.fvecs")
print("loading query vectors...")
xq = fvecs_read("../gist/gist_query.fvecs")


def find_neighbors(xb, xq, k=5):
    """Return indices of the *k* base vectors nearest to one query.

    Parameters
    ----------
    xb : ndarray, shape (n_base, dim)
        Base vectors.
    xq : ndarray, shape (dim,)
        A single query vector, broadcast against every row of xb.
    k : int
        Number of neighbours to return, nearest-first.
    """
    distances = np.linalg.norm(xb - xq, axis=1)
    # Passing a range as kth makes argpartition fully sort the first k
    # positions, so the slice is already in ascending-distance order.
    return np.argpartition(distances, range(0, k))[:k]


print(f"getting nearest neighbors for {xq.shape[0]} vectors...")
# FIX: indices are row numbers, so allocate an integer array (np.zeros
# defaults to float64) and size it with the shared K constant.
indices = np.zeros((xq.shape[0], K), dtype=np.int64)
for i in range(xq.shape[0]):
    indices[i, :] = find_neighbors(xb, xq[i], K)

# Simple benchmark of the quality of the search
iqt = ivecs_read("../gist/gist_groundtruth.ivecs")
print("Top1 accuracy on the 1-NN search: ", np.mean(indices[:, 0] == iqt[:, 0]))
"""Train a FAISS IVF index on GIST and write one on-disk block per batch."""
from pathlib import Path

import faiss
import numpy as np
from faiss.contrib.ondisk import merge_ondisk
from utils import fvecs_read

# create faiss directory if it doesn't exist
Path("../faiss").mkdir(parents=True, exist_ok=True)

print("loading input vectors...")
xb = fvecs_read("../gist/gist_base.fvecs")

# 4000 inverted lists over flat (uncompressed) vectors.
index = faiss.index_factory(xb.shape[1], "IVF4000,Flat")

batch_size = 100000
print("training faiss index...")
# The coarse quantizer is trained on the first batch only — assumes that
# batch is representative of the whole dataset.
index.train(xb[0:batch_size])
faiss.write_index(index, "../faiss/trained.index")

# FIX: round up so a final partial batch is not silently dropped when the
# dataset size is not an exact multiple of batch_size (floor division left
# up to batch_size-1 tail vectors unindexed).
n_batches = (xb.shape[0] + batch_size - 1) // batch_size
for i in range(n_batches):
    # Start each block from the freshly trained (empty) index so every
    # block_{i}.index holds exactly one batch, ready for merge_ondisk.
    index = faiss.read_index("../faiss/trained.index")
    # Clamp the id range so it matches a short final slice.
    index.add_with_ids(
        xb[i * batch_size:(i + 1) * batch_size],
        np.arange(i * batch_size, min((i + 1) * batch_size, xb.shape[0])),
    )
    print(f"writing block_{i}.index with {i*batch_size} as starting index")
    faiss.write_index(index, f"../faiss/block_{i}.index")