Example #1
0
def read_data(full_data_path):
	"""Load the base and query vector files found in *full_data_path*.

	Scans the directory for one file whose name contains "query" and one
	whose name contains "base", then reads each with ivecs_read or
	fvecs_read depending on whether "ivecs" appears in its filename.

	Returns:
		(base_vectors, query_vectors) tuple.

	Raises:
		FileNotFoundError: if no matching query or base file is present.
	"""
	all_files = glob(full_data_path + "/*")
	query_files = [x for x in all_files if "query" in x]
	base_files = [x for x in all_files if "base" in x]
	if not query_files or not base_files:
		raise FileNotFoundError(
			"expected files containing 'query' and 'base' in " + full_data_path
		)

	# BUG FIX: the original tested `"ivecs" in query_file` against the
	# *list* of paths (exact-element membership, effectively always
	# False), so .ivecs files were read with fvecs_read.  Test the
	# filename string instead.
	def _read_vectors(path):
		return ivecs_read(path) if "ivecs" in path else fvecs_read(path)

	query_vectors = _read_vectors(query_files[0])
	base_vectors = _read_vectors(base_files[0])

	return base_vectors, query_vectors
Example #2
0
#Ioannis Psarros
#
# NOTE(review): converted from Python 2 (print statement, time.clock) to
# Python 3 so this snippet runs alongside the other Python 3 examples.
import time
import utils as fr
import numpy as np
import bruteforce as bf
from dolphinn import *
num_of_probes = 20  ###########################
M = 1  ##########################

#READ FILES
#D1: data dimension, P: dataset
#D2: query dimension, Q: queryset
(D1, P) = fr.fvecs_read("siftsmall/siftsmall_base.fvecs")
(D2, Q) = fr.fvecs_read("siftsmall/siftsmall_query.fvecs")
if D1 != D2:
    raise IOError("Data points and query points are of different dimension")
D = D1

#CHANGE OF ORIGIN
#find the mean of randomly sampled points
m = fr.findmean(P, D, 10)
#then consider this mean as the origin
P = fr.isotropize(P, D, m)
Q = fr.isotropize(Q, D, m)
# Hypercube dimension: log2(n) - 2 bits for n data points.
K = int(np.log2(len(P))) - 2  ##########################
print("New dimension K=", K)

#PREPROCESSING
tic = time.perf_counter()  # time.clock() was removed in Python 3.8
dol = Dolphinn(P, D, K)
Example #3
0
import faiss
import numpy as np
from faiss.contrib.ondisk import merge_ondisk

from utils import fvecs_read, ivecs_read

# Load the query set, then search a prebuilt on-disk IVF index for the
# k nearest neighbors of every query vector.
print("loading query vectors...")
xq = fvecs_read("../gist/gist_query.fvecs")

# Restore the populated index and widen the probe count before searching.
index = faiss.read_index("../faiss/populated.index")
index.nprobe = 80
k = 5

print(f"getting nearest neighbors for {xq.shape[0]} vectors...")
D, I = index.search(xq, k)

# Simple benchmark of the quality of the search: compare each query's
# top-1 result against the precomputed ground truth.
iqt = ivecs_read("../gist/gist_groundtruth.ivecs")

print("Top1 accuracy on the 1-NN search: ", np.mean(I[:, 0] == iqt[:, 0]))
Example #4
0
import numpy as np

from utils import fvecs_read, ivecs_read

# Load the GIST base (database) and query vector sets from disk.
print("loading base vectors...")
xb = fvecs_read("../gist/gist_base.fvecs")
print("loading query vectors...")
xq = fvecs_read("../gist/gist_query.fvecs")


def find_neighbors(xb, xq, k=5):
    """Return the indices of the k base vectors in *xb* nearest to *xq*.

    Distances are Euclidean (L2); the returned indices are ordered from
    closest to farthest.
    """
    dists = np.linalg.norm(xb - xq, axis=1)
    # argpartition with a range kth sorts the first k positions exactly.
    order = np.argpartition(dists, range(0, k))
    return order[:k]


print(f"getting nearest neighbors for {xq.shape[0]} vectors...")
# Collect the top-k neighbor ids for every query.  Use an integer dtype:
# the original float zeros happened to work for the == comparison below
# but stored integer indices as floats.  Name k instead of duplicating
# the literal 5 from find_neighbors' default.
k = 5
indices = np.zeros((xq.shape[0], k), dtype=int)
for i in range(xq.shape[0]):
    indices[i, :] = find_neighbors(xb, xq[i], k)

# Simple benchmark of the quality of the search
iqt = ivecs_read("../gist/gist_groundtruth.ivecs")
print("Top1 accuracy on the 1-NN search: ", np.mean(indices[:, 0] == iqt[:, 0]))
Example #5
0
from pathlib import Path

import faiss
import numpy as np
from faiss.contrib.ondisk import merge_ondisk

from utils import fvecs_read

# create faiss directory if it doesn't exist
Path("../faiss").mkdir(parents=True, exist_ok=True)

print("loading input vectors...")
xb = fvecs_read("../gist/gist_base.fvecs")

index = faiss.index_factory(xb.shape[1], "IVF4000,Flat")

batch_size = 100000

# Train the coarse quantizer on the first batch only; IVF training does
# not need the whole dataset.
print("training faiss index...")
index.train(xb[0:batch_size])
faiss.write_index(index, "../faiss/trained.index")

# BUG FIX: floor division dropped the final partial batch whenever the
# dataset size is not an exact multiple of batch_size, silently leaving
# those vectors out of the index.  Use ceiling division; the slice below
# naturally shortens for the last batch, and the id range is clipped to
# match.
n_batches = -(-xb.shape[0] // batch_size)
for i in range(n_batches):
    index = faiss.read_index("../faiss/trained.index")
    index.add_with_ids(
        xb[i * batch_size:(i + 1) * batch_size],
        np.arange(i * batch_size, min((i + 1) * batch_size, xb.shape[0])),
    )
    print(f"writing block_{i}.index with {i*batch_size} as starting index")
    faiss.write_index(index, f"../faiss/block_{i}.index")