def __init__(self, metric: str, dimension: int):
    """Set up an exact-search Elastiknn model for the given metric.

    Args:
        metric: Metric name as supplied by the caller; it is passed through
            ``dealias_metric`` before being handed to ``ElastiknnModel``.
        dimension: Vector dimensionality, stored for later use and embedded
            in the algorithm name.
    """
    self.metric = metric
    self.dimension = dimension
    self.name = f"eknn-exact-metric={metric}_dimension={dimension}"

    resolved_metric = dealias_metric(metric)
    self.model = ElastiknnModel("exact", resolved_metric)

    # Holds the most recent batch query results; nothing has been run yet.
    self.batch_res = None

    # NOTE(review): presumably blocks until Elasticsearch is reachable - confirm.
    es_wait()
def __init__(self, L: int, k: int, w: int):
    """Set up an L2 LSH Elastiknn model with the given mapping parameters.

    Args:
        L: Forwarded to the model as the ``L`` mapping parameter.
        k: Forwarded to the model as the ``k`` mapping parameter.
        w: Forwarded to the model as the ``w`` mapping parameter.
    """
    # The full name depends on the query arguments, which are not known yet,
    # so only the prefix is fixed here.
    self.name_prefix = f"eknn-l2lsh-L={L}-k={k}-w={w}"
    self.name = None

    lsh_mapping = {"L": L, "k": k, "w": w}
    self.model = ElastiknnModel("lsh", "l2", mapping_params=lsh_mapping)

    # Scaling factor for vectors; 1.0 means "no scaling" until it is updated.
    self.X_max = 1.0
    self.query_params = {}
    self.batch_res = None

    # Running totals used to track query throughput.
    self.sum_query_dur = 0
    self.num_queries = 0

    # NOTE(review): presumably blocks until Elasticsearch is reachable - confirm.
    es_wait()
class L2Lsh(BaseANN):
    """Benchmark wrapper around an Elastiknn L2 LSH model.

    Vectors are divided by the corpus maximum before indexing, and every query
    path applies the same scaling so that queries and the indexed corpus live
    in the same space.
    """

    def __init__(self, L: int, k: int, w: int):
        """Create the model with the given LSH mapping parameters.

        Args:
            L: Forwarded to the model as the ``L`` mapping parameter.
            k: Forwarded to the model as the ``k`` mapping parameter.
            w: Forwarded to the model as the ``w`` mapping parameter.
        """
        self.name_prefix = f"eknn-l2lsh-L={L}-k={k}-w={w}"
        self.name = None  # set based on query args.
        self.model = ElastiknnModel("lsh", "l2", mapping_params=dict(L=L, k=k, w=w))
        self.X_max = 1.0
        self.query_params = dict()
        self.batch_res = None
        self.sum_query_dur = 0
        self.num_queries = 0
        # NOTE(review): presumably blocks until Elasticsearch is reachable - confirm.
        es_wait()

    def fit(self, X):
        """Index the corpus ``X`` after scaling it by its maximum value.

        Returns:
            Whatever ``self.model.fit`` returns.
        """
        print(f"{self.name_prefix}: indexing {len(X)} vectors")

        # I found it's best to scale the vectors into [0, 1], i.e. divide by the max.
        self.X_max = X.max()
        return self.model.fit(X / self.X_max, shards=1)

    def set_query_arguments(self, candidates: int, probes: int):
        """Configure the model for a new batch of queries.

        This gets called when starting a new batch of queries. It updates the
        name and the model's query parameters, and resets the throughput
        counters.
        """
        self.name = f"{self.name_prefix}_candidates={candidates}_probes={probes}"
        self.model.set_query_params(dict(candidates=candidates, probes=probes))
        # Reset the counters.
        self.num_queries = 0
        self.sum_query_dur = 0

    def query(self, q, n):
        """Return the ``n`` nearest neighbors of the single vector ``q``.

        Terminates the process (exit status 0) when, after more than 100
        queries, the measured throughput is below 10 queries per second.
        """
        # If QPS after 100 queries is < 10, this setting is bad and won't complete
        # within the default timeout. The comparison is written as a
        # multiplication so that a zero accumulated duration cannot raise
        # ZeroDivisionError.
        if self.num_queries > 100 and self.num_queries < 10 * self.sum_query_dur:
            print(
                "Throughput after 100 queries is less than 10 q/s. Terminating to avoid wasteful computation.",
                flush=True)
            # Raise SystemExit directly instead of calling the `exit` helper,
            # which is only injected by the `site` module.
            raise SystemExit(0)

        t0 = perf_counter()
        res = self.model.kneighbors(np.expand_dims(q, 0) / self.X_max, n)[0]
        self.sum_query_dur += perf_counter() - t0
        self.num_queries += 1
        return res

    def batch_query(self, X, n):
        """Run all queries in ``X`` at once and store the results.

        The queries are scaled by the same factor used in ``fit`` and
        ``query``; without this they would be compared against a corpus that
        lives in a different scale.
        """
        self.batch_res = self.model.kneighbors(np.asarray(X) / self.X_max, n)

    def get_batch_results(self):
        """Return the results stored by the last ``batch_query`` call (or None)."""
        return self.batch_res
def evaluate(dataset: Dataset, eknn: ElastiknnModel): n_neighbors = len(dataset.queries[0].indices) eknn.fit(dataset.corpus, shards=os.cpu_count() - 1) t0 = time() neighbors_pred = eknn.kneighbors([q.vector for q in dataset.queries], allow_missing=True, n_neighbors=n_neighbors) queries_per_sec = len(dataset.queries) / (time() - t0) recalls = [ len(set(q.indices).intersection(p)) / len(q.indices) for (q, p) in zip(dataset.queries, neighbors_pred) ] recall = sum(recalls) / len(recalls) return recall, queries_per_sec
class Exact(BaseANN):
    """Benchmark wrapper around an Elastiknn exact-search model.

    For the 'jaccard' and 'hamming' metrics, inputs are lists of indices that
    are converted to ``Vec.SparseBool`` before being sent to the model; for
    every other metric the inputs are passed through as given.
    """

    # Metrics whose inputs are converted with `_handle_sparse`.
    _SPARSE_METRICS = frozenset({'jaccard', 'hamming'})

    def __init__(self, metric: str, dimension: int):
        """Create the model.

        Args:
            metric: Metric name; passed through ``dealias_metric`` for the model.
            dimension: Vector dimensionality, used when building sparse vectors.
        """
        self.name = f"eknn-exact-metric={metric}_dimension={dimension}"
        self.metric = metric
        self.dimension = dimension
        self.model = ElastiknnModel("exact", dealias_metric(metric))
        self.batch_res = None
        # NOTE(review): presumably blocks until Elasticsearch is reachable - confirm.
        es_wait()

    def _handle_sparse(self, X):
        # convert list of lists of indices to sparse vectors.
        return [Vec.SparseBool(x, self.dimension) for x in X]

    def _prepare(self, X):
        # Convert inputs for sparse metrics; leave everything else untouched.
        if self.metric in self._SPARSE_METRICS:
            return self._handle_sparse(X)
        return X

    def fit(self, X):
        """Index the corpus ``X`` in a single shard.

        Returns:
            Whatever ``self.model.fit`` returns, for every metric. The sparse
            branch previously indexed that result with ``[0]``, unlike the
            dense branch; that subscript looked copied from ``query``.
            NOTE(review): confirm against ElastiknnModel.fit's return value.
        """
        return self.model.fit(self._prepare(X), shards=1)

    def query(self, q, n):
        """Return the ``n`` nearest neighbors of the single vector ``q``."""
        if self.metric in self._SPARSE_METRICS:
            batch = self._handle_sparse([q])
        else:
            # The model is queried with a batch, so wrap q as a batch of one.
            batch = np.expand_dims(q, 0)
        return self.model.kneighbors(batch, n)[0]

    def batch_query(self, X, n):
        """Run all queries in ``X`` at once and store the results."""
        self.batch_res = self.model.kneighbors(self._prepare(X), n)

    def get_batch_results(self):
        """Return the results stored by the last ``batch_query`` call (or None)."""
        return self.batch_res
def test_exact_jaccard_mnist(self):
    """Check exact Jaccard search output form and agreement with scikit-learn."""
    k = 20
    eknn = ElastiknnModel('exact', 'jaccard')
    eknn.fit(digits_train)

    # Query twice: once for indices only, once for indices plus similarities.
    idx_only = eknn.kneighbors(digits_validate, k)
    idx_with_sim, sims = eknn.kneighbors(digits_validate, k, return_similarity=True)

    # Both calls must agree, and the outputs must have one row per query.
    expected_shape = (digits_validate.shape[0], k)
    assert np.all(idx_only == idx_with_sim)
    assert idx_only.shape == expected_shape
    assert sims.shape == idx_with_sim.shape

    # Reference result from scikit-learn. Half as many neighbors are requested
    # on purpose, so that recall is still 1 even when equal distances cause the
    # indices to come back in a different order.
    reference = NearestNeighbors(n_neighbors=k // 2, algorithm='brute', metric='jaccard', n_jobs=1)
    reference.fit(digits_train)
    idx_reference = reference.kneighbors(digits_validate, return_distance=False)

    # Every reference neighbor must be recalled.
    recalls = self.recall(idx_reference, idx_with_sim)
    assert np.all(recalls == 1)
def lsh(dataset: Dataset, bands: int = 165, rows: int = 1, candidates: float = 1.5):
    """Evaluate a Jaccard LSH model on ``dataset``.

    Args:
        dataset: Dataset to evaluate; the neighbor count is taken from the
            length of the first query's ``indices``.
        bands: Forwarded as the ``bands`` mapping parameter.
        rows: Forwarded as the ``rows`` mapping parameter.
        candidates: Multiplier applied to the neighbor count to obtain the
            ``candidates`` query parameter (truncated to an int).

    Returns:
        The result of ``evaluate`` for the configured model.
    """
    k = len(dataset.queries[0].indices)

    # A timestamp suffix gives each run its own index name.
    index_name = f"{INDEX}-{int(time())}"
    mapping = {"bands": bands, "rows": rows}
    query = {"candidates": int(candidates * k)}

    model = ElastiknnModel(
        algorithm='lsh',
        metric='jaccard',
        n_jobs=1,
        index=index_name,
        mapping_params=mapping,
        query_params=query,
    )
    return evaluate(dataset, model)
def indexed(dataset: Dataset):
    """Evaluate a Jaccard ``sparse_indexed`` model on ``dataset``.

    Returns:
        The result of ``evaluate`` for the configured model.
    """
    # A timestamp suffix gives each run its own index name.
    index_name = f"{INDEX}-{int(time())}"
    model = ElastiknnModel(
        algorithm='sparse_indexed',
        metric='jaccard',
        n_jobs=1,
        index=index_name,
    )
    return evaluate(dataset, model)
def exact(dataset: Dataset):
    """Evaluate a Jaccard exact-search model on ``dataset``.

    Returns:
        The result of ``evaluate`` for the configured model.
    """
    # Unlike the timestamped variants, this always uses the same index name.
    index_name = f"{INDEX}-exact"
    model = ElastiknnModel(
        algorithm='exact',
        metric='jaccard',
        n_jobs=1,
        index=index_name,
    )
    return evaluate(dataset, model)