import time
from collections import OrderedDict
from itertools import chain, repeat
from typing import Dict, List, Optional, Tuple, Union

import faiss
import numpy as np


def speed_test_ms_per_query(index: faiss.Index,
                            query: Optional[np.ndarray] = None,
                            ksearch: int = 40,
                            timout_s: Union[float, int] = 5.0) -> float:
    """Evaluate the average search speed of the index, in milliseconds per query, without batching."""

    nb_samples = 2_000
    if query is None:
        query = np.random.rand(nb_samples, index.d).astype("float32")

    count = 0
    # Repeat the query set enough times to reach at least nb_samples single queries.
    nb_repeat = 1 + (nb_samples - 1) // query.shape[0]

    start_time = time.perf_counter()
    for one_query in chain.from_iterable(repeat(query, nb_repeat)):
        _, _ = index.search(np.expand_dims(one_query, 0), ksearch)
        count += 1
        if time.perf_counter() - start_time > timout_s:
            break

    return (time.perf_counter() - start_time) / count * 1000.0
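# Usage sketch (an assumption for illustration, not part of the original code):
# time single-query searches against a small random flat inner-product index.
def _example_speed_test_ms_per_query() -> None:
    d = 64
    xb = np.random.rand(10_000, d).astype("float32")
    index = faiss.IndexFlatIP(d)
    index.add(xb)
    print(f"{speed_test_ms_per_query(index, ksearch=10, timout_s=2.0):.3f} ms/query")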
def r_recall_at_r_single(
    query: np.ndarray,
    ground_truth: np.ndarray,
    other_index: faiss.Index,
    r_max: int = 40,
    eval_item_ids: Optional[np.ndarray] = None,
) -> List[int]:
    """Compute an R-recall@R array for each R in range [1, r_max]."""
    # O(r_max)
    _, inds = other_index.search(np.expand_dims(query, 0), r_max)
    res = inds[0]

    recall_count = []
    s_true = set()
    s_pred = set()
    tot = 0
    for p_true, p_pred in zip(ground_truth[:r_max], res):
        if eval_item_ids is not None and p_pred != -1:
            p_pred = eval_item_ids[p_pred]
        if p_true == p_pred and p_true != -1:
            tot += 1
        else:
            if p_true in s_pred and p_true != -1:
                tot += 1
            if p_pred in s_true and p_pred != -1:
                tot += 1

        s_true.add(p_true)
        s_pred.add(p_pred)
        recall_count.append(tot)

    return recall_count
def one_recall_at_r_single(
    query: np.ndarray,
    ground_truth: np.ndarray,
    other_index: faiss.Index,
    r_max: int = 40,
    eval_item_ids: Optional[np.ndarray] = None,
) -> List[int]:
    """Compute a 1-recall@R array for each R in range [1, r_max] for a single query."""
    # O(r_max)
    _, inds = other_index.search(np.expand_dims(query, 0), 1)
    first = inds[0][0]
    if eval_item_ids is not None and first != -1:
        first = eval_item_ids[first]

    # Return an all-zero list if no item is found by other_index.
    if first == -1:
        return [0 for _ in ground_truth[:r_max]]

    recall_count = []
    seen = False
    for p_true in ground_truth[:r_max]:
        if p_true == first:
            seen = True
        recall_count.append(1 if seen else 0)

    return recall_count
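# Usage sketch (an assumption for illustration, not part of the original code):
# exact ground truth from a flat index, evaluated against an IVF index with the
# two single-query recall helpers above.
def _example_single_query_recall() -> None:
    d, nb, r_max = 32, 5_000, 40
    xb = np.random.rand(nb, d).astype("float32")
    query = np.random.rand(d).astype("float32")

    exact = faiss.IndexFlatL2(d)
    exact.add(xb)
    _, ground_truth = exact.search(query[None, :], r_max)

    approx = faiss.index_factory(d, "IVF64,Flat")
    approx.train(xb)
    approx.add(xb)
    approx.nprobe = 4

    print(r_recall_at_r_single(query, ground_truth[0], approx, r_max=r_max))
    print(one_recall_at_r_single(query, ground_truth[0], approx, r_max=r_max))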
def one_recall_at_r(
    query: np.ndarray,
    ground_truth: np.ndarray,
    other_index: faiss.Index,
    r_max: int = 40,
    eval_item_ids: Optional[np.ndarray] = None,
) -> np.ndarray:
    """Compute the 1-recall@R, averaged over all queries, for each R in range [1, r_max]."""
    # O(r_max)
    if r_max <= 0:
        return np.zeros((0,))

    # Only the first retrieved neighbor of each query matters for 1-recall@R.
    _, first = other_index.search(query, 1)

    if eval_item_ids is not None:
        first = np.vectorize(lambda e: eval_item_ids[e] if e != -1 else -1)(first)  # type: ignore

    recall_array = np.cumsum(
        (ground_truth[:, :r_max] == first) & (first != -1), axis=-1)
    avg_recall = np.mean(recall_array, axis=0)

    return avg_recall
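# Usage sketch (an assumption for illustration, not part of the original code):
# batched 1-recall@R averaged over a set of queries, against an HNSW index.
def _example_one_recall_at_r() -> None:
    d, nb, nq, r_max = 32, 5_000, 100, 40
    xb = np.random.rand(nb, d).astype("float32")
    queries = np.random.rand(nq, d).astype("float32")

    exact = faiss.IndexFlatL2(d)
    exact.add(xb)
    _, ground_truth = exact.search(queries, r_max)

    approx = faiss.index_factory(d, "HNSW32,Flat")
    approx.add(xb)

    print(one_recall_at_r(queries, ground_truth, approx, r_max=r_max))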
def _dist_mz_interval(
        index: faiss.Index, vectors: np.ndarray, precursor_mzs: np.ndarray,
        batch_size: int, n_neighbors: int, n_neighbors_ann: int,
        precursor_tol_mass: float, precursor_tol_mode: str,
        distances: np.ndarray, indices: np.ndarray, indptr: np.ndarray,
        indptr_i: int) -> None:
    """
    Compute distances to the nearest neighbors for the given precursor m/z
    interval.

    Parameters
    ----------
    index : faiss.Index
        The NN index used to efficiently find distances to similar spectra.
    vectors : np.ndarray
        The spectrum vectors to be queried against the NN index.
    precursor_mzs : np.ndarray
        Precursor m/z's of the spectra corresponding to the given vectors.
    batch_size : int
        The number of vectors to be simultaneously queried.
    n_neighbors : int
        The final (maximum) number of neighbors to retrieve for each vector.
    n_neighbors_ann : int
        The number of neighbors to retrieve using the ANN index. This can
        exceed the final number of neighbors (`n_neighbors`) to maximize the
        number of neighbors within the precursor m/z tolerance.
    precursor_tol_mass : float
        The precursor tolerance mass for vectors to be considered as
        neighbors.
    precursor_tol_mode : str
        The unit of the precursor m/z tolerance ('Da' or 'ppm').
    distances : np.ndarray
        The nearest neighbor distances. See `scipy.sparse.csr_matrix` (`data`).
    indices : np.ndarray
        The column indices for the nearest neighbor distances. See
        `scipy.sparse.csr_matrix`.
    indptr : np.ndarray
        The index pointers for the nearest neighbor distances. See
        `scipy.sparse.csr_matrix`.
    indptr_i : int
        The current start index in `indptr`.
    """
    for batch_start in range(0, vectors.shape[0], batch_size):
        batch_stop = min(batch_start + batch_size, index.ntotal)
        # Find nearest neighbors using ANN index searching.
        # noinspection PyArgumentList
        nn_dists, nn_idx_ann = index.search(vectors[batch_start:batch_stop],
                                            n_neighbors_ann)
        # Filter the neighbors based on the precursor m/z tolerance and assign
        # distances.
        _filter_neighbors_mz(
            precursor_mzs, batch_start, batch_stop, precursor_tol_mass,
            precursor_tol_mode, nn_dists, nn_idx_ann, n_neighbors, distances,
            indices, indptr, indptr_i + batch_start)
def search_speed_test(index: faiss.Index,
                      query: Optional[np.ndarray] = None,
                      ksearch: int = 40,
                      timout_s: Union[float, int] = 10.0) -> Dict[str, float]:
    """Return the average and 99th-percentile single-query search speed, in milliseconds."""

    nb_samples = 2_000
    if query is None:
        query = np.random.rand(nb_samples, index.d).astype("float32")

    test_start_time_s = time.perf_counter()
    speed_list_ms = []  # in milliseconds
    # Repeat the query set enough times to reach at least nb_samples single queries.
    nb_repeat = 1 + (nb_samples - 1) // query.shape[0]
    for one_query in chain.from_iterable(repeat(query, nb_repeat)):
        start_time_s = time.perf_counter()  # high precision
        _, _ = index.search(np.expand_dims(one_query, 0), ksearch)
        end_time_s = time.perf_counter()
        search_time_ms = 1000.0 * (end_time_s - start_time_s)
        speed_list_ms.append(search_time_ms)
        if time.perf_counter() - test_start_time_s > timout_s:
            break

    speed_list_ms = np.array(speed_list_ms)

    speed_infos = {
        "avg_search_speed_ms": np.average(speed_list_ms),
        "99p_search_speed_ms": np.quantile(speed_list_ms, 0.99),
    }

    return speed_infos
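# Usage sketch (an assumption for illustration, not part of the original code):
# average and 99th-percentile latency of an IVF index at a given nprobe.
def _example_search_speed_test() -> None:
    d = 64
    xb = np.random.rand(50_000, d).astype("float32")
    index = faiss.index_factory(d, "IVF256,Flat")
    index.train(xb)
    index.add(xb)
    index.nprobe = 8
    print(search_speed_test(index, ksearch=10, timout_s=2.0))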
def knn(
    index: faiss.Index,
    embedding: np.ndarray,
    labels2captions: Union[np.ndarray, Dict],
    top_k: int = 3,
    k: int = 1,
):
    """
    Performs kNN classification against the given index.

    Args:
        index (faiss.Index): Index object.
        embedding (np.ndarray): Embedding query.
        labels2captions (Dict[int, str]): Mapping from label to caption.
        top_k (int): Top K results to return.
        k (int): K parameter in kNN.

    Returns:
        List[Dict]: Closest neighbors.
    """
    results = []

    # Search for the closest embeddings in terms of inner-product distance.
    nn_distances, nn_labels = index.search(embedding[np.newaxis, ...], k=index.ntotal)
    # Convert the (assumed normalized) inner-product similarities to angular distances.
    nn_distances = np.clip(nn_distances, 0.0, 1.0)
    nn_distances = np.arccos(nn_distances)
    nn_distances = np.squeeze(nn_distances)
    nn_labels = np.squeeze(nn_labels)

    true_label = None
    top_k = min(top_k, len(np.unique(nn_labels)))
    for _ in range(top_k):
        # Drop the class predicted in the previous round before picking the next one.
        if true_label is not None:
            not_equal_indcs = np.where(nn_labels != true_label)[0]
            nn_labels = nn_labels[not_equal_indcs]
            nn_distances = nn_distances[not_equal_indcs]

        if nn_labels.ndim == 0:
            true_label = int(nn_labels)
            true_caption = labels2captions[true_label]
            closest_distance = float(nn_distances)
        else:
            # Take the first k neighbor classes.
            knn_labels = nn_labels[:k]
            # Majority vote among them. np.unique is used instead of
            # scipy.stats.mode (whose return shape changed across SciPy
            # versions); ties still go to the smallest label.
            values, counts = np.unique(knn_labels, return_counts=True)
            true_label = int(values[np.argmax(counts)])
            true_caption = labels2captions[true_label]
            closest_index = nn_labels.tolist().index(true_label)
            closest_distance = nn_distances[closest_index]

        result = {
            "label": true_label,
            "caption": true_caption,
            "distance": closest_distance,
        }
        results.append(result)

    return results
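# Usage sketch (an assumption for illustration, not part of the original code):
# an inner-product index over L2-normalized embeddings, with class labels stored
# as ids so that search results can be looked up in labels2captions.
def _example_knn() -> None:
    d, n_per_class = 128, 50
    labels2captions = {0: "cat", 1: "dog", 2: "bird"}

    embeddings = np.random.rand(len(labels2captions) * n_per_class, d).astype("float32")
    faiss.normalize_L2(embeddings)
    class_ids = np.repeat(np.arange(len(labels2captions)), n_per_class).astype("int64")

    index = faiss.IndexIDMap(faiss.IndexFlatIP(d))
    index.add_with_ids(embeddings, class_ids)

    query = embeddings[0]  # a known "cat" embedding
    for result in knn(index, query, labels2captions, top_k=2, k=5):
        print(result)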
def gen_stats(self, dataset_idx: OrderedDict, dataset_doc_ids: List,
              dataset_embeddings: np.ndarray, faiss_index: faiss.Index,
              annos: dict,
              distance_threshold: float) -> Tuple[dict, dict, dict]:
    """
    Generate count stats from the dataset, grouped by different annotation types.

    :param dataset_idx: The dictionary mapping a document id to its index in dataset_doc_ids
    :param dataset_doc_ids: Array of document ids
    :param dataset_embeddings: Numpy array of document embeddings
    :param faiss_index: Faiss index
    :param annos: The dictionary mapping each annotation type to a set of document ids
    :param distance_threshold: A threshold to exclude dissimilar query results
    :return: A dictionary grouping document ids by annotation type,
             a dictionary grouping not-yet-sampled document ids by annotation type,
             and the count stats for each annotation type
    """
    distances = {type_name: dict() for type_name in self.grouped_ids.keys()}

    max_query_res = min(int(len(dataset_embeddings) * 0.8), self.MAX_QUERY_RES)

    print('Querying similar document embeddings...')
    for type_name, doc_ids in annos.items():
        subset_embeddings = np.array(
            [dataset_embeddings[dataset_idx[doc_id]] for doc_id in doc_ids])
        for i in range(0, len(subset_embeddings)):
            res_distances, res_doc_idx_ids = faiss_index.search(
                subset_embeddings[i:i + 1], max_query_res)
            for j in range(0, len(res_distances[0])):
                res_d = res_distances[0][j]
                # Results are sorted by distance; stop once the threshold is exceeded.
                if res_d > distance_threshold:
                    break
                doc_id = dataset_doc_ids[res_doc_idx_ids[0][j]]
                self.grouped_ids[type_name].add(doc_id)
                # Update the distance of a candidate doc to the closest doc in the
                # reviewed documents.
                if doc_id not in distances[type_name] or res_d < distances[type_name][doc_id]:
                    distances[type_name][doc_id] = res_d

    # Solve overlapping candidates: keep each document only in the annotation type
    # where it is closest to a reviewed document and remove it from the others.
    print('Solve overlapping candidates...')
    for doc_id in dataset_doc_ids:
        shortest_distance = float('inf')
        closest_type = None
        for type_name, type_distances in distances.items():
            if doc_id in type_distances and type_distances[doc_id] < shortest_distance:
                shortest_distance = type_distances[doc_id]
                closest_type = type_name
        for type_name, type_distances in distances.items():
            if doc_id in type_distances and type_name != closest_type:
                self.grouped_ids[type_name].remove(doc_id)

    available_outscope_ids = set(dataset_doc_ids)
    # Identify the documents that haven't been reviewed.
    print("identify the documents haven't been reviewed")
    for type_name, doc_ids in self.grouped_ids.items():
        available_outscope_ids = available_outscope_ids - doc_ids
        self.new_ids[type_name] = doc_ids - self.previous_sampled_ids

    self.current_stats = {
        'all_counts': {
            type_name: len(value)
            for type_name, value in self.grouped_ids.items()
        },
        'new_counts': {
            type_name: len(value)
            for type_name, value in self.new_ids.items()
        }
    }
    self.available_not_contain = len(available_outscope_ids)
    self.current_stats['all_counts']['not_contain'] = self.available_not_contain
    self.new_available_not_contain = len(available_outscope_ids -
                                         self.previous_sampled_ids)
    self.current_stats['new_counts']['not_contain'] = self.new_available_not_contain

    return self.grouped_ids, self.new_ids, self.current_stats