def get_nearest_hits(centers: np.ndarray, index: faiss.IndexFlatL2, k_neighbours: int) -> np.ndarray:
    """Return the k nearest neighbours from the specified index for each center."""
    _, ids = index.search(centers, k_neighbours)
    all_hits = np.array([index.reconstruct(int(i)) for i in ids.flatten()])
    # maybe need to use the array of hits on the station and build the index right here
    return all_hits
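# A minimal usage sketch for get_nearest_hits above. The hit coordinates and query
# centers are made up for illustration; only the faiss calls are real API.
import numpy as np
import faiss

hits = np.random.rand(1000, 3).astype(np.float32)    # hypothetical station hits
centers = np.random.rand(5, 3).astype(np.float32)    # hypothetical query centers

hit_index = faiss.IndexFlatL2(hits.shape[1])
hit_index.add(hits)

nearest = get_nearest_hits(centers, hit_index, k_neighbours=4)
print(nearest.shape)  # (5 * 4, 3): k reconstructed hits per center, flattened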
def rips_graph(point_cloud, k):
    point_cloud = point_cloud.astype('float32')
    _, dim = point_cloud.shape
    cpuindex = IndexFlatL2(dim)
    cpuindex.add(point_cloud)
    return cpuindex.search(point_cloud, k)
def compute_rips_np(pc: np.ndarray, k: int) -> tuple[np.ndarray, np.ndarray]:
    """Compute the delta-Rips graph."""
    pc = pc.astype(np.float32)
    cpuindex = IndexFlatL2(pc.shape[1])
    cpuindex.add(pc)
    return cpuindex.search(pc, k)
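# Illustrative call of compute_rips_np on a made-up point cloud. Note that
# IndexFlatL2 returns squared L2 distances, and each point's nearest neighbour
# is itself at distance 0.
import numpy as np

pc = np.random.rand(100, 2)
dists, neighbours = compute_rips_np(pc, k=5)
assert dists.shape == (100, 5) and neighbours.shape == (100, 5)
assert np.allclose(dists[:, 0], 0.0)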
def most_representive_patches(self, anchors, odata):
    res_anchor = []
    features = []
    for i, o in enumerate(odata):
        rlbp = self.rLBP.describe(o[0])
        glbp = self.gLBP.describe(o[1])
        blbp = self.bLBP.describe(o[2])
        acs = anchors[i]
        acs = [self.rate * ac for ac in acs]
        f = []
        for ac in acs:
            r = rlbp[ac[0]:ac[2], ac[1]:ac[3]]
            vr = self.rLBP.gethist(r)
            g = glbp[ac[0]:ac[2], ac[1]:ac[3]]
            vg = self.gLBP.gethist(g)
            b = blbp[ac[0]:ac[2], ac[1]:ac[3]]
            vb = self.bLBP.gethist(b)
            v = np.concatenate([vr, vg, vb], 0)
            f.append(v)
        f = np.array(f).astype('float32')
        # Cluster the per-anchor LBP histograms into 10 groups.
        kmeans = Kmeans(f.shape[1], 10)
        kmeans.gpu = True  # set on the instance, not the class
        kmeans.train(f)
        # For each centroid, keep the single closest patch as its representative.
        index = IndexFlatL2(f.shape[1])
        index.add(f)
        D, I = index.search(kmeans.centroids, 1)
        I = np.squeeze(I, 1)
        fis = [f[j] for j in I]
        boxes = [acs[j] for j in I]
        res_anchor.append(boxes)
        features.append(fis)
    return res_anchor, features
def rips_graph(point_cloud, delta):
    point_cloud = point_cloud.astype('float32')
    _, dim = point_cloud.shape
    index = IndexFlatL2(dim)
    index.add(point_cloud)
    return index.range_search(point_cloud, delta)
def compute_density_map_np(x: np.ndarray, k: int, scale: float) -> tuple[np.ndarray, np.ndarray]:
    """Compute the k-nearest-neighbours kernel density estimate."""
    x = x.astype(np.float32)
    index = IndexFlatL2(x.shape[1])
    index.add(x)
    values, indexes = index.search(x, k)
    result = np.sum(np.exp(-values / scale), axis=1) / (k * scale)
    return result / max(result), indexes
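# Rough sanity check for compute_density_map_np: points are sampled more densely
# around the origin, so the normalized density there should tend to be higher.
# Data and parameters are illustrative only.
import numpy as np

rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(0.0, 0.1, size=(200, 2)),
                    rng.uniform(-1.0, 1.0, size=(50, 2))])
density, neighbours = compute_density_map_np(x, k=10, scale=0.5)
assert density.max() == 1.0  # normalized by its maximum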
def __init__(self, matrix=None, path_csv='', n_neighbors=5, is_faiss=True,
             metric='minkowski', p=2, algorithm='auto', leaf_size=30):
    """
    Parameters
    ----------
    matrix : np.ndarray, default None
        Matrix of elements (n x d).
    path_csv : string, default ''
        If matrix is None, the matrix is read from this path.
    n_neighbors : int, optional (default = 5)
        Number of neighbors to use by default.
    is_faiss : bool, default True
        If True, neighbours are found with faiss, otherwise with
        sklearn's NearestNeighbors.
    metric : string or callable, default 'minkowski'
        Metric to use for distance computation. Any metric from
        scikit-learn or scipy.spatial.distance can be used. If metric is
        a callable function, it is called on each pair of instances
        (rows) and the resulting value recorded. The callable should take
        two arrays as input and return one value indicating the distance
        between them. This works for SciPy's metrics, but is less
        efficient than passing the metric name as a string.
    p : integer, optional (default = 2)
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
        equivalent to using manhattan_distance (l1), and
        euclidean_distance (l2) for p = 2. For arbitrary p,
        minkowski_distance (l_p) is used.
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional (default = 'auto')
        Algorithm used to compute the nearest neighbors. 'auto' attempts
        to decide the most appropriate algorithm based on the values
        passed to the :meth:`fit` method.
        Note: fitting on sparse input will override the setting of this
        parameter, using brute force.
    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or KDTree.
    """
    self.n_neighbors = n_neighbors
    if path_csv != '':
        matrix = pd.read_csv(path_csv)
    elif matrix is None:
        raise ValueError('Neither matrix nor path_csv was provided')
    if is_faiss:
        from faiss import IndexFlatL2
        # faiss expects float32, C-contiguous input
        matrix = np.ascontiguousarray(matrix, dtype=np.float32)
        index = IndexFlatL2(matrix.shape[1])
        index.add(matrix)
        self.nearest_distances, self.nearest_indices = index.search(matrix, n_neighbors + 1)
        # Drop the first column: each point is its own nearest neighbour.
        self.nearest_distances = self.nearest_distances[:, 1:]
        self.nearest_indices = self.nearest_indices[:, 1:]
    else:
        self.nearest_distances, self.nearest_indices = \
            NearestNeighbors(n_neighbors=n_neighbors + 1, algorithm=algorithm,
                             metric=metric, leaf_size=leaf_size,
                             p=p, n_jobs=-1).fit(matrix).kneighbors(matrix)
        # Square the distances so both branches return squared L2, and drop self-matches.
        self.nearest_distances = self.nearest_distances[:, 1:] ** 2
        self.nearest_indices = self.nearest_indices[:, 1:]
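# Standalone sketch of the faiss branch above (the surrounding class is not shown
# in this snippet, so this only reproduces the index construction on random data).
import numpy as np
from faiss import IndexFlatL2

matrix = np.random.rand(200, 16).astype(np.float32)  # faiss expects float32
index = IndexFlatL2(matrix.shape[1])
index.add(matrix)
distances, indices = index.search(matrix, 5 + 1)
# Drop the first column: each point is returned as its own nearest neighbour
# at (squared) distance 0.
distances, indices = distances[:, 1:], indices[:, 1:]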
def approximate(pc, r, tau):
    pc = pc.astype('float32')
    index = IndexFlatL2(len(pc[0]))
    index.add(pc)
    D, I = index.search(pc, r)
    result = np.sum(np.exp(-D / tau), axis=1) / (r * tau)
    return result / max(result), I
def main(args):
    # https://github.com/facebookresearch/faiss/blob/a8118acbc516b0263dde610862c806400cc48bf5/Clustering.cpp#L78-L80
    if args.num_train < max(kNumCells, kNumCentroids):
        sys.exit("Error: require at least {} training samples".format(
            max(kNumCells, kNumCentroids)))

    dataset = FeatureDirectory(root=args.features)

    train_dataset, index_dataset = random_split(
        dataset, [args.num_train, len(dataset) - args.num_train])

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers)

    index_loader = DataLoader(index_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers)

    N, C = len(train_dataset), 2048

    train_features = np.empty(shape=(N, C), dtype=np.float32)

    for i, (features, paths) in enumerate(
            tqdm(train_loader, desc="Train", unit="batch", ascii=True)):
        # Use the actual batch size so the last (possibly smaller) batch fits.
        n = features.shape[0]
        train_features[i * args.batch_size:i * args.batch_size + n] = features

    quantizer = IndexFlatL2(C)
    index = IndexIVFPQ(quantizer, C, kNumCells, kNumCentroids, kNumBitsPerIdx)
    index.do_polysemous_training = True

    print("Training index on training features", file=sys.stderr)
    index.train(train_features)

    metadata = []

    for features, paths in tqdm(index_loader, desc="Index", unit="batch", ascii=True):
        features = np.ascontiguousarray(features)
        index.add(features)

        for path in paths:
            fname = Path(path).name
            metadata.append(fname)

    print("Saving index file and metadata")
    IndexIO.save(args.index.with_suffix(".idx"), index)
    JsonIO.save(args.index.with_suffix(".json"), metadata)
def predict(index: faiss.IndexFlatL2, embeddings: np.ndarray,
            y_train: np.ndarray, k_neighbors: int) -> np.ndarray:
    # Search for the nearest neighbors in the reference set
    _, indices = index.search(embeddings, k_neighbors)  # indices: shape [N_embeddings x k_neighbors]

    # Get the labels of the found indices
    y_pred: np.ndarray = y_train[indices]  # shape: [N_embeddings x k_neighbors]

    # The label that appears most often is the predicted label
    # https://en.wikipedia.org/wiki/Mode_(statistics)
    y_pred = mode(y_pred, axis=1)[0].ravel()  # shape: [N_embeddings]

    return y_pred
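# Hypothetical end-to-end use of predict(); embeddings and labels are random here,
# and mode() in the function above is assumed to be scipy.stats.mode.
import numpy as np
import faiss

X_train = np.random.rand(500, 64).astype(np.float32)
y_train = np.random.randint(0, 10, size=500)
X_test = np.random.rand(20, 64).astype(np.float32)

index = faiss.IndexFlatL2(X_train.shape[1])
index.add(X_train)

y_pred = predict(index, X_test, y_train, k_neighbors=5)
print(y_pred.shape)  # (20,)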
def query(
    index: faiss.IndexFlatL2,
    query_embedding: np.ndarray,
    k_queries: int,
    ref_image_paths: List[str],
) -> Tuple[List[str], List[int], List[float]]:
    # Search for the nearest neighbors in the reference set
    # indices: shape [N_embeddings x k_queries]
    # distances: shape [N_embeddings x k_queries]
    distances, indices = index.search(query_embedding, k_queries)

    indices: List[int] = indices.ravel().tolist()
    distances: List[float] = distances.ravel().tolist()
    image_paths: List[str] = [ref_image_paths[i] for i in indices]

    return image_paths, indices, distances
def knn(queries: torch.Tensor, index: faiss.IndexFlatL2, keys_store: np.ndarray,
        vals_store: np.ndarray, n_tokens: int):
    """
    ## $k$-NN to get $p(w_t, c_t)$

    Here we refer to $f(\textcolor{yellowgreen}{c_t})$ as queries,
    $f(c_i)$ as keys and $w_i$ as values.
    """
    # Save the shape of queries to reshape results
    queries_shape = queries.shape

    # Flatten the `batch` and `sequence` dimensions of queries
    queries = queries.view(-1, queries_shape[-1])

    # Find the 10 nearest neighbors of $f(\textcolor{yellowgreen}{c_t})$ among $f(c_i)$.
    # `distance` is the distance given by FAISS and `idx`, $i$ is the index of it in `keys_store`.
    distance, idx = index.search(queries.numpy(), 10)

    # Get $f(c_i)$
    keys_found = queries.new_tensor(keys_store[idx])
    # Get $w_i$
    vals_found = torch.tensor(vals_store[idx]).squeeze(-1)

    # We are going to calculate the cosine similarity between normalized vectors

    # Normalize $f(c_i)$
    keys_found_n = keys_found / torch.sqrt((keys_found ** 2).sum(-1, keepdims=True) + 1e-10)
    # Normalize $f(\textcolor{yellowgreen}{c_t})$
    queries_n = queries / torch.sqrt((queries ** 2).sum(-1, keepdims=True) + 1e-10)

    # Get the dot-product, or cosine similarity
    dot_prod = (keys_found_n * queries_n.unsqueeze(1)).sum(-1)

    # Token-wise logits
    logits_token = dot_prod.new_zeros(queries.shape[0], n_tokens)
    # Scatter and accumulate token logits based on the nearest neighbors
    _ = logits_token.scatter_(dim=1, index=vals_found, src=dot_prod, reduce='add')

    # Reshape the logits
    logits_token = logits_token.reshape(queries_shape[0], queries_shape[1], -1)

    return logits_token
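# Toy invocation of knn() above with made-up key/value stores; sizes are
# illustrative only.
import numpy as np
import torch
import faiss

d, n_keys, n_tokens = 8, 64, 100
keys_store = np.random.rand(n_keys, d).astype(np.float32)      # f(c_i)
vals_store = np.random.randint(0, n_tokens, size=(n_keys, 1))  # w_i

index = faiss.IndexFlatL2(d)
index.add(keys_store)

queries = torch.rand(2, 3, d)  # [batch, sequence, d_model]
logits = knn(queries, index, keys_store, vals_store, n_tokens)
print(logits.shape)  # torch.Size([2, 3, 100])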
def calculate_precision_at_k(index: faiss.IndexFlatL2, X_test: np.ndarray,
                             y_test: np.ndarray, y_train: np.ndarray,
                             k: int) -> float:
    # Search for the nearest neighbors in the train set
    _, indices = index.search(X_test, k)  # indices: shape [N_embeddings x k_neighbors]

    y_pred = []
    for i in range(k):
        indices_at_k: np.ndarray = indices[:, i]  # [M]
        y_pred_at_k: np.ndarray = y_train[indices_at_k][:, None]  # [M x 1]
        y_pred.append(y_pred_at_k)

    y_pred: np.ndarray = np.hstack(y_pred)  # [M x k]
    y_test = np.hstack((y_test[:, None], ) * k)  # [M x k]

    precision_at_k: float = ((y_pred == y_test).sum(axis=1) / k).mean().item() * 100
    return precision_at_k
def calculate_topk_accuracy(index: faiss.IndexFlatL2, X_test: np.ndarray,
                            y_test: np.ndarray, y_train: np.ndarray,
                            top_k: int) -> float:
    # Search for the nearest neighbors in the train set
    _, indices = index.search(X_test, top_k)  # indices: shape [N_embeddings x k_neighbors]

    y_pred = []
    for i in range(top_k):
        indices_at_k: np.ndarray = indices[:, i]  # [M]
        y_pred_at_k: np.ndarray = y_train[indices_at_k][:, None]  # [M x 1]
        y_pred.append(y_pred_at_k)

    y_pred: np.ndarray = np.hstack(y_pred)  # [M x k]
    y_test = np.hstack((y_test[:, None], ) * top_k)  # [M x k]

    n_predictions: int = y_pred.shape[0]
    n_true_predictions: int = ((y_pred == y_test).sum(axis=1) > 0).sum().item()

    topk_accuracy: float = n_true_predictions / n_predictions * 100
    return topk_accuracy
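# Illustrative evaluation with the two metrics above on random embeddings; with
# random data both numbers will hover around chance level.
import numpy as np
import faiss

X_train = np.random.rand(1000, 32).astype(np.float32)
y_train = np.random.randint(0, 5, size=1000)
X_test = np.random.rand(100, 32).astype(np.float32)
y_test = np.random.randint(0, 5, size=100)

index = faiss.IndexFlatL2(X_train.shape[1])
index.add(X_train)

print(calculate_precision_at_k(index, X_test, y_test, y_train, k=5))
print(calculate_topk_accuracy(index, X_test, y_test, y_train, top_k=5))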
def approximate(pc, r):
    """
    Let f be the density function of the probability distribution that generated
    the point cloud; this returns f'(pc) for an approximation f' of f.

    Parameters
    ----------
    pc : ndarray of float32
        Point cloud.
    r : float
        Radius of the neighbourhood used for the estimate.

    Returns
    -------
    approximated_values : ndarray of float32
        Value of the approximated density function at each point of the point cloud.
    """
    pc = pc.astype('float32')
    size = len(pc)
    index = IndexFlatL2(len(pc[0]))
    index.add(pc)
    # range_search returns (lims, D, I); lim[i + 1] - lim[i] is the number of
    # neighbours of point i within radius r.
    lim = index.range_search(pc, r)[0]
    return array([lim[i + 1] - lim[i] for i in range(size)]) / (size * 2 * r)
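# Quick check of approximate() on a made-up 1-D point cloud. Note that for
# IndexFlatL2, range_search compares the radius against squared L2 distances.
import numpy as np

pc = np.random.rand(50, 1)
density = approximate(pc, r=0.01)
print(density.shape)  # (50,)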
def initalize_prodige(X, knn_edges=64, random_edges=32, verbose=False, **kwargs):
    # Creates a graph embedding from an object-feature matrix and initializes
    # edge weights with euclidean distances |x_i - x_j|_2.
    # Two types of edges:
    #  - knn edges: connect vertices to their nearest neighbors
    #  - random edges: connect random pairs of vertices to get the small-world property
    # See Section 3.3: Scalability
    # X: matrix of samples
    # knn_edges: edges per vertex to its nearest neighbours via FAISS or sklearn
    # random_edges: random edges per vertex for the small-world property
    # kwargs: other args sent to GraphEmbedding()
    # returns: initialized GraphEmbedding
    num_vectors, vector_dim = X.shape
    X = np.require(X, dtype=np.float32, requirements=['C_CONTIGUOUS'])

    if verbose:
        print("Searching for nearest neighbors")
    try:
        from faiss import IndexFlatL2
        index = IndexFlatL2(vector_dim)
        index.add(X)
        # faiss returns squared L2 distances
        neighbor_distances, neighbor_indices = index.search(X, knn_edges + 1)
    except ImportError:
        print("faiss not found, using slow knn instead")
        neighbor_distances, neighbor_indices = NearestNeighbors(
            n_neighbors=knn_edges + 1).fit(X).kneighbors(X)

    if verbose:
        print("Adding knn edges")
    edges_from, edges_to, distances = [], [], []
    for vertex_i in np.arange(num_vectors):
        for neighbor_i, distance in zip(neighbor_indices[vertex_i], neighbor_distances[vertex_i]):
            if vertex_i == neighbor_i:
                continue  # prevent loops
            if neighbor_i == -1:
                continue  # padding
            distance **= 0.5
            edges_from.append(vertex_i)
            edges_to.append(neighbor_i)
            distances.append(distance)

    if random_edges != 0:
        if verbose:
            print("Adding random edges")
        random_from = np.random.randint(0, num_vectors, num_vectors * random_edges)
        random_to = np.random.randint(0, num_vectors, num_vectors * random_edges)
        for vertex_i, neighbor_i in zip(random_from, random_to):
            if vertex_i != neighbor_i:
                distance = np.sum((X[vertex_i] - X[neighbor_i]) ** 2) ** 0.5
                edges_from.append(vertex_i)
                edges_to.append(neighbor_i)
                distances.append(distance)

    if verbose:
        print("Deduplicating edges")
    # remove duplicate edges, keeping the last distance seen for each undirected pair
    unique_edges_dict = {}
    for from_i, to_i, distance in zip(edges_from, edges_to, distances):
        edge_iijj = int(from_i), int(to_i)
        edge_iijj = tuple(sorted(edge_iijj))
        unique_edges_dict[edge_iijj] = distance

    edges_iijj, distances = zip(*unique_edges_dict.items())
    edges_from, edges_to = zip(*edges_iijj)
    edges_from, edges_to, distances = map(np.asanyarray, [edges_from, edges_to, distances])

    if verbose:
        print("Total edges: {}, mean edges per vertex: {}, mean distance: {}".format(
            len(edges_from), len(edges_from) / float(num_vectors), np.mean(distances)))

    return GraphEmbedding(edges_from, edges_to, weights=distances, **kwargs)
def main(args):
    # https://github.com/facebookresearch/faiss/blob/a8118acbc516b0263dde610862c806400cc48bf5/Clustering.cpp#L78-L80
    if args.num_train < max(kNumCells, kNumCentroids):
        sys.exit("Error: require at least {} training samples".format(
            max(kNumCells, kNumCentroids)))

    extract = FeatureExtractor(image_size=args.image_size)

    dataset = ImageDirectory(root=args.frames, transform=extract.transform)

    train_dataset, index_dataset = random_split(
        dataset, [args.num_train, len(dataset) - args.num_train])

    if len(train_dataset) > len(index_dataset) or len(train_dataset) > 0.25 * len(index_dataset):
        sys.exit("Error: training dataset too big: train={}, index={}".format(
            len(train_dataset), len(index_dataset)))

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers)

    index_loader = DataLoader(index_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers)

    N, C = len(train_dataset), 2048

    train_features = np.empty(shape=(N, C), dtype=np.float32)

    for i, (images, paths) in enumerate(
            tqdm(train_loader, desc="Train", unit="batch", ascii=True)):
        n, h, w = images.size(0), images.size(2), images.size(3)

        features = extract(images)
        features = features.data.cpu().numpy()

        # resnet50 downsamples by 2 five times
        h, w = h // 32, w // 32

        # MAC feature
        features = reduce(features, "n (h w) c -> n c", "max", n=n, h=h, w=w, c=C)

        train_features[i * args.batch_size:i * args.batch_size + n] = features

    quantizer = IndexFlatL2(C)
    index = IndexIVFPQ(quantizer, C, kNumCells, kNumCentroids, kNumBitsPerIdx)
    index.do_polysemous_training = True

    print("Training index on training features", file=sys.stderr)
    index.train(train_features)

    metadata = []

    for images, paths in tqdm(index_loader, desc="Index", unit="batch", ascii=True):
        n, h, w = images.size(0), images.size(2), images.size(3)

        # resnet50 downsamples by 2 five times
        h, w = h // 32, w // 32

        # MAC feature descriptor
        features = extract(images)
        features = reduce(features, "n (h w) c -> n c", "max", n=n, h=h, w=w, c=C)
        features = features.data.cpu().numpy()

        # C-array required for faiss FFI: tensors might not be contiguous
        features = np.ascontiguousarray(features)

        # Add a batch of (batch, 2048) MAC-pooled features to the index at once
        index.add(features)

        for path in paths:
            fname = Path(path).name
            metadata.append(fname)

    IndexIO.save(args.index.with_suffix(".idx"), index)
    JsonIO.save(args.index.with_suffix(".json"), metadata)
def make_graph_from_vectors(X, *, knn_edges, random_edges=0, virtual_vertices=0,
                            deduplicate=True, directed=True, verbose=False, squared=False,
                            GraphEmbeddingClass=GraphEmbedding, **kwargs):
    """
    Creates a graph embedding from an object-feature matrix and initializes
    edge weights with (squared) euclidean distances |x_i - x_j|^2_2.

    The graph consists of three types of edges:

    * knn edges - connecting vertices to their nearest neighbors
    * random edges - connecting random pairs of vertices to get the small-world property
    * edges to virtual_vertices - adds synthetic vertices to the graph and connects
      them with all other vertices (initialized with k-means)

    :param X: data matrix [num_vectors, vector_dim]
    :param knn_edges: connects each vertex to this many nearest neighbors
    :param random_edges: adds this many random edges per vertex (long edges for the
        small-world property)
    :param virtual_vertices: adds this many new vertices connected to all points,
        initialized as centroids
    :param deduplicate: if enabled (default), removes all duplicate edges (e.g. an edge
        first added via :knn_edges: and then added again via :random_edges:)
    :param directed: if disabled, treats (i, j) and (j, i) as the same edge
    :param verbose: if enabled, prints progress to stdout
    :param squared: if True, uses squared euclidean distance, otherwise plain euclidean distance
    :param kwargs: other keyword args sent to :GraphEmbedding.__init__:
    :rtype: GraphEmbedding
    """
    num_vectors, vector_dim = X.shape
    X = np.require(check_numpy(X), dtype=np.float32, requirements=['C_CONTIGUOUS'])

    if virtual_vertices != 0:
        if verbose:
            print("Creating virtual vertices by k-means")
        X_clusters = KMeans(virtual_vertices).fit(X).cluster_centers_
        X = np.concatenate([X, X_clusters])

    if verbose:
        print("Searching for nearest neighbors")
    try:
        from faiss import IndexFlatL2
        index = IndexFlatL2(vector_dim)
        index.add(X)
        # faiss returns squared L2 distances
        neighbor_distances, neighbor_indices = index.search(X, knn_edges + 1)
    except ImportError:
        warn("faiss not found, using slow knn instead")
        neighbor_distances, neighbor_indices = NearestNeighbors(
            n_neighbors=knn_edges + 1).fit(X).kneighbors(X)

    if not squared:
        neighbor_distances **= 0.5

    if verbose:
        print("Adding knn edges")
    edges_from, edges_to, distances = [], [], []
    for vertex_i in np.arange(num_vectors):
        for neighbor_i, distance in zip(neighbor_indices[vertex_i], neighbor_distances[vertex_i]):
            if vertex_i == neighbor_i:
                continue  # forbid loops
            if neighbor_i == -1:
                continue  # ANN engine uses -1 for padding
            edges_from.append(vertex_i)
            edges_to.append(neighbor_i)
            distances.append(distance)

    if random_edges != 0:
        if verbose:
            print("Adding random edges")
        random_from = np.random.randint(0, num_vectors, num_vectors * random_edges)
        random_to = np.random.randint(0, num_vectors, num_vectors * random_edges)
        for vertex_i, neighbor_i in zip(random_from, random_to):
            if vertex_i != neighbor_i:
                distance = np.sum((X[vertex_i] - X[neighbor_i]) ** 2)
                if not squared:
                    distance **= 0.5
                edges_from.append(vertex_i)
                edges_to.append(neighbor_i)
                distances.append(distance)

    if deduplicate:
        if verbose:
            print("Deduplicating edges")
        unique_edges_dict = {}  # {(from_i, to_i) : distance(i, j)}
        for from_i, to_i, distance in zip(edges_from, edges_to, distances):
            edge_iijj = int(from_i), int(to_i)
            if not directed:
                edge_iijj = tuple(sorted(edge_iijj))
            unique_edges_dict[edge_iijj] = distance

        edges_iijj, distances = zip(*unique_edges_dict.items())
        edges_from, edges_to = zip(*edges_iijj)

    edges_from, edges_to, distances = map(np.asanyarray, [edges_from, edges_to, distances])

    if verbose:
        print("Total edges: {}, mean edges per vertex: {}, mean distance: {}".format(
            len(edges_from), len(edges_from) / float(num_vectors), np.mean(distances)))

    return GraphEmbeddingClass(edges_from, edges_to, initial_weights=distances,
                               directed=directed, **kwargs)
def main(args):
    # https://github.com/facebookresearch/faiss/blob/a8118acbc516b0263dde610862c806400cc48bf5/Clustering.cpp#L78-L80
    if args.num_train < max(args.num_centroids, args.code_size):
        sys.exit("💥 Require at least {} training samples".format(
            max(args.num_centroids, args.code_size)))

    paths = [path for path in args.features.iterdir() if path.is_file()]

    print("Loading clip features from {} video feature files at {}".format(
        len(paths), args.features), file=sys.stderr)

    # Two passes over videos of arbitrary number of clips:
    # - First pass reservoir samples clip features, trains index
    # - Second pass adds remaining clip features to index
    #
    # This way we can properly randomly select train samples and
    # at the same time keep our peak memory consumption reasonable.

    train_samples = StreamSampler(args.num_train)

    # 1st pass

    total_clips = 0

    for i, path in enumerate(tqdm(paths)):
        feats = np.load(path, allow_pickle=False)

        assert len(feats.shape) == 2
        assert feats.shape[1] == args.dimension
        assert feats.dtype == np.float32

        for j, feat in enumerate(feats):
            # Keep train and index datasets disjoint
            # Track train clips: ith video, jth clip
            train_samples.add((feat, (i, j)))
            total_clips += 1

    if len(train_samples) < args.num_train:
        sys.exit("💥 Not enough samples in dataset to train on; loaded {}".format(
            len(train_samples)))

    train_feats = [k for k, _ in train_samples]
    train_clips = {v for _, v in train_samples}

    train_feats = np.array(train_feats)

    assert train_feats.shape == (args.num_train, args.dimension)
    assert train_feats.dtype == np.float32

    quantizer = IndexFlatL2(args.dimension)

    index = IndexIVFPQ(quantizer, args.dimension, args.num_centroids,
                       args.code_size, args.num_bits)

    print("🚄 Training index on {} out of {} total {}-dimensional clip features".format(
        args.num_train, total_clips, args.dimension), file=sys.stderr)

    index.train(train_feats)

    del train_feats
    del train_samples

    # 2nd pass

    assert index.is_trained

    print("🔖 Adding to index {} out of {} total {}-dimensional clip features".format(
        total_clips - len(train_clips), total_clips, args.dimension), file=sys.stderr)

    metadata = []
    batch_feats = []

    for i, path in enumerate(tqdm(paths)):
        feats = np.load(path, allow_pickle=False)

        assert len(feats.shape) == 2
        assert feats.shape[1] == args.dimension
        assert feats.dtype == np.float32

        for j, feat in enumerate(feats):
            if (i, j) in train_clips:
                continue

            batch_feats.append(feat)

            # Could be more efficient than one entry per clip.
            # This way it's simple to use in the client for now.
            metadata.append({"path": str(path), "clip": j})

            if len(batch_feats) % args.batch_size == 0:
                feats_batch = np.array(batch_feats)
                batch_feats.clear()
                index.add(feats_batch)

    if batch_feats:
        feats_batch = np.array(batch_feats)
        batch_feats.clear()
        index.add(feats_batch)

    assert index.ntotal == total_clips - len(train_clips)

    write_index(index, str(args.index.with_suffix(".idx")))

    with args.index.with_suffix(".json").open("w") as fp:
        json.dump(metadata, fp)

    print("📖 Done", file=sys.stderr)
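# Hedged sketch of how an index written this way might be queried later. The file
# names below are placeholders for whatever args.index resolved to above, and the
# query vector is random.
import json
import numpy as np
from faiss import read_index

index = read_index("clips.idx")              # hypothetical path
with open("clips.json") as fp:               # hypothetical path
    metadata = json.load(fp)

index.nprobe = 16                            # how many inverted lists to visit
query = np.random.rand(1, index.d).astype(np.float32)
distances, ids = index.search(query, 5)
print([metadata[i] for i in ids[0] if i != -1])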