Example #1
def get_nearest_hits(centers: np.ndarray, index: faiss.IndexFlatL2,
                     k_neighbours: int) -> np.ndarray:
    """Returns the k nearest neighbours
    from the specified index for each center."""

    d, ids = index.search(centers, k_neighbours)
    all_hits = np.array([index.reconstruct(int(i)) for i in ids.flatten()])
    # Maybe we need to use the array of hits on the station and build the index right here

    return all_hits
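A minimal usage sketch for the example above (illustrative only: the hit coordinates, sizes and k value are made up; it assumes faiss and numpy are imported as in the snippet):

# Usage sketch: build a flat L2 index over station hits, then query it.
import numpy as np
import faiss

hits = np.random.rand(1000, 3).astype('float32')     # hypothetical 3-D hit coordinates
centers = np.random.rand(16, 3).astype('float32')    # hypothetical query centers

index = faiss.IndexFlatL2(hits.shape[1])
index.add(hits)

nearest = get_nearest_hits(centers, index, k_neighbours=5)
# nearest has shape (16 * 5, 3): k reconstructed hits per center, flattened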
Example #2
def rips_graph(point_cloud, k):
    point_cloud = point_cloud.astype('float32')
    _, dim = point_cloud.shape
    cpuindex = IndexFlatL2(dim)
    cpuindex.add(point_cloud)

    return cpuindex.search(point_cloud, k)
Example #3
def compute_rips_np(pc: np.ndarray, k: int) -> tuple[np.ndarray, np.ndarray]:
    """"Compute the delta-Rips Graph."""
    pc = pc.astype(np.float32)
    cpuindex = IndexFlatL2(pc.shape[1])
    cpuindex.add(pc)

    return cpuindex.search(pc, k)
Example #4
    def most_representive_patches(self, anchors, odata):
        res_anchor=[]
        features = []
        for i, o in enumerate(odata):
            rlbp = self.rLBP.describe(o[0])
            glbp = self.gLBP.describe(o[1])
            blbp = self.bLBP.describe(o[2])
            acs = anchors[i]
            acs = [self.rate*ac for ac in acs]
            f = []
            for ac in acs:
                r = rlbp[ac[0]:ac[2], ac[1]:ac[3]]
                vr = self.rLBP.gethist(r)
                g = glbp[ac[0]:ac[2], ac[1]:ac[3]]
                vg = self.gLBP.gethist(g)
                b = blbp[ac[0]:ac[2], ac[1]:ac[3]]
                vb = self.bLBP.gethist(b)
                v = np.concatenate([vr,vg,vb],0)
                f.append(v)
            f = np.array(f).astype('float32')

            kmeans = Kmeans(f.shape[1], 10)
            kmeans.gpu = True   # set on the instance, not the class
            kmeans.train(f)
            index = IndexFlatL2(f.shape[1])
            index.add(f)
            D, I = index.search(kmeans.centroids, 1)
            I = np.squeeze(I, 1)
            fis = [f[i] for i in I]
            boxes = [acs[i] for i in I]
            res_anchor.append(boxes)
            features.append(fis)
        return res_anchor, features
Example #5
def rips_graph(point_cloud, delta):
    point_cloud = point_cloud.astype('float32')
    _, dim = point_cloud.shape
    index = IndexFlatL2(dim)
    index.add(point_cloud)

    return index.range_search(point_cloud, delta)
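A note on the return format, with a small illustrative sketch: range_search returns a triple (lims, D, I), where the neighbours of query i are I[lims[i]:lims[i+1]]; for IndexFlatL2 the radius delta is compared against squared L2 distances.

# Usage sketch (illustrative sizes): unpack the range_search result per point.
import numpy as np

points = np.random.rand(200, 2).astype('float32')    # hypothetical point cloud
lims, dists, ids = rips_graph(points, delta=0.05)     # delta acts on squared distances
neighbours_of_point_0 = ids[lims[0]:lims[1]]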
Example #6
def compute_density_map_np(x: np.ndarray, k: int,
                           scale: float) -> tuple[np.ndarray, np.ndarray]:
    """Compute the k-nearest neighbours kernel density estimate."""
    x = x.astype(np.float32)
    index = IndexFlatL2(x.shape[1])
    index.add(x)
    values, indexes = index.search(x, k)
    result = np.sum(np.exp(-values / scale), axis=1) / (k * scale)
    return result / max(result), indexes
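A short, illustrative usage sketch for the density estimate above (the data, k and scale values are made up):

# Usage sketch: normalized k-NN kernel density estimate on random 2-D points.
import numpy as np

x = np.random.randn(500, 2)
density, neighbour_ids = compute_density_map_np(x, k=15, scale=0.5)
# density lies in (0, 1] after division by its maximum; neighbour_ids has shape (500, 15)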
Example #7
    def __init__(self, matrix=None, path_csv='', n_neighbors=5,
                 is_faiss=True, metric='minkowski', p=2, algorithm='auto', leaf_size = 30):
        """
        Parameters
        ----------
        matrix : np.ndarray, default None
                Matrix of elements (n x d)
        path_csv : string, default ''
                If matrix is None, the matrix is read from this path
        n_neighbors : int, optional (default = 5)
                Number of neighbors to use by default
        is_faiss : bool, optional (default = True)
                If True, use FAISS for the neighbor search; otherwise use
                sklearn's NearestNeighbors

        metric : string or callable, default 'minkowski'
                metric to use for distance computation. Any metric from scikit-learn
                or scipy.spatial.distance can be used.

                If metric is a callable function, it is called on each
                pair of instances (rows) and the resulting value recorded. The callable
                should take two arrays as input and return one value indicating the
                distance between them. This works for Scipy's metrics, but is less
                efficient than passing the metric name as a string.
        p : integer, optional (default = 2)
            Parameter for the Minkowski metric from
            sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
            equivalent to using manhattan_distance (l1), and euclidean_distance
            (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
        algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional (default = 'auto')
            Algorithm used by sklearn to compute the nearest neighbors
            (only used when is_faiss is False).

            Note: fitting on sparse input will override the setting of
            this parameter, using brute force.
        leaf_size : int, optional (default = 30)
            Leaf size passed to BallTree or KDTree.
        """
        
        self.n_neighbors = n_neighbors
        
        if path_csv != '':
            matrix = pd.read_csv(path_csv).to_numpy(dtype='float32')

        elif matrix is None:
            raise ValueError('Neither a matrix nor a path was provided')
            
        if is_faiss:
            from faiss import IndexFlatL2
            index = IndexFlatL2(matrix.shape[1])
            index.add(matrix)
            self.nearest_distances, self.nearest_indices = index.search(matrix, n_neighbors + 1)
            self.nearest_distances, self.nearest_indices = self.nearest_distances[:, 1:], self.nearest_indices[:, 1:]
        else:
            self.nearest_distances, self.nearest_indices = \
                     NearestNeighbors(n_neighbors=n_neighbors+1, algorithm=algorithm,
                                      metric=metric, leaf_size=leaf_size,
                                      p=p, n_jobs=-1).fit(matrix).kneighbors(matrix)
            self.nearest_distances = self.nearest_distances[:, 1:] ** 2
            self.nearest_indices = self.nearest_indices[:, 1:]
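The two branches are meant to be comparable: FAISS returns squared L2 distances, which is why the sklearn branch squares its Euclidean distances, and both drop the first column because each point is its own nearest neighbour. A standalone sketch of the FAISS branch, assuming a contiguous float32 matrix (sizes are illustrative):

# Standalone sketch of the FAISS branch (illustrative sizes).
import numpy as np
from faiss import IndexFlatL2

matrix = np.random.rand(100, 8).astype('float32')
n_neighbors = 5

index = IndexFlatL2(matrix.shape[1])
index.add(matrix)
dists, ids = index.search(matrix, n_neighbors + 1)    # squared L2 distances
dists, ids = dists[:, 1:], ids[:, 1:]                 # drop self-matches (distance 0)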
Example #8
def approximate(pc, r, tau):

    pc = pc.astype('float32')
    index = IndexFlatL2(pc.shape[1])
    index.add(pc)
    D, I = index.search(pc, r)  # here r is the number of neighbours
    result = np.sum(np.exp(-D / tau), axis=1) / (r * tau)
    return result / max(result), I
Example #9
def main(args):
    # https://github.com/facebookresearch/faiss/blob/a8118acbc516b0263dde610862c806400cc48bf5/Clustering.cpp#L78-L80
    if args.num_train < max(kNumCells, kNumCentroids):
        sys.exit("Error: require at least {} training samples".format(
            max(kNumCells, kNumCentroids)))

    dataset = FeatureDirectory(root=args.features)
    train_dataset, index_dataset = random_split(
        dataset,
        [args.num_train, len(dataset) - args.num_train])

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers)
    index_loader = DataLoader(index_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers)

    N, C = len(train_dataset), 2048

    train_features = np.empty(shape=(N, C), dtype=np.float32)

    for i, (features, paths) in enumerate(
            tqdm(train_loader, desc="Train", unit="batch", ascii=True)):

        n = features.shape[0]  # the last batch may be smaller than batch_size
        train_features[i * args.batch_size:i * args.batch_size + n] = features

    quantizer = IndexFlatL2(C)

    index = IndexIVFPQ(quantizer, C, kNumCells, kNumCentroids, kNumBitsPerIdx)
    index.do_polysemous_training = True

    print("Training index on training features", file=sys.stderr)
    index.train(train_features)

    metadata = []

    for features, paths in tqdm(index_loader,
                                desc="Index",
                                unit="batch",
                                ascii=True):

        features = np.ascontiguousarray(features)
        index.add(features)

        for path in paths:
            fname = Path(path).name
            metadata.append(fname)

    print('Saving index file and metadata')
    IndexIO.save(args.index.with_suffix(".idx"), index)
    JsonIO.save(args.index.with_suffix(".json"), metadata)
Example #10
def predict(index: faiss.IndexFlatL2, embeddings: np.ndarray,
            y_train: np.ndarray, k_neighbors: int) -> np.ndarray:

    # Searching using nearest neighbors in reference set
    _, indices = index.search(
        embeddings, k_neighbors)  # indices: shape [N_embeddings x k_neighbors]
    # Get labels of found indices
    y_pred: np.ndarray = y_train[
        indices]  # shape: [N_embeddings x k_neighbors]
    # Labels that appears most are predicted labels
    # https://en.wikipedia.org/wiki/Mode_(statistics)
    y_pred = mode(y_pred, axis=1)[0].ravel()  # shape: [N_embedding]
    return y_pred
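A usage sketch for predict (illustrative: the embeddings and labels are synthetic, and mode is presumably scipy.stats.mode in the original module):

# Usage sketch: majority-vote k-NN classification over a reference set.
import numpy as np
import faiss

X_train = np.random.rand(1000, 128).astype('float32')   # hypothetical reference embeddings
y_train = np.random.randint(0, 10, size=1000)            # hypothetical labels
X_test = np.random.rand(50, 128).astype('float32')

index = faiss.IndexFlatL2(X_train.shape[1])
index.add(X_train)

y_pred = predict(index, X_test, y_train, k_neighbors=5)  # shape (50,)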
Example #11
def query(
    index: faiss.IndexFlatL2,
    query_embedding: np.ndarray,
    k_queries: int,
    ref_image_paths: List[str],
) -> Tuple[List[str], List[int], List[float]]:

    # Searching using nearest neighbors in reference set
    # indices: shape [N_embeddings x k_queries]
    # distances: shape [N_embeddings x k_queries]
    distances, indices = index.search(query_embedding, k_queries)
    indices: List[int] = indices.ravel().tolist()
    distances: List[float] = distances.ravel().tolist()
    image_paths: List[str] = [ref_image_paths[i] for i in indices]
    return image_paths, indices, distances
Example #12
def knn(queries: torch.Tensor, index: faiss.IndexFlatL2,
        keys_store: np.ndarray, vals_store: np.ndarray, n_tokens: int):
    """
    ## $k$-NN to get $p(w_t, c_t)$

    Here we refer to $f(\textcolor{yellowgreen}{c_t})$ as queries,
    $f(c_i)$ as keys and $w_i$ as values.
    """

    # Save shape of queries to reshape results
    queries_shape = queries.shape

    # Flatten the `batch` and `sequence` dimensions of queries
    queries = queries.view(-1, queries_shape[-1])

    # Find 10 nearest neighbors of $f(\textcolor{yellowgreen}{c_t})$ among $f(c_i)$.
    # `distance` is the distance given by FAISS and `idx`, $i$ is the index of it in `keys_store`.
    distance, idx = index.search(queries.numpy(), 10)

    # Get $f(c_i)$
    keys_found = queries.new_tensor(keys_store[idx])
    # Get $w_i$
    vals_found = torch.tensor(vals_store[idx]).squeeze(-1)

    # We are going to calculate the cosine similarity between normalized vectors

    # Normalize $f(c_i)$
    keys_found_n = keys_found / torch.sqrt(
        (keys_found**2).sum(-1, keepdims=True) + 1e-10)
    # Normalize $f(\textcolor{yellowgreen}{c_t})$
    queries_n = queries / torch.sqrt((queries**2).sum(-1, keepdims=True) +
                                     1e-10)

    # Get the dot-product, or cosine similarity
    dot_prod = (keys_found_n * queries_n.unsqueeze(1)).sum(-1)

    # Token-wise logits
    logits_token = dot_prod.new_zeros(queries.shape[0], n_tokens)
    # Scatter and accumulate token logits based on the nearest neighbors
    _ = logits_token.scatter_(dim=1,
                              index=vals_found,
                              src=dot_prod,
                              reduce='add')

    # Reshape the logits
    logits_token = logits_token.reshape(queries_shape[0], queries_shape[1], -1)

    return logits_token
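The function above assumes a pre-built datastore: keys_store holds the key vectors f(c_i) in the same order they were added to the FAISS index, and vals_store holds the corresponding token ids w_i. A sketch of that setup with synthetic data (all names and sizes are illustrative):

# Sketch of the datastore assumed by knn() (synthetic data, illustrative sizes).
import numpy as np
import torch
import faiss

d_model, n_tokens, store_size = 256, 10000, 50000

keys_store = np.random.rand(store_size, d_model).astype('float32')   # f(c_i)
vals_store = np.random.randint(0, n_tokens, size=(store_size, 1))    # w_i

index = faiss.IndexFlatL2(d_model)
index.add(keys_store)

queries = torch.rand(4, 32, d_model)                 # [batch, sequence, d_model]
logits = knn(queries, index, keys_store, vals_store, n_tokens)
# logits has shape [4, 32, 10000]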
Example #13
def calculate_precision_at_k(index: faiss.IndexFlatL2, X_test: np.ndarray,
                             y_test: np.ndarray, y_train: np.ndarray,
                             k: int) -> float:
    # Searching using nearest neighbors in train set
    _, indices = index.search(X_test,
                              k)  # indices: shape [N_embeddings x k_neighbors]

    y_pred = []
    for i in range(k):
        indices_at_k: np.ndarray = indices[:, i]  # [M]
        y_pred_at_k: np.ndarray = y_train[indices_at_k][:, None]  # [M x 1]
        y_pred.append(y_pred_at_k)

    y_pred: np.ndarray = np.hstack(y_pred)  # [M x k]
    y_test = np.hstack((y_test[:, None], ) * k)  # [M x k]

    precision_at_k: float = (
        (y_pred == y_test).sum(axis=1) / k).mean().item() * 100
    return precision_at_k
Example #14
def calculate_topk_accuracy(index: faiss.IndexFlatL2, X_test: np.ndarray,
                            y_test: np.ndarray, y_train: np.ndarray,
                            top_k: int) -> float:

    # Search using nearest neighbors in train set
    _, indices = index.search(
        X_test, top_k)  # indices: shape [N_embeddings x k_neighbors]

    y_pred = []
    for i in range(top_k):
        indices_at_k: np.ndarray = indices[:, i]  # [M]
        y_pred_at_k: np.ndarray = y_train[indices_at_k][:, None]  # [M x 1]
        y_pred.append(y_pred_at_k)

    y_pred: np.ndarray = np.hstack(y_pred)  # [M x k]
    y_test = np.hstack((y_test[:, None], ) * top_k)  # [M x k]

    n_predictions: int = y_pred.shape[0]
    n_true_predictions: int = ((y_pred == y_test).sum(axis=1) > 0).sum().item()
    topk_accuracy: float = n_true_predictions / n_predictions * 100
    return topk_accuracy
Example #15
def approximate(pc, r):
    """
    Let f be the density function of the probability distribution that
    generated the point cloud; returns f'(pc) for an approximation f' of f.

    Parameters
    ----------
    pc : ndarray of float32
        Point cloud
    r : float
        Search radius passed to range_search

    Returns
    -------
    approximated_values : ndarray of float32
        Value of the approximating function at each point of the point cloud
    """
    pc = pc.astype('float32')
    size = len(pc)
    index = IndexFlatL2(len(pc[0]))
    index.add(pc)
    lim = index.range_search(pc, r)[0]
    return array([lim[i + 1] - lim[i] for i in range(size)]) / (size * 2 * r)
Example #16
def initalize_prodige(X, knn_edges=64, random_edges=32, verbose=False, **kwargs):
    # Creates a graph embedding from an object-feature matrix,
    # initializing edge weights with Euclidean distances |x_i - x_j|_2

    # 2 types of edges:
    #   knn edges - connect vertices to their nearest neighbors
    #   random edges - connect random pairs of vertices to get the small-world property

    # X: matrix of samples
    # See Section 3.3: Scalability
    # knn_edges: number of nearest-neighbour edges per vertex, found via FAISS or sklearn
    # random_edges: number of random edges per vertex for the small-world property
    # kwargs: other args passed to GraphEmbedding()
    # returns: initialized GraphEmbedding

    num_vectors, vector_dim = X.shape
    X = np.require(X, dtype=np.float32, requirements=['C_CONTIGUOUS'])

    if verbose:
        print("Searching for nearest neighbors")
    try:
        from faiss import IndexFlatL2
        index = IndexFlatL2(vector_dim)
        index.add(X)
        neighbor_distances, neighbor_indices = index.search(X, knn_edges + 1)
    except ImportError:
        print("faiss not found, using slow knn instead")
        neighbor_distances, neighbor_indices = NearestNeighbors(n_neighbors=knn_edges + 1).fit(X).kneighbors(X)

    if verbose:
        print("Adding knn edges")
    edges_from, edges_to, distances = [], [], []
    for vertex_i in np.arange(num_vectors):
        for neighbor_i, distance in zip(neighbor_indices[vertex_i], neighbor_distances[vertex_i]):
            if vertex_i == neighbor_i: continue  # prevent loops
            if neighbor_i == -1: continue  # padding
            distance **= 0.5
            edges_from.append(vertex_i)
            edges_to.append(neighbor_i)
            distances.append(distance)

    if random_edges != 0:
        if verbose: print("Adding random edges")
        random_from = np.random.randint(0, num_vectors, num_vectors * random_edges)
        random_to = np.random.randint(0, num_vectors, num_vectors * random_edges)
        for vertex_i, neighbor_i in zip(random_from, random_to):
            if vertex_i != neighbor_i:
                distance = np.sum((X[vertex_i] - X[neighbor_i]) ** 2) ** 0.5
                edges_from.append(vertex_i)
                edges_to.append(neighbor_i)
                distances.append(distance)

    if verbose: print("Deduplicating edges")
    # remove duplicate edges and add them again at random
    unique_edges_dict = {}
    for from_i, to_i, distance in zip(edges_from, edges_to, distances):
        edge_iijj = int(from_i), int(to_i)
        edge_iijj = tuple(sorted(edge_iijj))
        unique_edges_dict[edge_iijj] = distance
    edges_iijj, distances = zip(*unique_edges_dict.items())
    edges_from, edges_to = zip(*edges_iijj)

    edges_from, edges_to, distances = map(np.asanyarray, [edges_from, edges_to, distances])
    if verbose:
        print("Total edges: {}, mean edges per vertex: {}, mean distance: {}".format(
            len(edges_from), len(edges_from) / float(num_vectors), np.mean(distances)
        ))
    return GraphEmbedding(edges_from, edges_to, weights=distances, **kwargs)
Example #17
def main(args):
    # https://github.com/facebookresearch/faiss/blob/a8118acbc516b0263dde610862c806400cc48bf5/Clustering.cpp#L78-L80
    if args.num_train < max(kNumCells, kNumCentroids):
        sys.exit("Error: require at least {} training samples".format(
            max(kNumCells, kNumCentroids)))

    extract = FeatureExtractor(image_size=args.image_size)

    dataset = ImageDirectory(root=args.frames, transform=extract.transform)
    train_dataset, index_dataset = random_split(
        dataset,
        [args.num_train, len(dataset) - args.num_train])

    if len(train_dataset) > len(index_dataset) or len(
            train_dataset) > 0.25 * len(index_dataset):
        sys.exit("Error: training dataset too big: train={}, index={}".format(
            len(train_dataset), len(index_dataset)))

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers)
    index_loader = DataLoader(index_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers)

    N, C = len(train_dataset), 2048

    train_features = np.empty(shape=(N, C), dtype=np.float32)

    for i, (images, paths) in enumerate(
            tqdm(train_loader, desc="Train", unit="batch", ascii=True)):
        n, h, w = images.size(0), images.size(2), images.size(3)

        features = extract(images)
        features = features.data.cpu().numpy()

        # the ResNet backbone downsamples by 2 five times
        h, w = h // 32, w // 32

        # MAC feature
        features = reduce(features,
                          "n (h w) c -> n c",
                          "max",
                          n=n,
                          h=h,
                          w=w,
                          c=C)

        train_features[i * args.batch_size:i * args.batch_size + n] = features

    quantizer = IndexFlatL2(C)

    index = IndexIVFPQ(quantizer, C, kNumCells, kNumCentroids, kNumBitsPerIdx)
    index.do_polysemous_training = True

    print("Training index on training features", file=sys.stderr)
    index.train(train_features)

    metadata = []

    for images, paths in tqdm(index_loader,
                              desc="Index",
                              unit="batch",
                              ascii=True):
        n, h, w = images.size(0), images.size(2), images.size(3)

        # the ResNet backbone downsamples by 2 five times
        h, w = h // 32, w // 32

        # MAC feature descriptor
        features = extract(images)
        features = reduce(features,
                          "n (h w) c -> n c",
                          "max",
                          n=n,
                          h=h,
                          w=w,
                          c=C)
        features = features.data.cpu().numpy()

        # C-array required for faiss FFI: tensors might not be contiguous
        features = np.ascontiguousarray(features)

        # Add a batch of (n, 2048) MAC-pooled features to the index at once
        index.add(features)

        for path in paths:
            fname = Path(path).name
            metadata.append(fname)

    IndexIO.save(args.index.with_suffix(".idx"), index)
    JsonIO.save(args.index.with_suffix(".json"), metadata)
Example #18
def make_graph_from_vectors(X,
                            *,
                            knn_edges,
                            random_edges=0,
                            virtual_vertices=0,
                            deduplicate=True,
                            directed=True,
                            verbose=False,
                            squared=False,
                            GraphEmbeddingClass=GraphEmbedding,
                            **kwargs):
    """
    Creates graph embedding from an object-feature matrix,
    initialize weights with squared euclidian distances |x_i - x_j|^2_2

    The graph consists of three types of edges:
        * knn edges - connecting vertices to their nearest neighbors
        * random edges - connecting random pairs of vertices to get smallworld property
        * edges to virtual_vertices - adds synthetic vertices to task and connect with all other vertices
                                     (init with k-means)

    :param X: task matrix[num_vertors, vector_dim]
    :param knn_edges: connects vertex to this many nearest neighbors
    :param random_edges: adds this many random edges per vertex (long edges for smallworld property)
    :param virtual_vertices: adds this many new vertices connected to all points, initialized as centroids
    :param deduplicate: if enabled(default), removes all duplicate edges
        (e.g. if the edge was first added via :m:, and then added again via :random_rate:
    :param directed: if enabled, treats (i, j) and (j, i) as the same edge
    :param verbose: if enabled, prints progress into stdout
    :param squared: if True, uses squared euclidian distance, otherwise normal euclidian distance
    :param kwargs: other keyword args sent to :GraphEmbedding.__init__:
    :rtype: GraphEmbedding
    """
    num_vectors, vector_dim = X.shape
    X = np.require(check_numpy(X),
                   dtype=np.float32,
                   requirements=['C_CONTIGUOUS'])
    if virtual_vertices != 0:
        if verbose: print("Creating virtual vertices by k-means")
        X_clusters = KMeans(virtual_vertices).fit(X).cluster_centers_
        X = np.concatenate([X, X_clusters])

    if verbose:
        print("Searching for nearest neighbors")
    try:
        from faiss import IndexFlatL2
        index = IndexFlatL2(vector_dim)
        index.add(X)
        neighbor_distances, neighbor_indices = index.search(X, knn_edges + 1)
    except ImportError:
        warn("faiss not found, using slow knn instead")
        neighbor_distances, neighbor_indices = NearestNeighbors(
            n_neighbors=knn_edges + 1).fit(X).kneighbors(X)

    if not squared:
        neighbor_distances **= 0.5
    if verbose:
        print("Adding knn edges")
    edges_from, edges_to, distances = [], [], []
    for vertex_i in np.arange(num_vectors):
        for neighbor_i, distance in zip(neighbor_indices[vertex_i],
                                        neighbor_distances[vertex_i]):
            if vertex_i == neighbor_i: continue  # forbid loops
            if neighbor_i == -1: continue  # ANN engine uses -1 for padding
            edges_from.append(vertex_i)
            edges_to.append(neighbor_i)
            distances.append(distance)

    if random_edges != 0:
        if verbose: print("Adding random edges")
        random_from = np.random.randint(0, num_vectors,
                                        num_vectors * random_edges)
        random_to = np.random.randint(0, num_vectors,
                                      num_vectors * random_edges)
        for vertex_i, neighbor_i in zip(random_from, random_to):
            if vertex_i != neighbor_i:
                distance = np.sum((X[vertex_i] - X[neighbor_i])**2)
                if not squared: distance **= 0.5
                edges_from.append(vertex_i)
                edges_to.append(neighbor_i)
                distances.append(distance)

    if deduplicate:
        if verbose: print("Deduplicating edges")
        unique_edges_dict = {}  # {(from_i, to_i) : distance(i, j)}
        for from_i, to_i, distance in zip(edges_from, edges_to, distances):
            edge_iijj = int(from_i), int(to_i)
            if not directed:
                edge_iijj = tuple(sorted(edge_iijj))
            unique_edges_dict[edge_iijj] = distance

        edges_iijj, distances = zip(*unique_edges_dict.items())
        edges_from, edges_to = zip(*edges_iijj)

    edges_from, edges_to, distances = map(np.asanyarray,
                                          [edges_from, edges_to, distances])
    if verbose:
        print("Total edges: {}, mean edges per vertex: {}, mean distance: {}".
              format(len(edges_from),
                     len(edges_from) / float(num_vectors), np.mean(distances)))
    return GraphEmbeddingClass(edges_from,
                               edges_to,
                               initial_weights=distances,
                               directed=directed,
                               **kwargs)
Example #19
def main(args):
    # https://github.com/facebookresearch/faiss/blob/a8118acbc516b0263dde610862c806400cc48bf5/Clustering.cpp#L78-L80
    if args.num_train < max(args.num_centroids, args.code_size):
        sys.exit("💥 Require at least {} training samples".format(
            max(args.num_centroids, args.code_size)))

    paths = [path for path in args.features.iterdir() if path.is_file()]

    print(
        "� Loading clip features from {} video feature files at {}".format(
            len(paths), args.features),
        file=sys.stderr)

    # Two passes over videos of arbitrary number of clips:
    # - First pass reservoir samples clip features, trains index
    # - Second pass adds remaining clip features to index
    #
    # This way we can properly randomly select train samples and
    # at the same time keep our peak memory consumption reasonable.

    train_samples = StreamSampler(args.num_train)

    # 1st pass

    total_clips = 0

    for i, path in enumerate(tqdm(paths)):
        feats = np.load(path, allow_pickle=False)

        assert len(feats.shape) == 2
        assert feats.shape[1] == args.dimension
        assert feats.dtype == np.float32

        for j, feat in enumerate(feats):
            # Keep train and index datasets disjoint
            # Track train clips: ith video, jth clip
            train_samples.add((feat, (i, j)))
            total_clips += 1

    if len(train_samples) < args.num_train:
        sys.exit(
            "💥 Not enough samples in dataset to train on; loaded {}".format(
                len(train_samples)))

    train_feats = [k for k, _ in train_samples]
    train_clips = {v for _, v in train_samples}

    train_feats = np.array(train_feats)
    assert train_feats.shape == (args.num_train, args.dimension)
    assert train_feats.dtype == np.float32

    quantizer = IndexFlatL2(args.dimension)

    index = IndexIVFPQ(quantizer, args.dimension, args.num_centroids,
                       args.code_size, args.num_bits)

    print(
        "🚄 Training index on {} out of {} total {}-dimensional clip features"
        .format(args.num_train, total_clips, args.dimension),
        file=sys.stderr)

    index.train(train_feats)

    del train_feats
    del train_samples

    # 2nd pass

    assert index.is_trained

    print(
        "🔖 Adding to index {} out of {} total {}-dimensional clip features".
        format(total_clips - len(train_clips), total_clips, args.dimension),
        file=sys.stderr)

    metadata = []
    batch_feats = []

    for i, path in enumerate(tqdm(paths)):
        feats = np.load(path, allow_pickle=False)

        assert len(feats.shape) == 2
        assert feats.shape[1] == args.dimension
        assert feats.dtype == np.float32

        for j, feat in enumerate(feats):
            if (i, j) in train_clips:
                continue

            batch_feats.append(feat)

            # Could be more efficient than one entry per clip.
            # This way it's simple to use in the client for now.
            metadata.append({"path": str(path), "clip": j})

            if len(batch_feats) % args.batch_size == 0:
                feats = np.array(batch_feats)
                batch_feats.clear()
                index.add(feats)

    if batch_feats:
        feats = np.array(batch_feats)
        batch_feats.clear()
        index.add(feats)

    assert index.ntotal == total_clips - len(train_clips)

    write_index(index, str(args.index.with_suffix(".idx")))

    with args.index.with_suffix(".json").open("w") as fp:
        json.dump(metadata, fp)

    print("📖 Done", file=sys.stderr)