Exemple #1
0
def cluster_features_and_label(args: Namespace, cfg: AttrDict):
    """
    Cluster the dataset features with faiss k-means and save the results.

    The features returned by ``get_data_features_and_images(cfg)`` are L2
    normalized and clustered into ``cfg.CLUSTERFIT.NUM_CLUSTERS`` clusters.
    Every feature is then looked up in the trained ``kmeans.index`` to get
    its hard cluster label. Two files are written to the checkpoint folder:
    a ``.pkl`` holding the labels, centroids and distances, and a
    ``_lbls.npy`` holding the flattened int64 labels.

    Args:
        args: not used by this function.
        cfg: configuration; the ``CLUSTERFIT`` section is read here.

    Raises:
        AssertionError: if faiss is unavailable, if the features are not a
            2D float32 array, or if the cluster backend is not "faiss".
    """
    # faiss is an optional dependency for VISSL.
    assert is_faiss_available(), (
        "Please install faiss using conda install faiss-gpu -c pytorch "
        "if using conda or pip install faiss-gpu"
    )
    import faiss

    cluster_backend = cfg.CLUSTERFIT.CLUSTER_BACKEND
    num_clusters = cfg.CLUSTERFIT.NUM_CLUSTERS
    data_split = cfg.CLUSTERFIT.FEATURES.DATA_PARTITION
    data_name = cfg.CLUSTERFIT.FEATURES.DATASET_NAME
    n_iter = cfg.CLUSTERFIT.N_ITER
    output_dir = get_checkpoint_folder(cfg)

    ########### Step 1: Extract the features on full dataset ###################
    # The image paths returned alongside the features are not needed here.
    feature_data, _ = get_data_features_and_images(cfg)

    ########### Step 2: Get the data information ###################
    features = feature_data["features"]
    # features are of shape num_samples x feature_dim
    assert features.ndim == 2, f"Features incorrect shape: {features.shape}"
    assert features.dtype == np.float32, "Features are not float32 type"
    logging.info(f"Clustering Features: {features.shape}")

    ########### Step 3: L2 normalize features ###################
    # TODO: we could support PCA here if needed in future.
    logging.info("L2 normalizing the features now...")
    # The small epsilon avoids a division by zero for all-zero feature rows.
    feat_norm = np.linalg.norm(features, axis=1) + 1e-5
    features = features / feat_norm[:, np.newaxis]

    ########### Step 4: Cluster the features ###################
    logging.info("Clustering the features now...")
    assert cluster_backend == "faiss", "Only faiss clustering is supported currently"
    kmeans = faiss.Kmeans(features.shape[1], num_clusters, niter=n_iter, verbose=True)
    kmeans.train(features)
    centroids = kmeans.centroids

    ########### Step 5: Get the cluster assignment for the features ############
    logging.info("Getting cluster label assignment now...")
    # Searching for a single neighbor yields one (distance, label) per feature.
    distances, hard_cluster_labels = kmeans.index.search(features, 1)

    #### Step 6: Save clustering data and hard cluster labels for the images ###
    data_split = data_split.lower()
    clustering_output_dict = {
        "hard_labels": hard_cluster_labels,
        "centroids": centroids,
        "distances": distances,
    }
    output_prefix = (
        f"{output_dir}/{data_name}_{data_split}_N{num_clusters}_{cluster_backend}"
    )
    cluster_output_filepath = f"{output_prefix}.pkl"
    hard_labels_output_filepath = f"{output_prefix}_lbls.npy"
    # Convert directly in numpy instead of round-tripping through a Python
    # list, which would materialize one list object per sample.
    out_hard_labels = hard_cluster_labels.astype(np.int64).reshape(-1)
    save_file(clustering_output_dict, cluster_output_filepath)
    save_file(out_hard_labels, hard_labels_output_filepath)
    logging.info("All Done!")
Exemple #2
0
def cluster_features(cfg: AttrDict):
    """
    Fit faiss k-means on the loaded features and label the dataset splits.

    The features returned by ``get_data_features_for_k_means(cfg)`` are L2
    normalized, optionally reduced with PCA (when
    ``cfg.CLUSTERFIT.FEATURES.DIMENSIONALITY_REDUCTION`` is positive) and
    used to train a k-means model with ``cfg.CLUSTERFIT.NUM_CLUSTERS``
    clusters. The trained model (and the PCA, if any) is then handed to
    ``_create_dataset_split`` for the data partition and, when configured,
    for the test partition.

    Args:
        cfg: configuration; the ``CLUSTERFIT`` section is read here.

    Raises:
        AssertionError: if faiss is unavailable, if the features are not a
            2D float32 array, or if the cluster backend is not "faiss".
    """
    assert is_faiss_available(), (
        "Please install faiss using conda install faiss-gpu -c pytorch "
        "if using conda or pip install faiss-gpu")
    import faiss

    n_clusters = cfg.CLUSTERFIT.NUM_CLUSTERS
    backend = cfg.CLUSTERFIT.CLUSTER_BACKEND
    train_split = cfg.CLUSTERFIT.FEATURES.DATA_PARTITION

    # Load the features used to compute the centroids.
    feats = get_data_features_for_k_means(cfg)["features"]
    assert features_are_2d(feats) if False else feats.ndim == 2, (
        f"Invalid feature shape: {feats.shape}")
    assert feats.dtype == np.float32, "Features are not float32 type"
    logging.info(f"Loaded features: {feats.shape}")

    # L2 normalize each row; the epsilon guards against zero-norm rows.
    logging.info("Normalizing the features...")
    row_norms = np.linalg.norm(feats, axis=1) + 1e-5
    feats = feats / row_norms[:, None]

    # Optional dimensionality reduction.
    reduced_dim = cfg.CLUSTERFIT.FEATURES.DIMENSIONALITY_REDUCTION
    pca = None
    if reduced_dim > 0:
        pca = PCA(n_components=reduced_dim)
        feats = np.ascontiguousarray(pca.fit_transform(feats))
        out_dim = reduced_dim
    else:
        out_dim = feats.shape[1]

    # Train k-means on the (possibly reduced) features.
    logging.info(
        f"Clustering {feats.shape[0]} features in {n_clusters} clusters..."
    )
    assert backend == "faiss", "Only faiss clustering is supported currently"
    has_gpu = torch.cuda.device_count() > 0
    kmeans = faiss.Kmeans(
        feats.shape[1],
        n_clusters,
        niter=cfg.CLUSTERFIT.NUM_ITER,
        verbose=True,
        gpu=has_gpu,
    )
    kmeans.train(feats)

    # Assign clusters for the configured partition(s); the heavy lifting
    # (streaming through the features and saving) is delegated to
    # _create_dataset_split.
    _create_dataset_split(cfg, train_split, out_dim, kmeans, pca)
    test_split = cfg.CLUSTERFIT.FEATURES.TEST_PARTITION
    if test_split:
        _create_dataset_split(cfg, test_split, out_dim, kmeans, pca)
    logging.info("All Done!")
def rank_features(args: Namespace, cfg: AttrDict):
    """
    Rank images by the L2 distance to their nearest neighbor in feature space.

    The features returned by ``get_data_features_and_images(cfg)`` are
    optionally L2 normalized and projected with PCA
    (``cfg.RANKING.APPLY_PCA``), optionally L2 normalized
    (``cfg.RANKING.NORMALIZE_FEATS``), and added to a flat L2 faiss index.
    Each feature is searched against the index with k=2 and the distance of
    the second hit is kept. Image paths are then sorted by that distance in
    decreasing order and saved, with the distances, to a ``.pkl`` file in
    the checkpoint folder.

    Args:
        args: not used by this function.
        cfg: configuration; the ``RANKING`` section is read here.

    Raises:
        AssertionError: if faiss is unavailable, if the features are not a
            2D float32 array, or if the ranking backend is not "faiss".
    """
    # faiss is an optional dependency for VISSL.
    assert is_faiss_available(), (
        "Please install faiss using conda install faiss-gpu -c pytorch "
        "if using conda or pip install faiss-gpu")
    import faiss

    ranking_backend = cfg.RANKING.RANKING_BACKEND
    data_split = cfg.RANKING.FEATURES.DATA_PARTITION
    data_name = cfg.RANKING.FEATURES.DATASET_NAME
    output_dir = get_checkpoint_folder(cfg)

    ########### Step 1: Extract the features on full dataset ###################
    feature_data, image_paths = get_data_features_and_images(cfg)

    ########### Step 2: Get the data information ###################
    features = feature_data["features"]
    # features are of shape num_samples x feature_dim
    assert features.ndim == 2, f"Features incorrect shape: {features.shape}"
    assert features.dtype == np.float32, "Features are not float32 type"
    logging.info(f"Ranking Features: {features.shape}")

    ########### Step 3: Optionally apply PCA and L2 normalize features #########
    if cfg.RANKING.APPLY_PCA:
        # Normalize before the projection; the epsilon avoids dividing by
        # zero for all-zero feature rows.
        logging.info("L2 normalizing the features now...")
        feat_norm = np.linalg.norm(features, axis=1) + 1e-5
        features = features / feat_norm[:, np.newaxis]
        logging.info(f"Projecting down to {cfg.RANKING.PCA_DIM} dims ...")
        features = PCA(
            n_components=cfg.RANKING.PCA_DIM).fit_transform(features)
        logging.info(f"PCA features: {features.shape}")

    if cfg.RANKING.NORMALIZE_FEATS:
        logging.info("L2 normalizing the features now...")
        feat_norm = np.linalg.norm(features, axis=1) + 1e-5
        features = features / feat_norm[:, np.newaxis]

    # faiss works on C-contiguous float32 arrays, which the PCA output is not
    # guaranteed to be. This is a no-op when the array already qualifies.
    features = np.ascontiguousarray(features, dtype=np.float32)

    ########### Step 4: Build the L2 index on the features ###################
    logging.info(
        "Building the L2 index and searching nearest neighbor with faiss now..."
    )
    assert ranking_backend == "faiss", "Only faiss ranking is supported currently"
    if cfg.RANKING.USE_GPU:
        logging.info("Using gpu for faiss indexing...")
        # Keep an explicit reference so the GPU resources stay alive for as
        # long as the index that uses them.
        gpu_resources = faiss.StandardGpuResources()
        index = faiss.GpuIndexFlatL2(gpu_resources, features.shape[1])
    else:
        logging.info("Using CPU for faiss indexing...")
        index = faiss.IndexFlatL2(features.shape[1])
    index.add(features)
    logging.info("Doing the nearest neighbor search now...")
    # Num. neighbors here is 2: the first hit is expected to be the query
    # point itself (distance 0), the second one its nearest neighbor.
    distances, nn_indices = index.search(features, 2)
    # Keep only the distance to the second hit for every sample.
    nn_distances = list(distances[:, 1])

    ########### Step 5: Sorting the distances now ############
    logging.info("Sorting and ranking based on the L2 distance now...")
    ranked = sorted(
        zip(image_paths, nn_distances),
        key=lambda pair: pair[1],
        reverse=True,
    )
    paths = [path for path, _ in ranked]
    distances = [dist for _, dist in ranked]

    #### Step 6: Save image paths and distances... ###
    data_split = data_split.lower()
    ranking_output_dict = {
        "img_paths": paths,
        "distances": distances,
    }
    ranking_output_filepath = (
        f"{output_dir}/ranking_{data_name}_{data_split}_{ranking_backend}.pkl")
    save_file(ranking_output_dict, ranking_output_filepath)
    logging.info("All Done!")