def cluster_features_and_label(args: Namespace, cfg: AttrDict):
    # faiss is an optional dependency for VISSL.
    assert is_faiss_available(), (
        "Please install faiss using conda install faiss-gpu -c pytorch "
        "if using conda or pip install faiss-gpu"
    )
    import faiss

    cluster_backend = cfg.CLUSTERFIT.CLUSTER_BACKEND
    num_clusters = cfg.CLUSTERFIT.NUM_CLUSTERS
    data_split = cfg.CLUSTERFIT.FEATURES.DATA_PARTITION
    data_name = cfg.CLUSTERFIT.FEATURES.DATASET_NAME
    n_iter = cfg.CLUSTERFIT.N_ITER
    output_dir = get_checkpoint_folder(cfg)

    ########### Step 1: Extract the features on full dataset ###################
    feature_data, image_paths = get_data_features_and_images(cfg)

    ########### Step 2: Get the data information ###################
    features = feature_data["features"]
    # features are of shape num_samples x feature_dim
    assert features.ndim == 2, f"Features incorrect shape: {features.shape}"
    assert features.dtype == np.float32, "Features are not float32 type"
    logging.info(f"Clustering Features: {features.shape}")

    ########### Step 3: L2 normalize features ###################
    # TODO: we could support PCA here if needed in future.
    logging.info("L2 normalizing the features now...")
    feat_norm = np.linalg.norm(features, axis=1) + 1e-5
    features = features / feat_norm[:, np.newaxis]

    ########### Step 4: Cluster the features ###################
    logging.info("Clustering the features now...")
    assert cluster_backend == "faiss", "Only faiss clustering is supported currently"
    kmeans = faiss.Kmeans(features.shape[1], num_clusters, niter=n_iter, verbose=True)
    kmeans.train(features)
    centroids = kmeans.centroids

    ########### Step 5: Get the cluster assignment for the features ############
    logging.info("Getting cluster label assignment now...")
    distances, hard_cluster_labels = kmeans.index.search(features, 1)

    #### Step 6: Save clustering data and hard cluster labels for the images ###
    data_split = data_split.lower()
    clustering_output_dict = {
        "hard_labels": hard_cluster_labels,
        "centroids": centroids,
        "distances": distances,
    }
    cluster_output_filepath = (
        f"{output_dir}/{data_name}_{data_split}_N{num_clusters}_{cluster_backend}.pkl"
    )
    hard_labels_output_filepath = (
        f"{output_dir}/"
        f"{data_name}_{data_split}_N{num_clusters}_{cluster_backend}_lbls.npy"
    )
    out_hard_labels = np.array(hard_cluster_labels.tolist(), dtype=np.int64).reshape(-1)
    save_file(clustering_output_dict, cluster_output_filepath)
    save_file(out_hard_labels, hard_labels_output_filepath)
    logging.info("All Done!")
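
# Hedged usage sketch (not part of VISSL): once cluster_features_and_label() has run,
# the saved outputs can be inspected with plain numpy/pickle, assuming save_file writes
# a standard pickle for the .pkl path and a numpy array for the .npy path. The file
# paths passed in are placeholders; substitute the actual output_dir, dataset name,
# split and number of clusters from your config.
def _inspect_clustering_output(cluster_pkl_path: str, labels_npy_path: str) -> None:
    import pickle

    with open(cluster_pkl_path, "rb") as f:
        clustering = pickle.load(f)
    centroids = clustering["centroids"]  # shape: num_clusters x feature_dim
    hard_labels = np.load(labels_npy_path)  # shape: (num_samples,)
    logging.info(f"centroids: {centroids.shape}, labels: {hard_labels.shape}")
    # Per-cluster sample counts give a quick view of how balanced the clustering is.
    counts = np.bincount(hard_labels, minlength=centroids.shape[0])
    logging.info(f"largest cluster: {counts.max()}, smallest cluster: {counts.min()}")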
def cluster_features(cfg: AttrDict):
    assert is_faiss_available(), (
        "Please install faiss using conda install faiss-gpu -c pytorch "
        "if using conda or pip install faiss-gpu"
    )
    import faiss

    num_clusters = cfg.CLUSTERFIT.NUM_CLUSTERS
    cluster_backend = cfg.CLUSTERFIT.CLUSTER_BACKEND
    data_split = cfg.CLUSTERFIT.FEATURES.DATA_PARTITION

    # Step 1: get a sub-sample of the extracted features on the whole dataset
    # in order to compute the centroids
    feature_data = get_data_features_for_k_means(cfg)
    features = feature_data["features"]
    assert features.ndim == 2, f"Invalid feature shape: {features.shape}"
    assert features.dtype == np.float32, "Features are not float32 type"
    logging.info(f"Loaded features: {features.shape}")

    # Step 2: normalize the features and apply dimensionality reduction
    logging.info("Normalizing the features...")
    feat_norm = np.linalg.norm(features, axis=1) + 1e-5
    features = features / feat_norm[:, np.newaxis]
    with_dimensionality_reduction = cfg.CLUSTERFIT.FEATURES.DIMENSIONALITY_REDUCTION > 0
    if with_dimensionality_reduction:
        pca = PCA(n_components=cfg.CLUSTERFIT.FEATURES.DIMENSIONALITY_REDUCTION)
        features = pca.fit_transform(features)
        features = np.ascontiguousarray(features)
        features_dim = cfg.CLUSTERFIT.FEATURES.DIMENSIONALITY_REDUCTION
    else:
        pca = None
        features_dim = features.shape[1]

    # Step 3: compute the centroids for the sub-sampled features
    logging.info(
        f"Clustering {features.shape[0]} features in {num_clusters} clusters..."
    )
    assert cluster_backend == "faiss", "Only faiss clustering is supported currently"
    use_gpu = torch.cuda.device_count() > 0
    num_iter = cfg.CLUSTERFIT.NUM_ITER
    kmeans = faiss.Kmeans(
        features.shape[1], num_clusters, niter=num_iter, verbose=True, gpu=use_gpu
    )
    kmeans.train(features)

    # Step 4: compute the cluster assignment for each of the features of the dataset
    # by streaming through the features (to avoid OOM) and save clustering data
    # and hard cluster labels for the images
    _create_dataset_split(cfg, data_split, features_dim, kmeans, pca)
    if cfg.CLUSTERFIT.FEATURES.TEST_PARTITION:
        test_split = cfg.CLUSTERFIT.FEATURES.TEST_PARTITION
        _create_dataset_split(cfg, test_split, features_dim, kmeans, pca)
    logging.info("All Done!")
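
# Hedged illustration (an assumption, not the actual _create_dataset_split helper):
# the idea behind Step 4 is to assign cluster labels in batches so the full feature
# matrix never has to be searched in one shot. A minimal sketch, assuming `features`
# is an L2-normalized float32 numpy array and `kmeans` is the trained faiss.Kmeans
# object from above; `pca` is the fitted sklearn PCA or None.
def _assign_clusters_in_batches(kmeans, features, pca=None, batch_size: int = 4096):
    labels = []
    for start in range(0, features.shape[0], batch_size):
        batch = features[start : start + batch_size]
        if pca is not None:
            # Apply the same dimensionality reduction that was fit on the sub-sample;
            # faiss expects contiguous float32 input.
            batch = np.ascontiguousarray(pca.transform(batch), dtype=np.float32)
        # kmeans.index.search returns (distances, nearest centroid ids) per row.
        _, batch_labels = kmeans.index.search(batch, 1)
        labels.append(batch_labels.reshape(-1))
    return np.concatenate(labels)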
def rank_features(args: Namespace, cfg: AttrDict):
    # faiss is an optional dependency for VISSL.
    assert is_faiss_available(), (
        "Please install faiss using conda install faiss-gpu -c pytorch "
        "if using conda or pip install faiss-gpu"
    )
    import faiss

    ranking_backend = cfg.RANKING.RANKING_BACKEND
    data_split = cfg.RANKING.FEATURES.DATA_PARTITION
    data_name = cfg.RANKING.FEATURES.DATASET_NAME
    output_dir = get_checkpoint_folder(cfg)

    ########### Step 1: Extract the features on full dataset ###################
    feature_data, image_paths = get_data_features_and_images(cfg)

    ########### Step 2: Get the data information ###################
    features = feature_data["features"]
    # features are of shape num_samples x feature_dim
    assert features.ndim == 2, f"Features incorrect shape: {features.shape}"
    assert features.dtype == np.float32, "Features are not float32 type"
    logging.info(f"Ranking Features: {features.shape}")

    ########### Step 3: Optionally L2 normalize features ###################
    if cfg.RANKING.APPLY_PCA:
        logging.info("L2 normalizing the features now...")
        feat_norm = np.linalg.norm(features, axis=1) + 1e-5
        features = features / feat_norm[:, np.newaxis]
        logging.info(f"Projecting down to {cfg.RANKING.PCA_DIM} dims ...")
        features = PCA(n_components=cfg.RANKING.PCA_DIM).fit_transform(features)
        logging.info(f"PCA features: {features.shape}")
    if cfg.RANKING.NORMALIZE_FEATS:
        logging.info("L2 normalizing the features now...")
        feat_norm = np.linalg.norm(features, axis=1) + 1e-5
        features = features / feat_norm[:, np.newaxis]

    ########### Step 4: Build the L2 index on the features ###################
    logging.info(
        "Building the L2 index and searching nearest neighbor with faiss now..."
    )
    assert ranking_backend == "faiss", "Only faiss ranking is supported currently"
    if cfg.RANKING.USE_GPU:
        logging.info("Using gpu for faiss indexing...")
        index = faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), features.shape[1])
    else:
        logging.info("Using CPU for faiss indexing...")
        index = faiss.IndexFlatL2(features.shape[1])
    index.add(features)
    logging.info("Doing the nearest neighbor search now...")
    # Num. neighbors here is 2, so for a given point we find that same point at
    # distance 0, and its nearest neighbor
    distances, nn_indices = index.search(features, 2)
    # Remove distance to self, which is always 0
    distances = [d[1] for d in distances]

    ########### Step 5: Sorting the distances now ############
    logging.info("Sorting and ranking based on the L2 distance now...")
    img_paths_and_distances = zip(image_paths, distances)
    img_paths_and_distances = sorted(
        img_paths_and_distances, key=lambda x: x[1], reverse=True
    )
    paths, distances = (
        [x[0] for x in img_paths_and_distances],
        [x[1] for x in img_paths_and_distances],
    )

    #### Step 6: Save image paths and distances... ###
    data_split = data_split.lower()
    ranking_output_dict = {
        "img_paths": paths,
        "distances": distances,
    }
    ranking_output_filepath = (
        f"{output_dir}/ranking_{data_name}_{data_split}_{ranking_backend}.pkl"
    )
    save_file(ranking_output_dict, ranking_output_filepath)
    logging.info("All Done!")
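
# Hedged usage sketch (illustrative only): the ranking .pkl written above pairs each
# image path with the L2 distance to its nearest neighbor, sorted most-isolated first
# (largest distance first). Assuming save_file produces a standard pickle for .pkl
# paths, the top-k entries can be read back like this; the path argument is a
# placeholder for the actual ranking_output_filepath.
def _print_most_isolated_images(ranking_pkl_path: str, top_k: int = 10) -> None:
    import pickle

    with open(ranking_pkl_path, "rb") as f:
        ranking = pickle.load(f)
    for path, dist in list(zip(ranking["img_paths"], ranking["distances"]))[:top_k]:
        logging.info(f"{dist:.4f}  {path}")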