def test_progressive_dim(self):
    d = 32
    n = 10000
    k = 50
    xt, _, _ = get_dataset_2(d, n, 0, 0)

    # basic kmeans
    kmeans = faiss.Kmeans(d, k)
    kmeans.train(xt)

    clus = faiss.ProgressiveDimClustering(d, k)
    # bare accesses check that the wrapper exposes these properties
    clus.verbose
    clus.verbose = True
    clus.progressive_dim_steps
    clus.progressive_dim_steps = 5
    fac = faiss.ProgressiveDimIndexFactory()
    clus.train(n, faiss.swig_ptr(xt), fac)

    stats = clus.iteration_stats
    stats = [stats.at(i) for i in range(stats.size())]
    obj = np.array([st.obj for st in stats])
    # clustering objective should be a tad better
    self.assertLess(obj[-1], kmeans.obj[-1])

    # same test w/ Kmeans wrapper
    kmeans2 = faiss.Kmeans(d, k, progressive_dim_steps=5)
    kmeans2.train(xt)
    self.assertLess(kmeans2.obj[-1], kmeans.obj[-1])
def test_clustering(self):
    d = 64
    n = 1000
    rs = np.random.RandomState(123)
    x = rs.uniform(size=(n, d)).astype('float32')
    x *= 10

    km = faiss.Kmeans(d, 32, niter=10)
    err32 = km.train(x)

    # check that the objective is decreasing
    prev = 1e50
    for o in km.obj:
        self.assertGreater(prev, o)
        prev = o

    km = faiss.Kmeans(d, 64, niter=10)
    err64 = km.train(x)

    # check that 64 centroids give a lower quantization error than 32
    self.assertGreater(err32, err64)

    km = faiss.Kmeans(d, 32, niter=10, int_centroids=True)
    err_int = km.train(x)

    # check that integer centroids are not as good as float ones
    self.assertGreater(err_int, err32)
    self.assertTrue(np.all(km.centroids == np.floor(km.centroids)))
def test_init(self):
    d = 32
    k = 5
    xt, xb, xq = get_dataset_2(d, 1000, 0, 0)
    km = faiss.Kmeans(d, k, niter=4)
    km.train(xt)

    km2 = faiss.Kmeans(d, k, niter=4)
    km2.train(xt, init_centroids=km.centroids)

    # check that the initial objective is better for km2 than for km
    self.assertGreater(km.obj[0], km2.obj[0] * 1.01)
def test_kmeans(self):
    d = 32
    nb = 1000
    k = 10
    rs = np.random.RandomState(123)
    xb = rs.rand(nb, d).astype('float32')

    km1 = faiss.Kmeans(d, k)
    obj1 = km1.train(xb)

    km2 = faiss.Kmeans(d, k, gpu=True)
    obj2 = km2.train(xb)

    print(obj1, obj2)
    assert np.allclose(obj1, obj2)
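# A hedged note on the gpu flag used above: in the faiss Python wrapper, the
# Kmeans gpu argument accepts False (CPU) or True (use GPUs); as far as I know
# it also accepts an integer number of GPUs, but verify that for your version.
km_one_gpu = faiss.Kmeans(32, 10, gpu=1)  # assumption: restrict to one GPU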
def get_clusters(dataset, num_clusters, model_type="resnet50_128",
                 batch_size=64, n_batches=500):
    initcache = os.path.join(
        ROOT_DIR, 'centroids',
        model_type + '_' + '_' + str(num_clusters) + '_desc_cen.hdf5')
    model = Net(model_type).to(device)

    batch_sampler = BalanceBatchSampler(dataset=dataset, n_classes=64,
                                        n_samples=1, n_batches_epoch=n_batches)
    data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                              batch_sampler=batch_sampler,
                                              num_workers=2)
    nDescriptors = batch_size * n_batches

    if not os.path.exists(os.path.join(ROOT_DIR, 'centroids')):
        os.makedirs(os.path.join(ROOT_DIR, 'centroids'))

    with h5py.File(initcache, mode='w') as h5:
        with torch.no_grad():
            model.eval()
            print('====> Extracting Descriptors')
            dbFeat = h5.create_dataset("descriptors",
                                       [nDescriptors, model.encoder_dim],
                                       dtype=np.float32)
            for iteration, (data, target, img_file, class_id) in enumerate(data_loader):
                data = data.to(device)
                idx = iteration * batch_size
                dbFeat[idx:idx + batch_size, :] = F.normalize(
                    model(data), p=2, dim=1).cpu().numpy()

        print('====> Clustering..')
        niter = 100
        kmeans = faiss.Kmeans(model.encoder_dim, num_clusters,
                              niter=niter, verbose=False)
        kmeans.train(dbFeat[...])

        print('====> Storing centroids', kmeans.centroids.shape)
        h5.create_dataset('centroids', data=kmeans.centroids)
        print('====> Done!')
def get_tag_embeddings(self, name, cluster_mode):
    if cluster_mode:
        cluster_items = self.get_tag_contents(name, True)
        if cluster_items is None:
            return None
        embeddings = []
        rel_db = self.related_database()
        for cluster_item in cluster_items:
            fix_idx = rel_db.i2b(cluster_item[0][0])
            face_id = s2b(cluster_item[0][1])
            embeddings.append(
                rel_db.get_face(fix_idx, face_id)['embedding'].reshape((512,)))
        return np.array(embeddings)

    if name not in self.tag_map or len(self.tag_map[name]) < 1:
        return None
    embeddings = []
    for _, _, _, embedding in self.tag_map[name]:
        embeddings.append(embedding)
    embeddings = np.array(embeddings)
    n_emb = embeddings.shape[0]

    if False and n_emb > 78 * 3:  # FIXME: Disabled due to bugs
        kmeans = faiss.Kmeans(512, min(n_emb // 39, 150), niter=10, verbose=False)
        if name in self.cluster_map:
            kmeans = self.cluster_map[name]
        else:
            kmeans.train(embeddings)
            self.cluster_map[name] = kmeans
        if kmeans.centroids.shape[0] > 5:
            return kmeans.centroids
        else:
            return np.append(embeddings[1::3], kmeans.centroids, axis=0)

    return np.array(embeddings)
def cluster_features_and_label(args: Namespace, cfg: AttrDict):
    # faiss is an optional dependency for VISSL.
    assert is_faiss_available(), (
        "Please install faiss using conda install faiss-gpu -c pytorch "
        "if using conda or pip install faiss-gpu"
    )
    import faiss

    cluster_backend = cfg.CLUSTERFIT.CLUSTER_BACKEND
    num_clusters = cfg.CLUSTERFIT.NUM_CLUSTERS
    data_split = cfg.CLUSTERFIT.FEATURES.DATA_PARTITION
    data_name = cfg.CLUSTERFIT.FEATURES.DATASET_NAME
    n_iter = cfg.CLUSTERFIT.N_ITER
    output_dir = get_checkpoint_folder(cfg)

    ########### Step 1: Extract the features on full dataset ###################
    feature_data, image_paths = get_data_features_and_images(cfg)

    ########### Step 2: Get the data information ###################
    features = feature_data["features"]
    # features are of shape num_samples x feature_dim
    assert features.ndim == 2, f"Features incorrect shape: {features.shape}"
    assert features.dtype == np.float32, "Features are not float32 type"
    logging.info(f"Clustering Features: {features.shape}")

    ########### Step 3: L2 normalize features ###################
    # TODO: we could support PCA here if needed in future.
    logging.info("L2 normalizing the features now...")
    feat_norm = np.linalg.norm(features, axis=1) + 1e-5
    features = features / feat_norm[:, np.newaxis]

    ########### Step 4: Cluster the features ###################
    logging.info("Clustering the features now...")
    assert cluster_backend == "faiss", "Only faiss clustering is supported currently"
    kmeans = faiss.Kmeans(features.shape[1], num_clusters, niter=n_iter, verbose=True)
    kmeans.train(features)
    centroids = kmeans.centroids

    ########### Step 5: Get the cluster assignment for the features ############
    logging.info("Getting cluster label assignment now...")
    distances, hard_cluster_labels = kmeans.index.search(features, 1)

    #### Step 6: Save clustering data and hard cluster labels for the images ###
    data_split = data_split.lower()
    clustering_output_dict = {
        "hard_labels": hard_cluster_labels,
        "centroids": centroids,
        "distances": distances,
    }
    cluster_output_filepath = (
        f"{output_dir}/{data_name}_{data_split}_N{num_clusters}_{cluster_backend}.pkl"
    )
    hard_labels_output_filepath = (
        f"{output_dir}/"
        f"{data_name}_{data_split}_N{num_clusters}_{cluster_backend}_lbls.npy"
    )
    out_hard_labels = np.array(hard_cluster_labels.tolist(), dtype=np.int64).reshape(-1)
    save_file(clustering_output_dict, cluster_output_filepath)
    save_file(out_hard_labels, hard_labels_output_filepath)
    logging.info("All Done!")
def label_generator_kmeans(cfg, features, num_classes=500, cuda=True, **kwargs):
    assert cfg.TRAIN.PSEUDO_LABELS.cluster == "kmeans"
    assert num_classes, "num_classes for kmeans is null"
    # num_classes = cfg.TRAIN.PSEUDO_LABELS.cluster_num

    if not cfg.TRAIN.PSEUDO_LABELS.use_outliers:
        warnings.warn("there exists no outlier point by kmeans clustering")

    # k-means cluster by faiss
    cluster = faiss.Kmeans(features.size(-1), num_classes,
                           niter=300, verbose=True, gpu=cuda)
    cluster.train(to_numpy(features))
    centers = to_torch(cluster.centroids).float()

    _, labels = cluster.index.search(to_numpy(features), 1)
    labels = labels.reshape(-1)
    labels = to_torch(labels).long()

    # k-means does not have outlier points
    assert not (-1 in labels)

    return labels, centers, num_classes, None
def __init__(self, feature_path, center_num, num_per_category):
    print('Start clustering...')
    self.image_paths, self.features = joblib.load(os.path.join(feature_path))
    self.features = self.features.astype('float32')
    self.center_num = center_num
    d = self.features.shape[1]

    # construct kmeans using faiss
    self.kmeans = faiss.Kmeans(d, self.center_num, niter=100, verbose=True)
    self.kmeans.train(self.features)
    self.index = faiss.IndexFlatL2(d)
    self.index.add(self.features)

    # get the results:
    # D contains the squared L2 distances,
    # I contains the nearest neighbors of each centroid
    self.D, self.I = self.index.search(self.kmeans.centroids, num_per_category)

    self.img_paths = []
    self.scores = []
    # enumerate I to collect every centroid's neighbors
    for i, item in enumerate(self.I):
        temp_paths = []
        temp_scores = []
        for j, index in enumerate(item):
            temp_paths.append(self.image_paths[index])
            temp_scores.append(self.D[i][j])
        self.img_paths.append(temp_paths)
        self.scores.append(temp_scores)
    print('Job done.')
def generate_vbow(k, descriptor_list, cat_dict, niter=100, nredo=10):
    # K-means clustering that takes the number of clusters (k) and a
    # descriptor list (unordered 1d array) and computes the central points.
    print(f"--- Creating VBoW with {k} words from a list of {len(descriptor_list)} ---")
    x = np.array(descriptor_list)
    shp = x.shape[1]
    kmeans = faiss.Kmeans(shp, k, niter=niter, verbose=True, nredo=nredo)
    kmeans.train(x)

    # cat_dict holds the descriptors separated class by class; the centroids
    # computed above are the visual words of the k-means clustering.
    # Returns a dictionary holding one histogram per image, separated class by class.
    vbow_dict = dict()
    for cat_key, desc_list in cat_dict.items():
        cat_list = list()
        for d in desc_list:
            histogram = np.zeros(k)
            if d is not None:
                # To map a set of vectors to the cluster centroids after
                # kmeans has finished training, search the internal index:
                try:
                    dist, ind = kmeans.index.search(d, 1)
                    for i in ind:
                        histogram[i] += 1
                except Exception as e_d:
                    print(f"Error while processing d: {d} ({e_d})")
            cat_list.append(histogram)
        vbow_dict[cat_key] = cat_list
    return vbow_dict
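# A minimal usage sketch for generate_vbow above. The random descriptors and
# category names are hypothetical stand-ins for real local image descriptors
# (e.g. 128-d SIFT); only the function itself comes from the original code.
import numpy as np

rng = np.random.default_rng(0)
all_desc = rng.random((5000, 128)).astype('float32')  # pooled descriptors
cat_dict = {
    'cats': [rng.random((300, 128)).astype('float32') for _ in range(4)],
    'dogs': [rng.random((300, 128)).astype('float32') for _ in range(4)],
}
vbow = generate_vbow(64, all_desc, cat_dict, niter=25, nredo=1)
# vbow['cats'][0] is a 64-bin visual-word histogram for the first cat image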
def run_kmeans(features, n_cluster):
    n_samples, dim = features.shape
    kmeans = faiss.Kmeans(dim, n_cluster)
    # clustering parameters must go through the ClusteringParameters object
    # (kmeans.cp); attributes set directly on the wrapper are ignored by train()
    kmeans.cp.niter = 20
    kmeans.cp.min_points_per_centroid = 5
    kmeans.cp.max_points_per_centroid = 1000000000
    kmeans.train(features)
    _, cluster_assignments = kmeans.index.search(features, 1)
    return cluster_assignments
def run_kmeans(vecs, ncentroids=10, niter=20, device=0, verbose=True):
    dim = vecs.shape[1]
    if device == -1:
        print(" On CPU")
        kmeans = faiss.Kmeans(dim, ncentroids, niter=niter, verbose=verbose)
        kmeans.train(vecs)
        distances, groups = kmeans.index.search(vecs, 1)
    else:
        print(" On GPU")
        if vecs.sum() == 0:
            msg = "All images have no value. "
            msg += "Please retry with another weight."
            print(msg)
        clus = faiss.Clustering(dim, ncentroids)
        clus.verbose = verbose
        clus.niter = niter
        clus.max_points_per_centroid = 10000000
        res = faiss.StandardGpuResources()
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = device
        index = faiss.GpuIndexFlatL2(res, dim, cfg)
        clus.train(vecs, index)
        distances, groups = index.search(vecs, 1)
    return groups
def kmeans_trial(data: np.ndarray, nc: int) -> tuple:
    '''
    number of clusters -> (labels, centroids, silhouette coef)
    '''
    # [slow, sklearn]
    #centroids, labels, _ = k_means(data, nc)

    # [fast, faiss]
    npvecs = data.astype('float32')
    ncls = nc

    # [use cuda]
    #gpu_resource = faiss.StandardGpuResources()
    #cluster_idx = faiss.IndexFlatL2(npvecs.shape[1])
    #cluster_idx = faiss.index_cpu_to_gpu(gpu_resource, 0, cluster_idx)
    #kmeans = faiss.Clustering(npvecs.shape[1], ncls)
    #kmeans.verbose = False
    #kmeans.train(npvecs, cluster_idx)
    #_, pred = cluster_idx.search(npvecs, 1)
    #pred = pred.flatten()
    #labels = pred
    #centroids = np.array([kmeans.centroids.at(i) for i in
    #                      range(kmeans.centroids.size())])

    # [use cpu]
    kmeans = faiss.Kmeans(npvecs.shape[1], ncls, seed=123, verbose=False,
                          min_points_per_centroid=2,
                          max_points_per_centroid=128)
    kmeans.train(npvecs)
    _, pred = kmeans.index.search(npvecs, 1)
    pred = pred.flatten()
    labels = pred
    centroids = kmeans.centroids

    silc = silhouette_score(data, labels)
    return (labels, centroids, silc)
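# A minimal sketch of using kmeans_trial to pick a cluster count by silhouette
# score; the synthetic data and the candidate range 2..10 are assumptions for
# illustration, not from the original code.
import numpy as np

data = np.random.rand(1000, 64)
best_nc, best = None, None
for nc in range(2, 11):
    labels, centroids, silc = kmeans_trial(data, nc)
    if best is None or silc > best[2]:
        best_nc, best = nc, (labels, centroids, silc)
print(f"best number of clusters: {best_nc} (silhouette {best[2]:.3f})")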
def test_stats(self):
    d = 32
    k = 5
    xt, xb, xq = get_dataset_2(d, 1000, 0, 0)
    km = faiss.Kmeans(d, k, niter=4)
    km.train(xt)
    assert list(km.obj) == [st['obj'] for st in km.iteration_stats]
def get_clusters(cluster_set):
    nDescriptors = 50000
    nPerImage = 100
    nIm = ceil(nDescriptors / nPerImage)

    sampler = SubsetRandomSampler(
        np.random.choice(len(cluster_set), nIm, replace=False))
    data_loader = DataLoader(dataset=cluster_set,
                             num_workers=opt.threads,
                             batch_size=opt.cacheBatchSize,
                             shuffle=False,
                             pin_memory=cuda,
                             sampler=sampler)

    if not exists(join(opt.dataPath, 'centroids')):
        makedirs(join(opt.dataPath, 'centroids'))

    initcache = join(
        opt.dataPath, 'centroids',
        opt.arch + '_' + cluster_set.dataset + '_' +
        str(opt.num_clusters) + '_desc_cen.hdf5')

    with h5py.File(initcache, mode='w') as h5:
        with torch.no_grad():
            model.eval()
            print('====> Extracting Descriptors')
            dbFeat = h5.create_dataset("descriptors",
                                       [nDescriptors, encoder_dim],
                                       dtype=np.float32)

            for iteration, (input, indices) in enumerate(data_loader, 1):
                input = input.to(device)
                image_descriptors = model.encoder(input).view(
                    input.size(0), encoder_dim, -1).permute(0, 2, 1)

                batchix = (iteration - 1) * opt.cacheBatchSize * nPerImage
                for ix in range(image_descriptors.size(0)):
                    # sample different locations for each image in batch
                    sample = np.random.choice(image_descriptors.size(1),
                                              nPerImage, replace=False)
                    startix = batchix + ix * nPerImage
                    dbFeat[startix:startix + nPerImage, :] = image_descriptors[
                        ix, sample, :].detach().cpu().numpy()

                if iteration % 50 == 0 or len(data_loader) <= 10:
                    print("==> Batch ({}/{})".format(
                        iteration, ceil(nIm / opt.cacheBatchSize)), flush=True)
                del input, image_descriptors

        print('====> Clustering..')
        niter = 100
        kmeans = faiss.Kmeans(encoder_dim, opt.num_clusters,
                              niter=niter, verbose=False)
        kmeans.train(dbFeat[...])

        print('====> Storing centroids', kmeans.centroids.shape)
        h5.create_dataset('centroids', data=kmeans.centroids)
        print('====> Done!')
def fit(self, data):
    n_data, d = data.shape
    self.clus = faiss.Kmeans(d, self.n_clusters)
    # set clustering parameters through cp; attributes assigned directly on
    # the Kmeans wrapper would be silently ignored by train()
    self.clus.cp.seed = np.random.randint(1234)
    self.clus.cp.niter = 20
    self.clus.cp.max_points_per_centroid = 10000000
    self.clus.train(data)
def faiss_kmeans(infile, ncentroids, niter=20):
    """
    K-Means clustering with FAISS
    :param infile: Input file name
    :param ncentroids: desired number of clusters
    :param niter: maximum number of iterations
    :return: None
    """
    h5f = h5py.File(infile, 'r')
    x = h5f['fp_list'][:]
    smiles_list = h5f['smiles_list'][:]
    name_list = h5f['name_list'][:]
    h5f.close()

    verbose = True
    d = x.shape[1]
    kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
    kmeans.train(x)
    D, I = kmeans.index.search(x, 1)

    writer = csv.writer(open("detail.csv", "w"))
    writer.writerow(["SMILES", "NAME", "DIST", "CLUSTER"])
    for smiles, name, d, i in zip(smiles_list, name_list,
                                  [x[0] for x in D], [x[0] for x in I]):
        writer.writerow(
            [smiles[0].decode('utf-8'), name[0].decode('utf-8'), d, i])

    dist_list = get_centers(ncentroids, [x[0] for x in D], [x[0] for x in I])
    ofs = open("centers.smi", "w")
    for cluster_idx, (_, c) in enumerate(dist_list, 1):
        print(smiles_list[c][0].decode('utf-8'),
              name_list[c][0].decode('utf-8'),
              cluster_idx, file=ofs)
    ofs.close()
def run_kmeans_faiss(x: Union[np.ndarray, Tensor], nmb_clusters: int,
                     n_iter: int, cuda: bool, verbose: bool = False) -> Tensor:
    if isinstance(x, torch.Tensor):
        x = x.numpy()
    x = np.reshape(x, (x.shape[0], -1))
    n_data, d = x.shape

    if cuda:
        # faiss implementation of k-means
        clus = faiss.Clustering(d, nmb_clusters)
        clus.niter = n_iter
        clus.max_points_per_centroid = 10000000
        clus.verbose = verbose
        res = faiss.StandardGpuResources()
        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.useFloat16 = False
        # the device must be set before the index is constructed
        flat_config.device = 0
        index = faiss.GpuIndexFlatL2(res, d, flat_config)

        # perform the training
        clus.train(x, index)
        _, I = index.search(x, 1)
    else:
        kmeans = faiss.Kmeans(d=d, k=nmb_clusters, verbose=verbose, niter=20)
        kmeans.train(x)
        _, I = kmeans.index.search(x, 1)

    I = torch.as_tensor(I, dtype=torch.long).squeeze()
    return I
def kmeans(x, ncentroids, niter, verbose):
    print('kmeans')
    d = x.shape[1]
    # faiss.Kmeans only takes d and k positionally; the rest are keyword-only
    kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
    kmeans.cp.max_points_per_centroid = 100000
    print('train kmeans')
    kmeans.train(x)
    return kmeans
def fit(self, X, y):
    self.kmeans = faiss.Kmeans(d=X.shape[1],
                               k=self.n_clusters,
                               niter=self.max_iter,
                               nredo=self.n_init)
    self.kmeans.train(X.astype(np.float32))
    self.cluster_centers_ = self.kmeans.centroids
    self.inertia_ = self.kmeans.obj[-1]
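# A sketch of the matching predict() for the sklearn-style wrapper above,
# assuming the same class; predict() is not part of the original snippet.
def predict(self, X):
    # nearest-centroid assignment via the index kept by faiss.Kmeans
    _, labels = self.kmeans.index.search(X.astype(np.float32), 1)
    return labels.ravel()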
def test_1ptpercluster(self):
    # https://github.com/facebookresearch/faiss/issues/842
    # randint(0, 1) yields all zeros: a degenerate dataset of 5 identical
    # points clustered into 5 centroids
    X = np.random.randint(0, 1, (5, 10)).astype('float32')
    k = 5
    niter = 10
    verbose = True
    kmeans = faiss.Kmeans(X.shape[1], k, niter=niter, verbose=verbose)
    kmeans.train(X)
    l2_distances, I = kmeans.index.search(X, 1)
def get_clusters(ftrain, nclusters):
    kmeans = faiss.Kmeans(ftrain.shape[1], nclusters,
                          niter=100, verbose=False, gpu=False)
    kmeans.train(np.random.permutation(ftrain))
    _, ypred = kmeans.assign(ftrain)
    return ypred
def kmeans_train(self, vecs):
    import faiss
    kmeans = faiss.Kmeans(vecs.shape[1], self.num_clusters,
                          niter=5, verbose=False)
    kmeans.train(vecs)
    self.centroids = kmeans.centroids
def test_progressive_dim(self):
    d = 32
    n = 10000
    k = 50
    xt, _, _ = get_dataset_2(d, n, 0, 0)

    # basic kmeans
    kmeans = faiss.Kmeans(d, k, gpu=True)
    kmeans.train(xt)

    pca = faiss.PCAMatrix(d, d)
    pca.train(xt)
    xt_pca = pca.apply(xt)

    # same test w/ Kmeans wrapper
    kmeans2 = faiss.Kmeans(d, k, progressive_dim_steps=5, gpu=True)
    kmeans2.train(xt_pca)
    self.assertLess(kmeans2.obj[-1], kmeans.obj[-1])
def test_binary(self):
    ds = datasets.SyntheticDataset(128, 2000, 2000, 200)
    d = ds.d
    xt = ds.get_train()
    xq = ds.get_queries()
    xb = ds.get_database()

    # define alternative quantizer on the 20 first dims of vectors
    # (will be in float)
    km = faiss.Kmeans(20, 50)
    km.train(xt[:, :20].copy())
    alt_quantizer = km.index

    binarizer = faiss.index_factory(d, "ITQ,LSHt")
    binarizer.train(xt)

    xb_bin = binarizer.sa_encode(xb)
    xq_bin = binarizer.sa_encode(xq)

    index = faiss.index_binary_factory(d, "BIVF200")

    fake_centroids = np.zeros((index.nlist, index.d // 8), dtype="uint8")
    index.quantizer.add(fake_centroids)
    index.is_trained = True

    # add elements xb
    a = alt_quantizer.search(xb[:, :20].copy(), 1)[1].ravel()
    ivf_tools.add_preassigned(index, xb_bin, a)

    # search elements xq, increase nprobe, check 4 first results w/ groundtruth
    prev_inter_perf = 0
    for nprobe in 1, 10, 20:
        index.nprobe = nprobe
        a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]
        D, I = ivf_tools.search_preassigned(index, xq_bin, 4, a)
        inter_perf = (I == ds.get_groundtruth()[:, :4]).sum() / I.size
        self.assertTrue(inter_perf >= prev_inter_perf)
        prev_inter_perf = inter_perf

    # test range search
    index.nprobe = 20
    a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]

    # just to find a reasonable radius
    D, I = ivf_tools.search_preassigned(index, xq_bin, 4, a)
    radius = int(D.max() + 1)

    lims, DR, IR = ivf_tools.range_search_preassigned(index, xq_bin, radius, a)

    # with that radius the k-NN results are a subset of the range search results
    for q in range(len(xq)):
        l0, l1 = lims[q], lims[q + 1]
        self.assertTrue(set(I[q]) <= set(IR[l0:l1]))
def test_nasty_clustering(self):
    d = 2
    rs = np.random.RandomState(123)
    x = np.zeros((100, d), dtype='float32')
    for i in range(5):
        x[i * 20:i * 20 + 20] = rs.uniform(size=d)

    # we have 5 distinct points but ask for 10 centroids...
    km = faiss.Kmeans(d, 10, niter=10, verbose=True)
    km.train(x)
def find_anchors_(data_list, m):
    anchor_list = []
    for X in data_list:
        X = X.astype(np.float32)
        n, d = X.shape
        kmeans = faiss.Kmeans(d, m, niter=20, verbose=False)
        kmeans.train(X)
        anchors = kmeans.centroids
        anchor_list.append(anchors)
    return anchor_list
def find_anchors(data_list, m):
    anchor_list = []
    for i, X in enumerate(data_list):
        X = X.astype(np.float32)
        d = X.shape[1]
        kmeans = faiss.Kmeans(d, m[i], niter=20, verbose=False)
        kmeans.train(X)
        anchors = kmeans.centroids
        anchor_list.append(anchors)
    return anchor_list
def kMeans(data, numCluster, useGpu=True):
    kFunc = faiss.Kmeans(data.shape[1], numCluster, gpu=useGpu)
    # set the subsampling threshold to ceil(n / k) so that all points are
    # used for training
    kFunc.cp.max_points_per_centroid = (
        (data.shape[0] + numCluster - 1) // numCluster)
    if data.is_cuda:
        data = data.cpu()
    kFunc.train(data.numpy())
    # assign labels
    _, labels = kFunc.index.search(data.numpy(), 1)
    return kFunc.centroids, labels.squeeze()
def train_kmeans(self, vecs):
    import faiss
    kmeans_instance = faiss.Kmeans(self.vec_dim, self.kmeans_clusters, niter=10)
    if vecs.dtype != np.float32:
        vecs = vecs.astype(np.float32)
    kmeans_instance.train(vecs)
    centroids = kmeans_instance.centroids
    return centroids
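# A minimal sketch, not from the original code, of assigning new vectors to
# the centroids returned by train_kmeans above, using a brute-force flat L2
# index; the helper name assign_to_centroids is hypothetical.
import faiss
import numpy as np

def assign_to_centroids(centroids, vecs):
    # index the centroids, then look up the nearest centroid for each vector
    index = faiss.IndexFlatL2(centroids.shape[1])
    index.add(centroids)
    _, labels = index.search(vecs.astype(np.float32), 1)
    return labels.ravel()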