Example 1
    def test_progressive_dim(self):
        d = 32
        n = 10000
        k = 50
        xt, _, _ = get_dataset_2(d, n, 0, 0)

        # basic kmeans
        kmeans = faiss.Kmeans(d, k)
        kmeans.train(xt)

        clus = faiss.ProgressiveDimClustering(d, k)
        # read the attributes (no-op), then set them
        clus.verbose
        clus.verbose = True
        clus.progressive_dim_steps
        clus.progressive_dim_steps = 5
        fac = faiss.ProgressiveDimIndexFactory()
        clus.train(n, faiss.swig_ptr(xt), fac)

        stats = clus.iteration_stats
        stats = [stats.at(i) for i in range(stats.size())]
        obj = np.array([st.obj for st in stats])
        # clustering objective should be a tad better
        self.assertLess(obj[-1], kmeans.obj[-1])

        # same test w/ Kmeans wrapper
        kmeans2 = faiss.Kmeans(d, k, progressive_dim_steps=5)
        kmeans2.train(xt)
        self.assertLess(kmeans2.obj[-1], kmeans.obj[-1])
Example 2
    def test_clustering(self):
        d = 64
        n = 1000
        rs = np.random.RandomState(123)
        x = rs.uniform(size=(n, d)).astype('float32')

        x *= 10

        km = faiss.Kmeans(d, 32, niter=10)
        err32 = km.train(x)

        # check that objective is decreasing
        prev = 1e50
        for o in km.obj:
            self.assertGreater(prev, o)
            prev = o

        km = faiss.Kmeans(d, 64, niter=10)
        err64 = km.train(x)

        # check that 64 centroids give a lower quantization error than 32
        self.assertGreater(err32, err64)

        km = faiss.Kmeans(d, 32, niter=10, int_centroids=True)
        err_int = km.train(x)

        # check that integer centroids are not as good as float ones
        self.assertGreater(err_int, err32)
        self.assertTrue(np.all(km.centroids == np.floor(km.centroids)))
Example 3
    def test_init(self):
        d = 32
        k = 5
        xt, xb, xq = get_dataset_2(d, 1000, 0, 0)
        km = faiss.Kmeans(d, k, niter=4)
        km.train(xt)

        km2 = faiss.Kmeans(d, k, niter=4)
        km2.train(xt, init_centroids=km.centroids)

        # check that the initial objective is better for km2 than km
        self.assertGreater(km.obj[0], km2.obj[0] * 1.01)
Example 4
    def test_kmeans(self):
        d = 32
        nb = 1000
        k = 10
        rs = np.random.RandomState(123)
        xb = rs.rand(nb, d).astype('float32')

        km1 = faiss.Kmeans(d, k)
        obj1 = km1.train(xb)

        km2 = faiss.Kmeans(d, k, gpu=True)
        obj2 = km2.train(xb)

        print(obj1, obj2)
        assert np.allclose(obj1, obj2)
Example 5
def get_clusters(dataset, num_clusters, model_type="resnet50_128", batch_size=64, n_batches=500):
    initcache = os.path.join(ROOT_DIR, 'centroids',
                             model_type + '_' + '_' + str(num_clusters) + '_desc_cen.hdf5')
    model = Net(model_type).to(device)
    batch_sampler = BalanceBatchSampler(dataset=dataset, n_classes=64, n_samples=1,
                                        n_batches_epoch=n_batches)
    data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_sampler=batch_sampler, num_workers=2)
    nDescriptors = batch_size * n_batches
    if not os.path.exists(os.path.join(ROOT_DIR, 'centroids')):
        os.makedirs(os.path.join(ROOT_DIR, 'centroids'))
    with h5py.File(initcache, mode='w') as h5:
        with torch.no_grad():
            model.eval()
            print('====> Extracting Descriptors')
            dbFeat = h5.create_dataset("descriptors",
                                       [nDescriptors, model.encoder_dim],
                                       dtype=np.float32)

            for iteration, (data, target, img_file, class_id) in enumerate(data_loader):
                data = data.to(device)
                idx = iteration * batch_size
                dbFeat[idx:idx + batch_size, :] = F.normalize(model(data), p=2, dim=1).cpu().numpy()

        print('====> Clustering..')
        niter = 100
        kmeans = faiss.Kmeans(model.encoder_dim, num_clusters, niter=niter, verbose=False)
        kmeans.train(dbFeat[...])

        print('====> Storing centroids', kmeans.centroids.shape)
        h5.create_dataset('centroids', data=kmeans.centroids)
        print('====> Done!')
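The centroids written above can be read back later from the same HDF5 cache; a minimal sketch, assuming the same initcache path built in the function above:

import h5py

with h5py.File(initcache, mode='r') as h5:
    centroids = h5['centroids'][...]  # shape (num_clusters, encoder_dim)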
Example 6
 def get_tag_embeddings(self, name, cluster_mode):
     if cluster_mode:
         cluster_items = self.get_tag_contents(name, True)
         if cluster_items is None:
             return None
         embeddings = []
         rel_db = self.related_database()
         for cluster_item in cluster_items:
             fix_idx = rel_db.i2b(cluster_item[0][0])
             face_id = s2b(cluster_item[0][1])
             embeddings.append(rel_db.get_face(fix_idx, face_id)['embedding'].reshape((512,)))
         return np.array(embeddings)
     if name not in self.tag_map or len(self.tag_map[name]) < 1:
         return None
     embeddings = []
     for _, _, _, embedding in self.tag_map[name]:
         embeddings.append(embedding)
     embeddings = np.array(embeddings)
     n_emb = embeddings.shape[0]
     if False and n_emb > 78*3: # FIXME: Disabled due to bugs
         kmeans = faiss.Kmeans(512, min(n_emb // 39, 150), niter=10, verbose=False)
         if name in self.cluster_map:
             kmeans = self.cluster_map[name]
         else:
             kmeans.train(embeddings)
             self.cluster_map[name] = kmeans
         if kmeans.centroids.shape[0] > 5:
             return kmeans.centroids
         else:
             return np.append(embeddings[1::3], kmeans.centroids, axis=0)
     return np.array(embeddings)
Example 7
def cluster_features_and_label(args: Namespace, cfg: AttrDict):
    # faiss is an optional dependency for VISSL.
    assert is_faiss_available(), (
        "Please install faiss using conda install faiss-gpu -c pytorch "
        "if using conda or pip install faiss-gpu"
    )
    import faiss

    cluster_backend = cfg.CLUSTERFIT.CLUSTER_BACKEND
    num_clusters = cfg.CLUSTERFIT.NUM_CLUSTERS
    data_split = cfg.CLUSTERFIT.FEATURES.DATA_PARTITION
    data_name = cfg.CLUSTERFIT.FEATURES.DATASET_NAME
    n_iter = cfg.CLUSTERFIT.N_ITER
    output_dir = get_checkpoint_folder(cfg)

    ########### Step 1: Extract the features on full dataset ###################
    feature_data, image_paths = get_data_features_and_images(cfg)

    ########### Step 2: Get the data information ###################
    features = feature_data["features"]
    # features are of shape num_samples x feature_dim
    assert features.ndim == 2, f"Features incorrect shape: {features.shape}"
    assert features.dtype == np.float32, "Features are not float32 type"
    logging.info(f"Clustering Features: {features.shape}")

    ########### Step 3: L2 normalize features ###################
    # TODO: we could support PCA here if needed in future.
    logging.info("L2 normalizing the features now...")
    feat_norm = np.linalg.norm(features, axis=1) + 1e-5
    features = features / feat_norm[:, np.newaxis]

    ########### Step 4: Cluster the features ###################
    logging.info("Clustering the features now...")
    assert cluster_backend == "faiss", "Only faiss clustering is supported currently"
    kmeans = faiss.Kmeans(features.shape[1], num_clusters, niter=n_iter, verbose=True)
    kmeans.train(features)
    centroids = kmeans.centroids

    ########### Step 5: Get the cluster assignment for the features ############
    logging.info("Getting cluster label assignment now...")
    distances, hard_cluster_labels = kmeans.index.search(features, 1)

    #### Step 6: Save clustering data and hard cluster labels for the images ###
    data_split = data_split.lower()
    clustering_output_dict = {
        "hard_labels": hard_cluster_labels,
        "centroids": centroids,
        "distances": distances,
    }
    cluster_output_filepath = (
        f"{output_dir}/{data_name}_{data_split}_N{num_clusters}_{cluster_backend}.pkl"
    )
    hard_labels_output_filepath = (
        f"{output_dir}/"
        f"{data_name}_{data_split}_N{num_clusters}_{cluster_backend}_lbls.npy"
    )
    out_hard_labels = np.array(hard_cluster_labels.tolist(), dtype=np.int64).reshape(-1)
    save_file(clustering_output_dict, cluster_output_filepath)
    save_file(out_hard_labels, hard_labels_output_filepath)
    logging.info("All Done!")
Example 8
def label_generator_kmeans(cfg,
                           features,
                           num_classes=500,
                           cuda=True,
                           **kwargs):

    assert cfg.TRAIN.PSEUDO_LABELS.cluster == "kmeans"
    assert num_classes, "num_classes for kmeans is null"

    # num_classes = cfg.TRAIN.PSEUDO_LABELS.cluster_num

    if not cfg.TRAIN.PSEUDO_LABELS.use_outliers:
        warnings.warn("k-means clustering does not produce outlier points")

    # k-means cluster by faiss
    cluster = faiss.Kmeans(features.size(-1),
                           num_classes,
                           niter=300,
                           verbose=True,
                           gpu=cuda)

    cluster.train(to_numpy(features))

    centers = to_torch(cluster.centroids).float()
    _, labels = cluster.index.search(to_numpy(features), 1)
    labels = labels.reshape(-1)
    labels = to_torch(labels).long()
    # k-means does not have outlier points
    assert not (-1 in labels)

    return labels, centers, num_classes, None
Example 9
    def __init__(self, feature_path, center_num, num_per_category):
        print('Start clustering...')
        self.image_paths, self.features = joblib.load(
            os.path.join(feature_path)
        )
        self.features = self.features.astype('float32')
        self.center_num = center_num
        d = self.features.shape[1]

        # construct kmeans using faiss
        self.kmeans = faiss.Kmeans(d, self.center_num, niter=100, verbose=True)
        self.kmeans.train(self.features)
        self.index = faiss.IndexFlatL2(d)
        self.index.add(self.features)

        # get the results
        # D contains the squared L2 distances
        # I contains the nearest neighbors for each centroid
        self.D, self.I = self.index.search(self.kmeans.centroids, num_per_category)

        self.img_paths = []
        self.scores = []
        # enumerate I to get every centroid's nearest neighbors
        for i, item in enumerate(self.I):
            temp_paths = []
            temp_scores = []
            for j, index in enumerate(item):
                temp_paths.append(self.image_paths[index])
                temp_scores.append(self.D[i][j])
            self.img_paths.append(temp_paths)
            self.scores.append(temp_scores)

        print('Job done.')
Example 10
def generate_vbow(k, descriptor_list, cat_dict, niter=100, nredo=10):
    # Runs k-means clustering with k clusters (visual words) over the descriptor
    # list (an unordered array of local descriptors) and keeps the centroids.
    print(
        f"--- Creating VBoW with {k} words from a list of {len(descriptor_list)} descriptors ---"
    )
    x = np.array(descriptor_list)
    shp = x.shape[1]
    kmeans = faiss.Kmeans(shp, k, niter=niter, verbose=True, nredo=nredo)
    kmeans.train(x)

    # cat_dict holds the descriptors separated class by class; using the trained
    # centroids (visual words), build one histogram per image.
    # Returns a dictionary that holds the histograms for each image, separated class by class.
    vbow_dict = dict()
    for cat_key, desc_list in cat_dict.items():
        cat_list = list()
        for d in desc_list:
            histogram = np.zeros(k)
            if d is not None:
                #To compute the mapping from a set of vectors x to the cluster centroids after kmeans has finished training, use:
                try:
                    dist, ind = kmeans.index.search(d, 1)
                    for i in ind:
                        histogram[i] += 1
                except Exception as e_d:
                    print(f"Erro ao processar d: {d}")
            cat_list.append(histogram)
        vbow_dict[cat_key] = cat_list
    return vbow_dict
Example 11
def run_kmeans(features, n_cluster):
    n_samples, dim = features.shape
    # these parameters only take effect when passed to the constructor (or set on kmeans.cp)
    kmeans = faiss.Kmeans(dim, n_cluster, niter=20,
                          min_points_per_centroid=5,
                          max_points_per_centroid=1000000000)
    kmeans.train(features)
    _, cluster_assignments = kmeans.index.search(features, 1)
    return cluster_assignments
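A small usage note on the helper above: index.search returns a column of shape (n, 1), so callers typically flatten the result; a sketch with an illustrative feature array:

import numpy as np

features = np.random.rand(5000, 128).astype('float32')  # illustrative data
assignments = run_kmeans(features, n_cluster=100)
labels = assignments.ravel()  # shape (n,)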
Example 12
def run_kmeans(vecs, ncentroids=10, niter=20, device=0, verbose=True):
    dim = vecs.shape[1]
    if device == -1:
        print("  On CPU")
        kmeans = faiss.Kmeans(dim, ncentroids, niter=niter, verbose=verbose)
        kmeans.train(vecs)
        distances, groups = kmeans.index.search(vecs, 1)
    else:
        print("  On GPU")
        if vecs.sum() == 0:
            msg = "All image features are zero. "
            msg += "Please retry with a different weight."
            print(msg)
        clus = faiss.Clustering(dim, ncentroids)
        clus.verbose = verbose
        clus.niter = niter
        clus.max_points_per_centroid = 10000000

        res = faiss.StandardGpuResources()
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = device

        index = faiss.GpuIndexFlatL2(res, dim, cfg)

        clus.train(vecs, index)
        distances, groups = index.search(vecs, 1)

    return groups
Example 13
def kmeans_trial(data: np.ndarray, nc: int) -> tuple:
    '''
    data, number of clusters -> (labels, centroids, silhouette coef)
    '''
    # [slow, sklearn]
    #centroids, labels, _ = k_means(data, nc)
    # [fast, faiss]
    npvecs = data.astype('float32')
    ncls = nc

    # [use cuda]
    #gpu_resource = faiss.StandardGpuResources()
    #cluster_idx = faiss.IndexFlatL2(npvecs.shape[1])
    #cluster_idx = faiss.index_cpu_to_gpu(gpu_resource, 0, cluster_idx)
    #kmeans = faiss.Clustering(npvecs.shape[1], ncls)
    #kmeans.verbose = False
    #kmeans.train(npvecs, cluster_idx)
    #_, pred = cluster_idx.search(npvecs, 1)
    #pred = pred.flatten()
    #labels = pred
    #centroids = np.array([kmeans.centroids.at(i) for i in
    #    range(kmeans.centroids.size())])
    # [use cpu]
    kmeans = faiss.Kmeans(npvecs.shape[1], ncls, seed=123, verbose=False,
            min_points_per_centroid=2,
            max_points_per_centroid=128)
    kmeans.train(npvecs)
    _, pred = kmeans.index.search(npvecs, 1)
    pred = pred.flatten()
    labels = pred
    centroids = kmeans.centroids

    silc = silhouette_score(data, labels)
    return (labels, centroids, silc)
Example 14
 def test_stats(self):
     d = 32
     k = 5
     xt, xb, xq = get_dataset_2(d, 1000, 0, 0)
     km = faiss.Kmeans(d, k, niter=4)
     km.train(xt)
     assert list(km.obj) == [st['obj'] for st in km.iteration_stats]
Example 15
def get_clusters(cluster_set):
    nDescriptors = 50000
    nPerImage = 100
    nIm = ceil(nDescriptors / nPerImage)

    sampler = SubsetRandomSampler(
        np.random.choice(len(cluster_set), nIm, replace=False))
    data_loader = DataLoader(dataset=cluster_set,
                             num_workers=opt.threads,
                             batch_size=opt.cacheBatchSize,
                             shuffle=False,
                             pin_memory=cuda,
                             sampler=sampler)

    if not exists(join(opt.dataPath, 'centroids')):
        makedirs(join(opt.dataPath, 'centroids'))

    initcache = join(
        opt.dataPath, 'centroids', opt.arch + '_' + cluster_set.dataset + '_' +
        str(opt.num_clusters) + '_desc_cen.hdf5')
    with h5py.File(initcache, mode='w') as h5:
        with torch.no_grad():
            model.eval()
            print('====> Extracting Descriptors')
            dbFeat = h5.create_dataset("descriptors",
                                       [nDescriptors, encoder_dim],
                                       dtype=np.float32)

            for iteration, (input, indices) in enumerate(data_loader, 1):
                input = input.to(device)
                image_descriptors = model.encoder(input).view(
                    input.size(0), encoder_dim, -1).permute(0, 2, 1)

                batchix = (iteration - 1) * opt.cacheBatchSize * nPerImage
                for ix in range(image_descriptors.size(0)):
                    # sample different location for each image in batch
                    sample = np.random.choice(image_descriptors.size(1),
                                              nPerImage,
                                              replace=False)
                    startix = batchix + ix * nPerImage
                    dbFeat[startix:startix + nPerImage, :] = image_descriptors[
                        ix, sample, :].detach().cpu().numpy()

                if iteration % 50 == 0 or len(data_loader) <= 10:
                    print("==> Batch ({}/{})".format(
                        iteration, ceil(nIm / opt.cacheBatchSize)),
                          flush=True)
                del input, image_descriptors

        print('====> Clustering..')
        niter = 100
        kmeans = faiss.Kmeans(encoder_dim,
                              opt.num_clusters,
                              niter=niter,
                              verbose=False)
        kmeans.train(dbFeat[...])

        print('====> Storing centroids', kmeans.centroids.shape)
        h5.create_dataset('centroids', data=kmeans.centroids)
        print('====> Done!')
Example 16
 def fit(self, data):
     n_data, d = data.shape
     # parameters must be passed to the constructor (or set on self.clus.cp)
     # for Kmeans.train() to pick them up
     self.clus = faiss.Kmeans(d, self.n_clusters, niter=20,
                              seed=np.random.randint(1234),
                              max_points_per_centroid=10000000)
     self.clus.train(data)
Example 17
def faiss_kmeans(infile, ncentroids, niter=20):
    """
    K-Means clustering with FAISS
    :param infile: Input file name
    :param ncentroids: desired number of clusters
    :param niter: maximum number of iterations
    :return: None
    """
    h5f = h5py.File(infile, 'r')
    x = h5f['fp_list'][:]
    smiles_list = h5f['smiles_list'][:]
    name_list = h5f['name_list'][:]
    h5f.close()

    verbose = True
    d = x.shape[1]
    kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
    kmeans.train(x)
    D, I = kmeans.index.search(x, 1)
    writer = csv.writer(open("detail.csv", "w"))
    writer.writerow(["SMILES", "NAME", "DIST", "CLUSTER"])
    for smiles, name, d, i in zip(smiles_list, name_list, [x[0] for x in D],
                                  [x[0] for x in I]):
        writer.writerow(
            [smiles[0].decode('utf-8'), name[0].decode('utf-8'), d, i])
    dist_list = get_centers(ncentroids, [x[0] for x in D], [x[0] for x in I])
    ofs = open("centers.smi", "w")
    for cluster_idx, (_, c) in enumerate(dist_list, 1):
        print(smiles_list[c][0].decode('utf-8'),
              name_list[c][0].decode('utf-8'),
              cluster_idx,
              file=ofs)
    ofs.close()
Example 18
def run_kmeans_faiss(x: Union[np.ndarray, Tensor],
                     nmb_clusters: int,
                     n_iter: int,
                     cuda: bool,
                     verbose: bool = False) -> Tensor:
    if isinstance(x, torch.Tensor):
        x = x.numpy()
    x = np.reshape(x, (x.shape[0], -1))
    n_data, d = x.shape

    if cuda:
        # faiss implementation of k-means
        clus = faiss.Clustering(d, nmb_clusters)
        clus.niter = n_iter
        clus.max_points_per_centroid = 10000000
        clus.verbose = verbose
        res = faiss.StandardGpuResources()
        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.useFloat16 = False
        flat_config.device = 0  # must be set before the index is constructed
        index = faiss.GpuIndexFlatL2(res, d, flat_config)

        # perform the training
        clus.train(x, index)
        _, I = index.search(x, 1)
    else:
        kmeans = faiss.Kmeans(d=d, k=nmb_clusters, verbose=verbose, niter=20)
        kmeans.train(x)
        _, I = kmeans.index.search(x, 1)

    I = torch.as_tensor(I, dtype=torch.long).squeeze()

    return I
Example 19
def kmeans(x, ncentroids, niter, verbose):
    print('kmeans')
    d = x.shape[1]
    # Kmeans only takes d and k positionally; pass niter and verbose as keywords
    kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
    kmeans.cp.max_points_per_centroid = 100000
    print('train kmeans')
    kmeans.train(x)
    return kmeans
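A minimal usage sketch for the helper above (data shape and parameters are illustrative assumptions):

import numpy as np

x = np.random.rand(10000, 64).astype('float32')  # illustrative data
km = kmeans(x, ncentroids=256, niter=20, verbose=True)
D, I = km.index.search(x, 1)  # nearest centroid per vector
labels = I.ravel()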
Example 20
 def fit(self, X, y):
     self.kmeans = faiss.Kmeans(d=X.shape[1],
                                k=self.n_clusters,
                                niter=self.max_iter,
                                nredo=self.n_init)
     self.kmeans.train(X.astype(np.float32))
     self.cluster_centers_ = self.kmeans.centroids
     self.inertia_ = self.kmeans.obj[-1]
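The scikit-learn-style wrapper above only implements fit; a possible predict sketch under the same assumptions (it reuses the self.kmeans attribute set in fit) could be:

 def predict(self, X):
     # assign each row to its nearest trained centroid
     _, labels = self.kmeans.index.search(X.astype(np.float32), 1)
     return labels.ravel()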
Example 21
 def test_1ptpercluster(self):
     # https://github.com/facebookresearch/faiss/issues/842
     X = np.random.randint(0, 1, (5, 10)).astype('float32')
     k = 5
     niter = 10
     verbose = True
     kmeans = faiss.Kmeans(X.shape[1], k, niter=niter, verbose=verbose)
     kmeans.train(X)
     l2_distances, I = kmeans.index.search(X, 1)
Example 22
def get_clusters(ftrain, nclusters):
    kmeans = faiss.Kmeans(ftrain.shape[1],
                          nclusters,
                          niter=100,
                          verbose=False,
                          gpu=False)
    kmeans.train(np.random.permutation(ftrain))
    _, ypred = kmeans.assign(ftrain)
    return ypred
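For reference, the kmeans.assign call used above is, to the best of my knowledge, equivalent to a 1-nearest-centroid search on the trained index; an equivalent sketch:

def assign_to_centroids(kmeans, x):
    # same result as kmeans.assign(x): nearest centroid per vector
    D, I = kmeans.index.search(x, 1)
    return D.ravel(), I.ravel()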
Example 23
    def kmeans_train(self, vecs):
        import faiss

        kmeans = faiss.Kmeans(vecs.shape[1],
                              self.num_clusters,
                              niter=5,
                              verbose=False)
        kmeans.train(vecs)
        self.centroids = kmeans.centroids
Example 24
    def test_progressive_dim(self):
        d = 32
        n = 10000
        k = 50
        xt, _, _ = get_dataset_2(d, n, 0, 0)

        # basic kmeans
        kmeans = faiss.Kmeans(d, k, gpu=True)
        kmeans.train(xt)

        pca = faiss.PCAMatrix(d, d)
        pca.train(xt)
        xt_pca = pca.apply(xt)

        # same test with the Kmeans wrapper, on PCA-rotated data
        kmeans2 = faiss.Kmeans(d, k, progressive_dim_steps=5, gpu=True)
        kmeans2.train(xt_pca)
        self.assertLess(kmeans2.obj[-1], kmeans.obj[-1])
Example 25
    def test_binary(self):
        ds = datasets.SyntheticDataset(128, 2000, 2000, 200)

        d = ds.d
        xt = ds.get_train()
        xq = ds.get_queries()
        xb = ds.get_database()

        # define an alternative quantizer on the first 20 dimensions of the vectors (kept in float)
        km = faiss.Kmeans(20, 50)
        km.train(xt[:, :20].copy())
        alt_quantizer = km.index

        binarizer = faiss.index_factory(d, "ITQ,LSHt")
        binarizer.train(xt)

        xb_bin = binarizer.sa_encode(xb)
        xq_bin = binarizer.sa_encode(xq)

        index = faiss.index_binary_factory(d, "BIVF200")

        fake_centroids = np.zeros((index.nlist, index.d // 8), dtype="uint8")
        index.quantizer.add(fake_centroids)
        index.is_trained = True

        # add elements xb
        a = alt_quantizer.search(xb[:, :20].copy(), 1)[1].ravel()
        ivf_tools.add_preassigned(index, xb_bin, a)

        # search elements xq, increase nprobe, check 4 first results w/ groundtruth
        prev_inter_perf = 0
        for nprobe in 1, 10, 20:

            index.nprobe = nprobe
            a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]
            D, I = ivf_tools.search_preassigned(index, xq_bin, 4, a)
            inter_perf = (I == ds.get_groundtruth()[:, :4]).sum() / I.size
            self.assertTrue(inter_perf >= prev_inter_perf)
            prev_inter_perf = inter_perf

        # test range search

        index.nprobe = 20

        a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]

        # just to find a reasonable radius
        D, I = ivf_tools.search_preassigned(index, xq_bin, 4, a)
        radius = int(D.max() + 1)

        lims, DR, IR = ivf_tools.range_search_preassigned(
            index, xq_bin, radius, a)

        # with that radius the k-NN results are a subset of the range search results
        for q in range(len(xq)):
            l0, l1 = lims[q], lims[q + 1]
            self.assertTrue(set(I[q]) <= set(IR[l0:l1]))
Example 26
    def test_nasty_clustering(self):
        d = 2
        rs = np.random.RandomState(123)
        x = np.zeros((100, d), dtype='float32')
        for i in range(5):
            x[i * 20:i * 20 + 20] = rs.uniform(size=d)

        # we have 5 distinct points but ask for 10 centroids...
        km = faiss.Kmeans(d, 10, niter=10, verbose=True)
        km.train(x)
Example 27
def find_anchors_(data_list, m):
    anchor_list = []
    for X in data_list:
        X = X.astype(np.float32)
        n, d = X.shape
        kmeans = faiss.Kmeans(d, m, niter=20, verbose=False)
        kmeans.train(X)
        anchors = kmeans.centroids
        anchor_list.append(anchors)
    return anchor_list
Example 28
def find_anchors(data_list, m):
    anchor_list = []
    for i, X in enumerate(data_list):
        X = X.astype(np.float32)
        d = X.shape[1]
        kmeans = faiss.Kmeans(d, m[i], niter=20, verbose=False)
        kmeans.train(X)
        anchors = kmeans.centroids
        anchor_list.append(anchors)
    return anchor_list
Example 29
def kMeans(data, numCluster, useGpu=True):
    kFunc = faiss.Kmeans(data.shape[1], numCluster, gpu=useGpu)
    kFunc.cp.max_points_per_centroid = ((data.shape[0] + numCluster - 1) //
                                        numCluster)
    if data.is_cuda:
        data = data.cpu()
    kFunc.train(data.numpy())
    # assign labels
    _, labels = kFunc.index.search(data.numpy(), 1)
    return kFunc.centroids, labels.squeeze()
Example 30
 def train_kmeans(self, vecs):
     import faiss
     kmeans_instance = faiss.Kmeans(self.vec_dim,
                                    self.kmeans_clusters,
                                    niter=10)
     if vecs.dtype != np.float32:
         vecs = vecs.astype(np.float32)
     kmeans_instance.train(vecs)
     centroids = kmeans_instance.centroids
     return centroids
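When only the centroids are stored, as in the method above, assignments can still be computed later by loading them into a flat index; a short sketch assuming float32 inputs:

import numpy as np
import faiss

def assign_with_centroids(centroids, vecs):
    # build a flat L2 index over the stored centroids and search it
    index = faiss.IndexFlatL2(centroids.shape[1])
    index.add(np.ascontiguousarray(centroids, dtype=np.float32))
    _, labels = index.search(np.ascontiguousarray(vecs, dtype=np.float32), 1)
    return labels.ravel()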