Example #1
def make_graph(xb, nnn):
    """Builds a graph of nearest neighbors.
    Args:
        xb (np.array): data
        nnn (int): number of nearest neighbors
    Returns:
        list: for each data point, the ids of its nnn nearest neighbors
        list: for each data point, the distances to its nnn nearest neighbors
    """
    N, dim = xb.shape

    # we need only a StandardGpuResources per GPU
    res = faiss.StandardGpuResources()

    # L2
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.device = int(torch.cuda.device_count()) - 1
    index = faiss.GpuIndexFlatL2(res, dim, flat_config)
    index.add(xb)
    D, I = index.search(xb, nnn + 1)
    return I, D
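A minimal usage sketch for make_graph (illustrative, not part of the original example); it assumes faiss-gpu is installed, a CUDA device is visible, and the data is a C-contiguous float32 array:

import numpy as np

xb = np.random.rand(1000, 128).astype('float32')   # toy data: 1000 vectors of dimension 128
I, D = make_graph(xb, nnn=10)
# column 0 of I is typically each point itself (distance 0); columns 1..10 are its neighbors
print(I.shape, D.shape)                            # (1000, 11) (1000, 11)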
def _faiss_knn(X, k, mode='mut', inner_prod=False):
    # kNN search for the graph
    X = np.ascontiguousarray(X)

    d = X.shape[1]
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.device = 0
    if inner_prod:
        faiss.normalize_L2(X)
        index = faiss.GpuIndexFlatIP(res, d, flat_config)
    else:
        index = faiss.GpuIndexFlatL2(res, d, flat_config)   # build the index
    #normalize_L2(X)
    index.add(X) 
    N = X.shape[0]
    Nidx = index.ntotal

    c = time.time()
    D, I = index.search(X, k + 1)
    elapsed = time.time() - c
    LOG.info('kNN Search done in {:.2f} seconds'.format(elapsed), LOG.ll.UTILS)

    # Create the graph
    D = np.sqrt(D[:, 1:])
    I = I[:, 1:]
    row_idx = np.arange(N)
    row_idx_rep = np.tile(row_idx, (k, 1)).T
    W = scipy.sparse.csr_matrix((D.flatten('F'), (row_idx_rep.flatten('F'), I.flatten('F'))), shape=(N, N))

    W = __symmetrize_KNN(W, mode=mode)

    return W
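A hedged usage sketch for _faiss_knn, assuming numpy is imported as np and that the module-level LOG and __symmetrize_KNN helpers from the original project are available:

X = np.random.rand(500, 64).astype('float32')       # toy features
W = _faiss_knn(X, k=10, mode='mut')                  # sparse (500, 500) mutual-kNN affinity matrix
print(W.shape, W.nnz)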
Example #3
def run_kmeans(x,
               nmb_clusters,
               verbose=False,
               seed=DEFAULT_KMEANS_SEED,
               gpu_device=0):
    """
    Runs kmeans on 1 GPU.
    
    Args:
    -----
    x: data
    nmb_clusters (int): number of clusters
    
    Returns:
    --------
    list: cluster id assigned to each data point
    float: final k-means loss
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    clus.seed = seed
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = gpu_device

    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
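A small illustrative call (not from the original source). It assumes DEFAULT_KMEANS_SEED is defined at module level, a GPU is available, and an older faiss build where Clustering exposes .obj; faiss expects float32 input:

import numpy as np

x = np.random.rand(10000, 256).astype('float32')
assignments, final_loss = run_kmeans(x, nmb_clusters=100, verbose=True, gpu_device=0)
print(len(assignments), final_loss)                  # 10000 cluster ids and the last k-means objective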
Example #4
def train_kmeans(x, k, ngpu, max_points_per_centroid=256):
    "Runs kmeans on one or several GPUs"
    d = x.shape[1]
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 20
    clus.max_points_per_centroid = max_points_per_centroid

    if ngpu == 0:
        index = faiss.IndexFlatL2(d)
    else:
        res = [faiss.StandardGpuResources() for i in range(ngpu)]

        flat_config = []
        for i in range(ngpu):
            cfg = faiss.GpuIndexFlatConfig()
            cfg.useFloat16 = False
            cfg.device = i
            flat_config.append(cfg)

        if ngpu == 1:
            index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
        else:
            indexes = [
                faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                for i in range(ngpu)
            ]
            index = faiss.IndexReplicas()
            for sub_index in indexes:
                index.addIndex(sub_index)

    # perform the training
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    obj = faiss.vector_float_to_array(clus.obj)
    print "final objective: %.4g" % obj[-1]

    return centroids.reshape(k, d)
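Illustrative usage of train_kmeans (a sketch only; it assumes an older faiss build where Clustering exposes .obj and, for ngpu >= 1, that many visible GPUs):

import numpy as np

x = np.random.rand(50000, 128).astype('float32')
centroids = train_kmeans(x, k=256, ngpu=1)           # (256, 128) array of centroids
print(centroids.shape)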
Example #5
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        (list: ids of data in each cluster, float: loss value)
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)

    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)

    # losses = faiss.vector_to_array(clus.obj)

    stats = clus.iteration_stats
    losses = np.array([stats.at(i).obj for i in range(stats.size())])

    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
def main():
    dirname = os.path.dirname(__file__)
    output_dir = os.path.join(dirname, 'features')

    train_ims = load_h5('train_ims', os.path.join(output_dir, 'trainIms.h5'))
    train_classes = load_h5('train_classes',
                            os.path.join(output_dir, 'trainClasses.h5'))
    train_feats = load_h5('train_feats',
                          os.path.join(output_dir, 'trainFeats.h5'))

    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.device = 3  # specify which GPU to use

    gpu_index = faiss.GpuIndexFlatIP(res, train_feats.shape[1], flat_config)
    for feat in train_feats:
        gpu_index.add(np.expand_dims(feat, 0))

    csv_dir = os.path.join(dirname, 'csv_output')
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)

    occlusion_levels = [
        'unoccluded', 'low_occlusions', 'medium_occlusions', 'high_occlusions'
    ]
    for occlusion in occlusion_levels:
        with open(os.path.join(csv_dir, occlusion + '.csv'), 'w') as csv_file:
            test_output_dir = os.path.join(output_dir, occlusion)
            test_ims = load_h5('test_ims',
                               os.path.join(test_output_dir, 'testIms.h5'))
            test_feats = load_h5('test_feats',
                                 os.path.join(test_output_dir, 'testFeats.h5'))
            for imId, ft in zip(test_ims, test_feats):
                result_dists, result_inds = gpu_index.search(
                    np.expand_dims(ft, 0).astype('float32'), 100)
                result_im_inds = train_ims[result_inds[0]]
                csv_line = str(imId) + ',' + ','.join(
                    [str(r) for r in result_im_inds]) + '\n'
                csv_file.write(csv_line)
def get_idxs_and_dists(query_features, index_features, BS=32):
    import faiss
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.device = 0
    res = faiss.StandardGpuResources()
    co = faiss.GpuClonerOptions()
    FEAT_DIM = index_features.shape[1]
    cpu_index = faiss.IndexFlatL2(FEAT_DIM)
    cpu_index.add(index_features)
    index = faiss.index_cpu_to_gpu(res, 0, cpu_index, co)
    out_dists = np.zeros((len(query_features), 100), dtype=np.float32)
    out_idxs = np.zeros((len(query_features), 100), dtype=np.int32)
    NUM_QUERY = len(query_features)
    for ind in progress_bar(range(0, len(query_features), BS)):
        fin = ind + BS
        if fin > NUM_QUERY:
            fin = NUM_QUERY
        q_descs = query_features[ind:fin]
        D, I = index.search(q_descs, 100)
        out_dists[ind:fin] = D
        out_idxs[ind:fin] = I
    return out_idxs, out_dists
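A hedged usage sketch for get_idxs_and_dists; it assumes a progress_bar helper (for example fastai's) is importable in the surrounding module and that both feature matrices are float32:

import numpy as np

index_feats = np.random.rand(20000, 512).astype('float32')
query_feats = np.random.rand(1000, 512).astype('float32')
idxs, dists = get_idxs_and_dists(query_feats, index_feats, BS=256)
print(idxs.shape, dists.shape)                       # (1000, 100) (1000, 100)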
Example #8
def get_nn_avg_dist(src, tgt, knn):
    """
    Compute the average distance of the `knn` nearest neighbors
    for a given set of embeddings and queries.
    Use Faiss if available.
    """
    #print(FAISS_AVAILABLE)
    #if FAISS_AVAILABLE:        
    if hasattr(faiss, 'StandardGpuResources'):
        # gpu mode
        res = faiss.StandardGpuResources()
        config = faiss.GpuIndexFlatConfig()
        config.device = 0
        index = faiss.GpuIndexFlatIP(res, tgt.shape[1], config)
        logger.info("faiss gpu mode!")
    else:
        # cpu mode
        index = faiss.IndexFlatIP(tgt.shape[1])
    index.add(src)
    distances, _ = index.search(src, knn)
    return distances.mean(1)
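Illustrative call (assumes float32 numpy inputs and a module-level logger); note that as written the function indexes and queries src, using tgt only for its dimensionality:

import numpy as np

src = np.random.rand(2000, 300).astype('float32')
tgt = np.random.rand(2000, 300).astype('float32')
avg = get_nn_avg_dist(src, tgt, knn=10)              # mean inner-product score over the 10 nearest neighbors
print(avg.shape)                                     # (2000,)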
    """
Example #9
def get_knn(inst_embeddings, label_embeddings, accelerator, top_k=100, bsz=65536):
	accelerator.print("FAISS")
	# logging.info("FAISS indexer building")
	res = faiss.StandardGpuResources()
	flat_config = faiss.GpuIndexFlatConfig()
	flat_config.useFloat16 = False
	flat_config.device = accelerator.local_process_index
	indexer = faiss.GpuIndexFlatIP(res, inst_embeddings.shape[1], flat_config)
	indexer.add(label_embeddings)
	# logging.info("FAISS indexer searching")
	num_inst = inst_embeddings.shape[0]
	nr_batch = int(math.ceil(num_inst / bsz))
	D_list, I_list = [], []
	accelerator.print("index")
	for bidx in tqdm(range(nr_batch)):
		sidx = bidx * bsz
		eidx = min((bidx + 1) * bsz, num_inst)
		D, I = indexer.search(inst_embeddings[sidx:eidx], top_k)
		D_list.append(D)
		I_list.append(I)
	D = np.concatenate(D_list)
	I = np.concatenate(I_list)
	return D, I
Example #10
def get_index(feat_dim=384, gpus='0'):
    # the feature data is stored in this index; with huge datasets it can easily exhaust GPU memory
    os.environ['CUDA_VISIBLE_DEVICES'] = gpus
    # print(os.environ['CUDA_VISIBLE_DEVICES'])
    if gpus == '':
        ngpus = 0
    else:
        ngpus = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))
    if ngpus == 0:
        cpu_index = faiss.IndexFlatL2(feat_dim)
        gpu_index = cpu_index
    elif ngpus == 1:
        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.device = 0  # int(gpus[0])
        res = faiss.StandardGpuResources()
        gpu_index = faiss.GpuIndexFlatL2(res, feat_dim,
                                         flat_config)  # use one gpu; initialization is slow
    else:
        # print('use all gpu')
        cpu_index = faiss.IndexFlatL2(feat_dim)
        gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)  # use all gpus
    index = gpu_index
    return index
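A quick illustrative round trip with the returned index (float32 data assumed):

import numpy as np

index = get_index(feat_dim=384, gpus='0')
feats = np.random.rand(10000, 384).astype('float32')
index.add(feats)
D, I = index.search(feats[:5], 10)                   # distances and ids of the 10 nearest vectors for 5 queries
print(D.shape, I.shape)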
Example #11
def cluster(features, num_cluster):
    n_samples, dim = features.shape

    kmeans_clustering = faiss.Clustering(dim, num_cluster)
    kmeans_clustering.niter = 20
    kmeans_clustering.max_points_per_centroid = 1000000000

    gpu_resource = faiss.StandardGpuResources()
    gpu_flat = faiss.GpuIndexFlatConfig()
    gpu_flat.useFloat16 = False
    gpu_flat.device = 0

    gpu_distance_measure = faiss.GpuIndexFlatL2(gpu_resource, dim, gpu_flat)

    kmeans_clustering.train(features, gpu_distance_measure)
    _, cluster_idxs = gpu_distance_measure.search(features, 1)
    losses = faiss.vector_to_array(kmeans_clustering.obj)

    image_list = [[] for i in range(num_cluster)]
    for i in range(len(features)):
        image_list[cluster_idxs[i][0]].append(i)

    return image_list, losses[-1]
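Illustrative call for cluster (a sketch assuming an older faiss where Clustering exposes .obj; input must be float32):

import numpy as np

features = np.random.rand(5000, 128).astype('float32')
image_list, final_loss = cluster(features, num_cluster=50)
print(len(image_list), final_loss)                   # 50 lists of sample indices and the final objective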
Example #12
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 10
    clus.max_points_per_centroid = 10000000
    clus.verbose = True
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    centroids = np.array(faiss.vector_to_array(clus.centroids))
    centroids = np.reshape(centroids, (-1, 64))
    centd = pdist(centroids)
    centroids2d = TSNE(n_components=2).fit_transform(centroids)
    plt.scatter(centroids2d[:, 0], centroids2d[:, 1])
    axes = plt.gca()
    axes.set_xlim([-50, 50])
    axes.set_ylim([-50, 50])
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1], plt, np.mean(centd)
Example #13
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.nredo = 10

    clus.max_points_per_centroid = int(3 * (n_data / nmb_clusters))  #10000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # import pdb; pdb.set_trace()

    # perform the training
    clus.train(x, index)

    D, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)

    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    # import pdb; pdb.set_trace()

    return [int(n[0]) for n in I], [float(n[0]) for n in D
                                    ], losses[-1], clus, index, flat_config
Example #14
    def test_IndexIVFPQ(self):
        (xt, xb, xq) = self.get_dataset()
        d = xt.shape[1]

        dev_no = 0
        usePrecomputed = True

        res = faiss.StandardGpuResources()

        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.device = dev_no

        gt_index = faiss.GpuIndexFlatL2(res, d, flat_config)
        gt_index.add(xb)
        D, gt_nns = gt_index.search(xq, 1)

        coarse_quantizer = faiss.IndexFlatL2(d)
        ncentroids = int(np.sqrt(xb.shape[0])) * 4

        index = faiss.IndexIVFPQ(coarse_quantizer, d, ncentroids, 32, 8)
        # add implemented on GPU but not train
        index.train(xt)

        ivfpq_config = faiss.GpuIndexIVFPQConfig()
        ivfpq_config.device = dev_no
        ivfpq_config.usePrecomputedTables = usePrecomputed

        gpuIndex = faiss.GpuIndexIVFPQ(res, index, ivfpq_config)
        gpuIndex.setNumProbes(64)
        index.add(xb)

        D, nns = index.search(xq, 10)
        n_ok = (nns == gt_nns).sum()
        nq = xq.shape[0]
        print(ncentroids, n_ok, nq)

        self.assertGreater(n_ok, nq * 0.2)
Example #15
def search_and_on(idx,base,query,now_list):
  base = base.astype(np.float32)
  query = query.astype(np.float32)
  # we need only a StandardGpuResources per GPU
  res = faiss.StandardGpuResources()

  flat_config = faiss.GpuIndexFlatConfig()
  flat_config.device = 0

  # NOTE: d (feature dimension) and k (number of neighbors) are module-level globals in the original code
  index = faiss.GpuIndexFlatIP(res, d, flat_config)
  index.add(base)
  D, I = index.search(query, k)
  I = I.tolist()[0]
  #to map to ori idx,similar is the indx for similar images map to original idx,left_base
  #is the next base for search,left_list is the next list for chose query
  similar = []
  for i in range(len(I)):
    idx_this = now_list[I[i]]
    similar.append(idx_this)
  similar.append(idx)
  left_list = [x for x in now_list if x not in similar]
  left_base = np.delete(base, I, axis=0)
  return similar, left_base, left_list
    def _set_block_index(self):
        """Create a Faiss Flat index with inner product as the metric to search against"""
        try:
            import faiss
        except ImportError:
            raise Exception(
                "Error: Please install faiss to use FaissMIPSIndex")

        if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
            print("\n> Building index", flush=True)
        self.block_mips_index = faiss.index_factory(self.embed_size, 'Flat',
                                                    faiss.METRIC_INNER_PRODUCT)

        if self.use_gpu:
            # create resources and config for GpuIndex
            res = faiss.StandardGpuResources()
            config = faiss.GpuIndexFlatConfig()
            config.device = torch.cuda.current_device()
            config.useFloat16 = True

            self.block_mips_index = faiss.GpuIndexFlat(res,
                                                       self.block_mips_index,
                                                       config)
            if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
                print(">> Initialized index on GPU {}".format(
                    self.block_mips_index.getDevice()),
                      flush=True)
        else:
            # CPU index supports IDs so wrap with IDMap
            self.block_mips_index = faiss.IndexIDMap(self.block_mips_index)
            if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
                print(">> Initialized index on CPU", flush=True)

        # if we were constructed with a BlockData, then automatically load it when the FAISS structure is built
        if self.block_data is not None:
            self.add_block_embed_data(self.block_data)
Example #17
def get_faiss_nearest_neighbours(emb_src, emb_wrt, k, gpu_device=0):
    """
    Gets source points'/embeddings' nearest neighbours with respect to a set of target embeddings.
    inputs:
        :param emb_src (np.ndarray) : the source embedding matrix
        :param emb_wrt (np.ndarray) : the embedding matrix in which nearest neighbours are to be found
        :param k (int) : the number of nearest neighbours to find
        :param gpu_device (int) : the GPU to use; a negative value falls back to a CPU index
    outputs:
        :returns distance (np.ndarray) : [len(emb_src), k] matrix of distance of each source point to each of its k 
            nearest neighbours
        :returns indices (np.ndarray) : [len(emb_src), k] matrix of indices of each source point to each of its k 
            nearest neighbours
    """
    if gpu_device >= 0:
        res = faiss.StandardGpuResources()
        cfg = faiss.GpuIndexFlatConfig()
        cfg.device = gpu_device
        index = faiss.GpuIndexFlatIP(res, emb_wrt.shape[1], cfg)
    else:
        index = faiss.IndexFlatIP(emb_wrt.shape[1])
    index.add(emb_wrt.astype('float32'))
    return index.search(emb_src.astype('float32'), k)
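Illustrative usage of get_faiss_nearest_neighbours (pass a negative gpu_device to stay on the CPU index):

import numpy as np

emb_src = np.random.rand(1000, 300).astype('float32')
emb_wrt = np.random.rand(5000, 300).astype('float32')
dists, idxs = get_faiss_nearest_neighbours(emb_src, emb_wrt, k=10, gpu_device=-1)
print(dists.shape, idxs.shape)                       # (1000, 10) (1000, 10)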
Example #18
    def run_kmeans(x, nmb_clusters):
        n_data, d = x.shape

        # faiss implementation of k-means
        clus = faiss.Clustering(d, nmb_clusters)

        # Change faiss seed at each k-means so that the randomly picked
        # initialization centroids do not correspond to the same feature ids
        # from an epoch to another.
        clus.seed = np.random.randint(1234)

        clus.niter = 20
        clus.max_points_per_centroid = 10000000
        res = faiss.StandardGpuResources()
        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.useFloat16 = False
        flat_config.device = 0
        index = faiss.GpuIndexFlatL2(res, d, flat_config)

        # perform the training
        clus.train(x, index)
        _, I = index.search(x, 1)

        return [int(n[0]) for n in I]
def get_nn_avg_dist(emb, query, knn):
    """
    Compute the average distance of the `knn` nearest neighbors
    for a given set of embeddings and queries.
    Use Faiss if available.
    """
    if FAISS_AVAILABLE:
        emb = emb.cpu().numpy()
        query = query.cpu().numpy()
        if hasattr(faiss, 'StandardGpuResources'):
            # gpu mode
            res = faiss.StandardGpuResources()
            config = faiss.GpuIndexFlatConfig()
            config.device = 0
            index = faiss.GpuIndexFlatIP(res, emb.shape[1], config)
        else:
            # cpu mode
            index = faiss.IndexFlatIP(emb.shape[1])
        index.add(emb)
        distances, _ = index.search(query, knn)
        return distances.mean(1)
    else:
        print("Will not run this, too slow without Faiss")
        return 0
        bs = 1024
        all_distances = []
        emb = emb.transpose(0, 1).contiguous()
        for i in range(0, query.shape[0], bs):
            distances = query[i:i + bs].mm(emb)
            best_distances, _ = distances.topk(knn,
                                               dim=1,
                                               largest=True,
                                               sorted=True)
            all_distances.append(best_distances.mean(1).cpu())
        all_distances = torch.cat(all_distances)
        return all_distances.numpy()
Example #20
def test_knn_search(size=10000, gpu_id=None):
    x = np.random.rand(size, 512)
    x = x.reshape(x.shape[0], -1).astype('float32')
    d = x.shape[1]

    tic = time.time()
    if gpu_id is None:
        index = faiss.IndexFlatL2(d)
    else:
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = gpu_id

        flat_config = [cfg]
        resources = [faiss.StandardGpuResources()]
        index = faiss.GpuIndexFlatL2(resources[0], d, flat_config[0])
    index.add(x)
    print('Index built in {} sec'.format(time.time() - tic))
    distances, I = index.search(x, 21)
    print('Searched in {} sec'.format(time.time() - tic))
    print(distances.shape)
    print(I.shape)
    print(distances[:5])
    print(I[:5])
Example #21
 def _faiss_index_flat(self, dim):
     """Return initialized faiss.GpuIndexFlatL2"""
     res = faiss.StandardGpuResources()
     flat_config = faiss.GpuIndexFlatConfig()
     flat_config.device = self.gpu_id
     return faiss.GpuIndexFlatL2(res, dim, flat_config)
Example #22
def main(args):
    paddle.seed(12345)

    # load config
    config = load_yaml(args.config_yaml)
    config["config_abs_dir"] = args.abs_dir
    # load static model class
    dy_model_class = load_dy_model_class(config)

    use_gpu = config.get("runner.use_gpu", True)
    test_data_dir = config.get("runner.test_data_dir", None)
    print_interval = config.get("runner.print_interval", None)
    model_load_path = config.get("runner.infer_load_path", "model_output")
    start_epoch = config.get("runner.infer_start_epoch", 0)
    end_epoch = config.get("runner.infer_end_epoch", 10)
    batch_size = config.get("runner.infer_batch_size", None)
    os.environ["CPU_NUM"] = str(config.get("runner.thread_num", 1))

    logger.info("**************common.configs**********")
    logger.info(
        "use_gpu: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}".
        format(use_gpu, test_data_dir, start_epoch, end_epoch, print_interval,
               model_load_path))
    logger.info("**************common.configs**********")

    place = paddle.set_device('gpu' if use_gpu else 'cpu')

    dy_model = dy_model_class.create_model(config)
    test_dataloader = create_data_loader(
        config=config, place=place, mode="test")

    logger.info("read data")

    epoch_begin = time.time()
    interval_begin = time.time()

    for epoch_id in range(start_epoch, end_epoch):
        logger.info("load model epoch {}".format(epoch_id))
        model_path = os.path.join(model_load_path, str(epoch_id))
        load_model(model_path, dy_model)
        b = dy_model.item_emb.weight.numpy()

        import faiss
        if use_gpu:
            res = faiss.StandardGpuResources()
            flat_config = faiss.GpuIndexFlatConfig()
            flat_config.device = 0
            faiss_index = faiss.GpuIndexFlatIP(res, b.shape[-1], flat_config)
            faiss_index.add(b)
        else:
            faiss_index = faiss.IndexFlatIP(b.shape[-1])
            faiss_index.add(b)

        total = 1
        total_recall = 0.0
        total_ndcg = 0.0
        total_hitrate = 0

        for batch_id, batch_data in enumerate(test_dataloader()):

            user_embs, _ = dy_model_class.infer_forward(dy_model, None,
                                                        batch_data, config)

            user_embs = user_embs.numpy()
            target_items = np.squeeze(batch_data[-1].numpy(), axis=1)

            if len(user_embs.shape) == 2:
                D, I = faiss_index.search(user_embs, args.top_n)
                for i, iid_list in enumerate(target_items):
                    recall = 0
                    dcg = 0.0
                    item_list = set(I[i])
                    iid_list = list(filter(lambda x: x != 0, list(iid_list)))
                    for no, iid in enumerate(iid_list):
                        if iid in item_list:
                            recall += 1
                            dcg += 1.0 / math.log(no + 2, 2)
                    idcg = 0.0
                    for no in range(recall):
                        idcg += 1.0 / math.log(no + 2, 2)
                    total_recall += recall * 1.0 / len(iid_list)
                    if recall > 0:
                        total_ndcg += dcg / idcg
                        total_hitrate += 1
            else:
                ni = user_embs.shape[1]
                user_embs = np.reshape(user_embs, [-1, user_embs.shape[-1]])
                D, I = faiss_index.search(user_embs, args.top_n)
                for i, iid_list in enumerate(target_items):
                    recall = 0
                    dcg = 0.0
                    item_list_set = set()
                    item_list = list(
                        zip(
                            np.reshape(I[i * ni:(i + 1) * ni], -1),
                            np.reshape(D[i * ni:(i + 1) * ni], -1)))
                    item_list.sort(key=lambda x: x[1], reverse=True)
                    for j in range(len(item_list)):
                        if item_list[j][0] not in item_list_set and item_list[
                                j][0] != 0:
                            item_list_set.add(item_list[j][0])
                            if len(item_list_set) >= args.top_n:
                                break
                    iid_list = list(filter(lambda x: x != 0, list(iid_list)))
                    for no, iid in enumerate(iid_list):
                        if iid == 0:
                            break
                        if iid in item_list_set:
                            recall += 1
                            dcg += 1.0 / math.log(no + 2, 2)
                    idcg = 0.0
                    for no in range(recall):
                        idcg += 1.0 / math.log(no + 2, 2)

                    total_recall += recall * 1.0 / len(iid_list)
                    if recall > 0:
                        total_ndcg += dcg / idcg
                        total_hitrate += 1
            total += target_items.shape[0]

            if batch_id % print_interval == 0:
                recall = total_recall / total
                ndcg = total_ndcg / total
                hitrate = total_hitrate * 1.0 / total
                metric_str = ""
                metric_str += "recall@%d: %.5f, " % (args.top_n, recall)
                metric_str += "ndcg@%d: %.5f, " % (args.top_n, ndcg)
                metric_str += "hitrate@%d: %.5f, " % (args.top_n, hitrate)
                logger.info("epoch: {}, batch_id: {}, ".format(
                    epoch_id, batch_id) + metric_str + "speed: {:.2f} ins/s".
                            format(print_interval * batch_size / (time.time(
                            ) - interval_begin)))

        recall = total_recall / total
        ndcg = total_ndcg / total
        hitrate = total_hitrate * 1.0 / total
        metric_str = ""
        metric_str += "recall@%d: %.5f, " % (args.top_n, recall)
        metric_str += "ndcg@%d: %.5f, " % (args.top_n, ndcg)
        metric_str += "hitrate@%d: %.5f, " % (args.top_n, hitrate)

        logger.info("epoch: {} done, ".format(epoch_id) + metric_str +
                    "epoch time: {:.2f} s".format(time.time() - epoch_begin))
Example #23
0
def main(pretrained_net, whichGPU):
    if not 'ilsvrc2012' in pretrained_net:
        iterStr = pretrained_net.split('-')[-1]
        splitStr = pretrained_net.split('/')
        output_dir = os.path.join(
            '/'.join(splitStr[:np.where(np.array(splitStr) == 'ckpts')[0][0]]),
            'results_small', iterStr)
    else:
        iterStr = 'ilsvrc2012'
        output_dir = os.path.join('./output/ilsvrc2012/results_small', iterStr)

    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.device = int(whichGPU)

    train_feats = load_h5('train_feats',
                          os.path.join(output_dir, 'trainFeats.h5'))
    train_classes = load_h5('train_classes',
                            os.path.join(output_dir, 'trainClasses.h5'))
    train_ims = load_h5('train_ims', os.path.join(output_dir, 'trainIms.h5'))
    gpu_index = faiss.GpuIndexFlatIP(res, train_feats.shape[1], flat_config)
    for feat in train_feats:
        gpu_index.add(np.expand_dims(feat, 0))

    test_datasets = [
        './input/test/small_test_by_hotel.txt',
        './input/occluded_test_small/by_hotel/0.txt',
        './input/occluded_test/by_hotel_small/1.txt',
        './input/occluded_test_small/by_hotel/2.txt',
        './input/occluded_test_small/by_hotel/3.txt'
    ]
    test_names = [
        'by_hotel', 'occluded0', 'occluded1', 'occluded2', 'occluded3'
    ]
    for test_dataset, test_name in zip(test_datasets, test_names):
        test_output_dir = os.path.join(output_dir, test_name)
        test_feats = load_h5('test_feats',
                             os.path.join(test_output_dir, 'testFeats.h5'))
        test_ims = load_h5('test_ims',
                           os.path.join(test_output_dir, 'testIms.h5'))
        test_classes = load_h5('test_classes',
                               os.path.join(test_output_dir, 'testClasses.h5'))
        unique_classes = np.unique(train_classes)
        unique_classes_sorted = np.argsort(unique_classes)
        classification_scores = np.zeros(
            (test_feats.shape[0], unique_classes.shape[0]))
        for aa in range(0, test_feats.shape[0], 100):
            # print aa, ' out of ', test_feats.shape[0]
            ff = test_feats[aa:aa + 100, :]
            result_dists, result_inds = gpu_index.search(
                ff.astype('float32'), 1000)
            row_sums = result_dists.sum(axis=1)
            result_dists_normalized = result_dists / row_sums[:, np.newaxis]
            result_classes = train_classes[result_inds]
            resultInfo = [[
                unique_classes_sorted[np.searchsorted(
                    unique_classes[unique_classes_sorted],
                    np.unique(r, return_index=True)[0])],
                d[np.unique(r, return_index=True)[1]]
            ] for r, d in zip(result_classes, result_dists_normalized)]
            for idx in range(len(resultInfo)):
                classification_scores[aa + idx,
                                      resultInfo[idx][0]] = resultInfo[idx][1]
        sorted_classes = np.zeros(
            (test_feats.shape[0], unique_classes.shape[0]))
        for idx in range(test_feats.shape[0]):
            # print idx, ' out of ', test_feats.shape[0]
            sorted_classes[idx, :] = np.argsort(-classification_scores[idx])
        correct_cls_to_unique_ind = unique_classes_sorted[np.searchsorted(
            unique_classes[unique_classes_sorted], test_classes)]
        top_k = np.zeros((test_feats.shape[0], unique_classes.shape[0]))
        for idx in range(test_feats.shape[0]):
            topResult = np.where(
                sorted_classes[idx] == correct_cls_to_unique_ind[idx])[0][0]
            top_k[idx, topResult:] = 1
        average_accuracy = np.mean(top_k, axis=0)
        # save_h5('top_ims',top_ims,'i8',os.path.join(test_output_dir,'top_ims.h5'))
        save_h5('top_k', top_k, 'f', os.path.join(test_output_dir, 'top_k.h5'))
        save_h5('average_accuracy', average_accuracy, 'f',
                os.path.join(test_output_dir, 'average_accuracy.h5'))
        # print iterStr, test_name, average_accuracy[0], average_accuracy[9], average_accuracy[99]
        print('%s, %s, %0.2f, %0.2f, %0.2f' % (
            iterStr, test_name, 100. * average_accuracy[0],
            100. * average_accuracy[9], 100. * average_accuracy[99]))

    import json
    jsonTestData = json.load(open('./input/test_set.json'))
    jsonTrainData = json.load(open('./input/train_set.json'))

    cls_to_chain = {}
    for hotel in jsonTrainData.keys():
        if jsonTrainData[hotel]['chainId'] != -1:
            cls_to_chain[int(hotel)] = jsonTrainData[hotel]['chainId']

    for hotel in jsonTestData.keys():
        if jsonTestData[hotel]['chainId'] != -1 and int(
                hotel) not in cls_to_chain.keys():
            cls_to_chain[int(hotel)] = jsonTestData[hotel]['chainId']

    by_chain_inds = np.where(
        np.in1d(train_classes, list(cls_to_chain.keys())) == True)[0]

    del gpu_index

    train_feats2 = train_feats[by_chain_inds, :]
    train_classes2 = train_classes[by_chain_inds]
    train_ims2 = train_ims[by_chain_inds]

    train_class_to_chain = np.array(
        [cls_to_chain[cls] for cls in train_classes2])

    gpu_index = faiss.GpuIndexFlatIP(res, train_feats2.shape[1], flat_config)
    for feat in train_feats2:
        gpu_index.add(np.expand_dims(feat, 0))

    test_datasets = [
        './input/test/small_test_by_chain.txt',
        './input/occluded_test_small/by_chain/0.txt',
        './input/occluded_test_small/by_chain/1.txt',
        './input/occluded_test_small/by_chain/2.txt',
        './input/occluded_test_small/by_chain/3.txt'
    ]
    test_names = [
        'by_chain', 'by_chain_occluded0', 'by_chain_occluded1',
        'by_chain_occluded2', 'by_chain_occluded3'
    ]
    for test_dataset, test_name in zip(test_datasets, test_names):
        test_output_dir = os.path.join(output_dir, test_name)
        test_feats = load_h5('test_feats',
                             os.path.join(test_output_dir, 'testFeats.h5'))
        test_ims = load_h5('test_ims',
                           os.path.join(test_output_dir, 'testIms.h5'))
        test_classes = load_h5('test_classes',
                               os.path.join(test_output_dir, 'testClasses.h5'))
        test_class_to_chain = np.array(
            [cls_to_chain[cls] for cls in test_classes])
        unique_chains = np.unique(train_class_to_chain)
        unique_chains_sorted = np.argsort(unique_chains)
        chain_classification_scores = np.zeros(
            (test_feats.shape[0], unique_chains.shape[0]))
        for aa in range(0, test_feats.shape[0], 100):
            # print aa, ' out of ', test_feats.shape[0]
            ff = test_feats[aa:aa + 100, :]
            result_dists, result_inds = gpu_index.search(
                ff.astype('float32'), 1000)
            row_sums = result_dists.sum(axis=1)
            result_dists_normalized = result_dists / row_sums[:, np.newaxis]
            result_chains = train_class_to_chain[result_inds]
            resultInfo = [[
                unique_chains_sorted[np.searchsorted(
                    unique_chains[unique_chains_sorted],
                    np.unique(r, return_index=True)[0])],
                d[np.unique(r, return_index=True)[1]]
            ] for r, d in zip(result_chains, result_dists_normalized)]
            for idx in range(len(resultInfo)):
                chain_classification_scores[
                    aa + idx, resultInfo[idx][0]] = resultInfo[idx][1]
        sorted_chains = np.zeros((test_feats.shape[0], unique_chains.shape[0]))
        for idx in range(test_feats.shape[0]):
            # print idx, ' out of ', test_feats.shape[0]
            sorted_chains[idx, :] = np.argsort(
                -chain_classification_scores[idx])
        correct_chain_to_unique_ind = unique_chains_sorted[np.searchsorted(
            unique_chains[unique_chains_sorted], test_class_to_chain)]
        top_k = np.zeros((test_feats.shape[0], unique_chains.shape[0]))
        for idx in range(test_feats.shape[0]):
            # print idx, ' out of ', test_feats.shape[0]
            topResult = np.where(sorted_chains[idx].astype('int') ==
                                 correct_chain_to_unique_ind[idx])[0][0]
            top_k[idx, topResult:] = 1
        average_accuracy = np.mean(top_k, axis=0)
        # save_h5('top_ims',top_ims,'i8',os.path.join(test_output_dir,'top_ims.h5'))
        save_h5('top_k', top_k, 'f', os.path.join(test_output_dir, 'top_k.h5'))
        save_h5('average_accuracy', average_accuracy, 'f',
                os.path.join(test_output_dir, 'average_accuracy.h5'))
        # print iterStr, test_name, average_accuracy[0], average_accuracy[2], average_accuracy[4], average_accuracy[9]
        print('%s, %s, %0.2f, %0.2f, %0.2f, %0.2f' % (
            iterStr, test_name, 100. * average_accuracy[0],
            100. * average_accuracy[2], 100. * average_accuracy[4],
            100. * average_accuracy[9]))
Example #24
def evaluate_full(sess,
                  test_data,
                  model,
                  model_path,
                  batch_size,
                  item_cate_map,
                  save=True,
                  coef=None):
    topN = args.topN

    item_embs = model.output_item(sess)

    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.device = 0

    try:
        gpu_index = faiss.GpuIndexFlatIP(res, args.embedding_dim, flat_config)
        gpu_index.add(item_embs)
    except Exception as e:
        return {}

    total = 0
    total_recall = 0.0
    total_ndcg = 0.0
    total_hitrate = 0
    total_map = 0.0
    total_diversity = 0.0
    for src, tgt in test_data:
        nick_id, item_id, hist_item, hist_mask = prepare_data(src, tgt)

        user_embs = model.output_user(sess, [hist_item, hist_mask])

        if len(user_embs.shape) == 2:
            D, I = gpu_index.search(user_embs, topN)
            for i, iid_list in enumerate(item_id):
                recall = 0
                dcg = 0.0
                item_list = set(I[i])
                for no, iid in enumerate(iid_list):
                    if iid in item_list:
                        recall += 1
                        dcg += 1.0 / math.log(no + 2, 2)
                idcg = 0.0
                for no in range(recall):
                    idcg += 1.0 / math.log(no + 2, 2)
                total_recall += recall * 1.0 / len(iid_list)
                if recall > 0:
                    total_ndcg += dcg / idcg
                    total_hitrate += 1
                if not save:
                    total_diversity += compute_diversity(I[i], item_cate_map)
        else:
            ni = user_embs.shape[1]
            user_embs = np.reshape(user_embs, [-1, user_embs.shape[-1]])
            D, I = gpu_index.search(user_embs, topN)
            for i, iid_list in enumerate(item_id):
                recall = 0
                dcg = 0.0
                item_list_set = set()
                if coef is None:
                    item_list = list(
                        zip(np.reshape(I[i * ni:(i + 1) * ni], -1),
                            np.reshape(D[i * ni:(i + 1) * ni], -1)))
                    item_list.sort(key=lambda x: x[1], reverse=True)
                    for j in range(len(item_list)):
                        if item_list[j][0] not in item_list_set and item_list[
                                j][0] != 0:
                            item_list_set.add(item_list[j][0])
                            if len(item_list_set) >= topN:
                                break
                else:
                    origin_item_list = list(
                        zip(np.reshape(I[i * ni:(i + 1) * ni], -1),
                            np.reshape(D[i * ni:(i + 1) * ni], -1)))
                    origin_item_list.sort(key=lambda x: x[1], reverse=True)
                    item_list = []
                    tmp_item_set = set()
                    for (x, y) in origin_item_list:
                        if x not in tmp_item_set and x in item_cate_map:
                            item_list.append((x, y, item_cate_map[x]))
                            tmp_item_set.add(x)
                    cate_dict = defaultdict(int)
                    for j in range(topN):
                        max_index = 0
                        max_score = item_list[0][1] - coef * cate_dict[
                            item_list[0][2]]
                        for k in range(1, len(item_list)):
                            if item_list[k][1] - coef * cate_dict[
                                    item_list[k][2]] > max_score:
                                max_index = k
                                max_score = item_list[k][1] - coef * cate_dict[
                                    item_list[k][2]]
                            elif item_list[k][1] < max_score:
                                break
                        item_list_set.add(item_list[max_index][0])
                        cate_dict[item_list[max_index][2]] += 1
                        item_list.pop(max_index)

                for no, iid in enumerate(iid_list):
                    if iid in item_list_set:
                        recall += 1
                        dcg += 1.0 / math.log(no + 2, 2)
                idcg = 0.0
                for no in range(recall):
                    idcg += 1.0 / math.log(no + 2, 2)
                total_recall += recall * 1.0 / len(iid_list)
                if recall > 0:
                    total_ndcg += dcg / idcg
                    total_hitrate += 1
                if not save:
                    total_diversity += compute_diversity(
                        list(item_list_set), item_cate_map)

        total += len(item_id)

    recall = total_recall / total
    ndcg = total_ndcg / total
    hitrate = total_hitrate * 1.0 / total
    diversity = total_diversity * 1.0 / total

    if save:
        return {'recall': recall, 'ndcg': ndcg, 'hitrate': hitrate}
    return {
        'recall': recall,
        'ndcg': ndcg,
        'hitrate': hitrate,
        'diversity': diversity
    }
def kmeans(features, nclusters, num_iters, ngpu, njobs, seed):
    """
    Run k-means on features, generating nclusters clusters. It will use, in order of preference, Faiss, pomegranate, or
    scikit-learn.

    :param features: Features to cluster.
    :param nclusters: Number of clusters to generate.
    :param num_iters: Maximum number of iterations to perform.
    :param ngpu: Number of GPUs to use (if GPUs are available).
    :param njobs: Number of threads to use.
    :param seed: Seed for reproducibility.
    :return: centroids: The centroids found with k-means.
    """
    print('Running k-means...')
    if USE_FAISS:
        d = features.shape[1]
        pca_features = np.ascontiguousarray(features).astype('float32')

        clus = faiss.Clustering(d, nclusters)
        clus.verbose = True
        clus.niter = num_iters
        if seed is not None:
            clus.seed = seed

        # otherwise the kmeans implementation sub-samples the training set
        clus.max_points_per_centroid = 10000000

        if USE_GPU:
            res = [faiss.StandardGpuResources() for i in range(ngpu)]

            flat_config = []
            for i in range(ngpu):
                cfg = faiss.GpuIndexFlatConfig()
                cfg.useFloat16 = False
                cfg.device = i
                flat_config.append(cfg)

            if ngpu == 1:
                index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
            else:
                indexes = [
                    faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                    for i in range(ngpu)
                ]
                index = faiss.IndexProxy()
                for sub_index in indexes:
                    index.addIndex(sub_index)
        else:
            index = faiss.IndexFlatL2(d)

        clus.train(pca_features, index)
        centroids = faiss.vector_float_to_array(clus.centroids)
        centroids = centroids.reshape(nclusters, d)

    elif USE_POMEGRANATE and seed is None:
        kmeans = pomegranate.kmeans.Kmeans(nclusters,
                                           init='kmeans++',
                                           n_init=10)
        kmeans.fit(features, max_iterations=num_iters, n_jobs=njobs)
        centroids = kmeans.centroids
    else:
        if USE_POMEGRANATE and seed is not None:
            print(
                'Pomegranate does not currently support k-means with a seed. Switching to scikit-learn instead.'
            )
        print('Using scikit-learn. This may be slow!')
        kmeans = sklearn.cluster.KMeans(n_clusters=nclusters,
                                        random_state=seed).fit(features)
        centroids = kmeans.cluster_centers_

    return centroids
    def pool_kmean_init_gpu(self, seed=0, gpu_num=0, temperature=1):
        """TODO: clear up
        perform kmeans for cluster concept pool initialization
        Args:
            x: data to be clustered
        """

        print('performing kmeans clustering')
        results = {'im2cluster': [], 'centroids': [], 'density': []}
        x = self.concept_pool.clone().cpu().numpy().T
        x = np.ascontiguousarray(x)
        num_cluster = self.num_k
        # intialize faiss clustering parameters
        d = x.shape[1]
        k = int(num_cluster)
        clus = faiss.Clustering(d, k)
        clus.verbose = True
        clus.niter = 100
        clus.nredo = 10
        clus.seed = seed
        clus.max_points_per_centroid = 1000
        clus.min_points_per_centroid = 10

        res = faiss.StandardGpuResources()
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = gpu_num
        index = faiss.GpuIndexFlatL2(res, d, cfg)

        clus.train(x, index)

        D, I = index.search(
            x, 1)  # for each sample, find cluster distance and assignments
        im2cluster = [int(n[0]) for n in I]

        # get cluster centroids
        centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)

        # sample-to-centroid distances for each cluster
        Dcluster = [[] for c in range(k)]
        for im, i in enumerate(im2cluster):
            Dcluster[i].append(D[im][0])

        # concentration estimation (phi)
        density = np.zeros(k)
        for i, dist in enumerate(Dcluster):
            if len(dist) > 1:
                d = (np.asarray(dist)**0.5).mean() / np.log(len(dist) + 10)
                density[i] = d

        #if cluster only has one point, use the max to estimate its concentration
        dmax = density.max()
        for i, dist in enumerate(Dcluster):
            if len(dist) <= 1:
                density[i] = dmax

        density = density.clip(np.percentile(density, 10),
                               np.percentile(
                                   density,
                                   90))  #clamp extreme values for stability
        print(density.mean())
        density = temperature * density / density.mean(
        )  #scale the mean to temperature

        # convert to cuda Tensors for broadcast
        centroids = torch.Tensor(centroids)
        centroids = nn.functional.normalize(centroids, p=2, dim=1)

        im2cluster = torch.LongTensor(im2cluster)
        density = torch.Tensor(density)

        results['centroids'].append(centroids)
        results['density'].append(density)
        results['im2cluster'].append(im2cluster)

        del cfg, res, index, clus

        # rearrange
        self.structure_memory_bank(results)
        print("Finish kmean init...")
        del results
    def synthesis(self, Z, X, W, init=True, dimMax=200):
        """
        Z: input texture (r,c,ch)
        X: output (r,c,ch)
        W: width of a patch
        """
        SynthInfo = {}
        step = W // 2

        Zr, Zc, ch = Z.shape
        Z_viewSize = (Zr - step * 2, Zc - step * 2, W, W, ch)
        Z_strides = Z.strides[:2] + Z.strides

        # 5-D view of every (W, W, ch) block that can be taken from Z, arranged by row and column
        blocks = as_strided(Z, Z_viewSize, Z_strides)
        r, c = blocks.shape[:2]

        N = r * c
        p_dim = W * W * ch
        allBlockVecs = blocks.reshape(N, p_dim)

        self.pca = None
        if p_dim > dimMax:
            self.pca = PCA(n_components=dimMax)
            self.pca = self.pca.fit(allBlockVecs)
            DB = self.pca.transform(allBlockVecs)
            print('dim. reduction: {0} -> {1}'.format(p_dim, dimMax))
            print('explained cov. :', self.pca.explained_variance_ratio_.sum())
        else:
            DB = allBlockVecs

        SynthInfo['N'] = N
        SynthInfo['D'] = min(p_dim, dimMax)
        print('Search Space: N={0}, D={1} '.format(SynthInfo['N'],
                                                   SynthInfo['D']))

        res = faiss.StandardGpuResources()
        flat_config = faiss.GpuIndexFlatConfig()
        index = faiss.GpuIndexFlatL2(res, min(p_dim, dimMax), flat_config)
        index.add(DB.astype('float32'))
        print('index added')

        Xr, Xc = X.shape[:2]
        p_rowRange = np.arange(0, Xr - W, step)
        p_colRange = np.arange(0, Xc - W, step)
        row_p_num = len(p_rowRange)
        col_p_num = len(p_colRange)
        Q = row_p_num * col_p_num

        CoefMat = spsp.lil_matrix((Q * p_dim, Xr * Xc * ch))
        print('coefMat size: ({0},{1})'.format(Q * p_dim, Xr * Xc * ch))
        ix_mat = np.zeros((Q, p_dim), dtype=int)
        ix = np.arange(Xr * Xc * ch, dtype=int).reshape(Xr, Xc, ch)

        Q_ix = 0
        for (i, pos_y) in enumerate(p_rowRange):
            for (j, pos_x) in enumerate(p_colRange):
                # turn the current output image into query vectors
                coef_col_ix = ix[pos_y:(pos_y + W),
                                 pos_x:(pos_x + W)].flatten()
                ix_mat[i * col_p_num + j, :] = coef_col_ix

                coef_row_ix = np.arange(Q_ix, Q_ix + p_dim)
                for (row_ix, col_ix) in zip(coef_row_ix, coef_col_ix):
                    CoefMat[row_ix, col_ix] = 1
                Q_ix += p_dim

        A = spsp.csr_matrix(CoefMat)
        SynthInfo['Q'] = Q
        print('query size: Q =', Q)

        Ix = np.zeros(Q, dtype=int)
        itr = 0
        fig = plt.figure(figsize=(15, 15))
        ims = []
        SynthInfo['iteration'] = []

        if init:
            Ix = np.random.randint(Q, size=Q)
            b = allBlockVecs[Ix].flatten()
            sol, istop, itn, norm = spsp.linalg.lsmr(A, b)[:4]
            X = sol.reshape(Xr, Xc, -1)
            itr += 1
            itrInfo = {
                "itr": itr,
                "log energy": norm,
                "lsmr istop": istop,
                "lsmr iter": itn
            }
            SynthInfo['iteration'].append(itrInfo)
            print(itrInfo)

        while True:
            im = plt.imshow(X[:, :, [2, 1, 0]].astype('int').clip(0, 255),
                            animated=True)
            ims.append([im])
            # Maximization: find nearest {z_p}
            Query = X.flatten()[ix_mat]
            if p_dim > dimMax:
                Query = self.pca.transform(Query)

            _D, Ix_next = index.search(Query.astype('float32'), 1)

            if np.all(Ix == Ix_next) | (itr > 100):
                break
            Ix = np.copy(Ix_next)

            # Expectation: update x
            b = allBlockVecs[Ix].flatten()
            sol, istop, itn, norm = spsp.linalg.lsmr(A, b)[:4]
            X = sol.reshape(Xr, Xc, -1)
            itr += 1
            itrInfo = {
                "itr": itr,
                "log energy": norm,
                "lsmr istop": istop,
                "lsmr iter": itn
            }
            SynthInfo['iteration'].append(itrInfo)
            print(itrInfo)

        fps = 8
        self.animation = animation.ArtistAnimation(fig,
                                                   ims,
                                                   interval=1000 // fps,
                                                   blit=True,
                                                   repeat_delay=1000)
        print('- synthesis converged')
        self.history.append(SynthInfo)

        return X
iterStr = 'ilsvrc2012'

output_dir = os.path.join('./output/ilsvrc2012/results',iterStr)

def save_h5(data_description,data,data_type,path):
    h5_feats=h5py.File(path,'w')
    h5_feats.create_dataset(data_description, data=data, dtype=data_type)
    h5_feats.close()

def load_h5(data_description,path):
    with h5py.File(path, 'r') as hf:
        data = hf[data_description][:]
    return data

res = faiss.StandardGpuResources()
flat_config = faiss.GpuIndexFlatConfig()
flat_config.device = whichGPU

train_feats = load_h5('train_feats',os.path.join(output_dir,'trainFeats.h5'))
train_classes = load_h5('train_classes',os.path.join(output_dir,'trainClasses.h5'))
train_ims = load_h5('train_ims',os.path.join(output_dir,'trainIms.h5'))
gpu_index = faiss.GpuIndexFlatIP(res, train_feats.shape[1],flat_config)
for feat in train_feats:
    gpu_index.add(np.expand_dims(feat,0))

test_datasets = ['./input/test_by_hotel.txt','./input/occluded_test/by_hotel/0.txt','./input/occluded_test/by_hotel/1.txt','./input/occluded_test/by_hotel/2.txt','./input/occluded_test/by_hotel/3.txt']
test_names = ['by_hotel','occluded0','occluded1','occluded2','occluded3']
for test_dataset, test_name in zip(test_datasets,test_names):
    test_output_dir = os.path.join(output_dir,test_name)
    if not os.path.exists(os.path.join(test_output_dir,'top_k.h5')):
        test_feats = load_h5('test_feats',os.path.join(test_output_dir,'testFeats.h5'))
Example #29
def run_kmeans(x, args):
    """
    Args:
        x: data to be clustered
    """

    print('performing kmeans clustering')
    results = {'im2cluster': [], 'centroids': [], 'density': []}

    for seed, num_cluster in enumerate(args.num_cluster):
        # intialize faiss clustering parameters
        d = x.shape[1]
        k = int(num_cluster)
        clus = faiss.Clustering(d, k)
        clus.verbose = True
        clus.niter = 20
        clus.nredo = 5
        clus.seed = seed
        clus.max_points_per_centroid = 1000
        clus.min_points_per_centroid = 10

        res = faiss.StandardGpuResources()
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = args.gpu
        index = faiss.GpuIndexFlatL2(res, d, cfg)

        clus.train(x, index)

        D, I = index.search(x, 1)  # for each sample, find cluster distance and assignments
        im2cluster = [int(n[0]) for n in I]

        # get cluster centroids
        centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)

        # sample-to-centroid distances for each cluster
        Dcluster = [[] for c in range(k)]
        for im, i in enumerate(im2cluster):
            Dcluster[i].append(D[im][0])

        # concentration estimation (phi)
        density = np.zeros(k)
        for i, dist in enumerate(Dcluster):
            if len(dist) > 1:
                d = (np.asarray(dist) ** 0.5).mean() / np.log(len(dist) + 10)
                density[i] = d

                # if cluster only has one point, use the max to estimate its concentration
        dmax = density.max()
        for i, dist in enumerate(Dcluster):
            if len(dist) <= 1:
                density[i] = dmax

        density = density.clip(np.percentile(density, 10),
                               np.percentile(density, 90))  # clamp extreme values for stability
        density = args.temperature * density / density.mean()  # scale the mean to temperature

        # convert to cuda Tensors for broadcast
        centroids = torch.Tensor(centroids).cuda()
        centroids = nn.functional.normalize(centroids, p=2, dim=1)

        im2cluster = torch.LongTensor(im2cluster).cuda()
        density = torch.Tensor(density).cuda()

        results['centroids'].append(centroids)
        results['density'].append(density)
        results['im2cluster'].append(im2cluster)

    return results
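A hedged usage sketch for this run_kmeans variant; args here is a stand-in namespace carrying the fields the function reads (num_cluster, gpu, temperature), and a CUDA device plus module-level torch/nn/faiss imports are assumed since the results are moved to the GPU:

import argparse
import numpy as np

args = argparse.Namespace(num_cluster=[100, 200], gpu=0, temperature=0.2)
x = np.random.rand(20000, 128).astype('float32')
results = run_kmeans(x, args)
print(len(results['centroids']), results['im2cluster'][0].shape)   # 2 clusterings; assignments for 20000 samples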
Example #30
def main():
    global args
    args = parser.parse_args()

    # fix random seeds
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)

    ######################################################################

    print('unpickle clustering objects...')
    handle = open(os.path.join(args.exp, "features.obj"), "rb")
    features = pickle.load(handle)
    handle.close()

    handle = open(os.path.join(args.exp, "train_dataset.obj"), "rb")
    train_dataset = pickle.load(handle)
    handle.close()

    handle = open(os.path.join(args.exp, "images_lists.obj"), "rb")
    images_lists = pickle.load(handle)
    handle.close()

    handle = open(os.path.join(args.exp, "dataset_imgs.obj"), "rb")
    dataset_imgs = pickle.load(handle)
    handle.close()

    print('num clusters: %d' % len(images_lists))

    #####################################################
    # calculate 10-NN for each feature of 1st cluster
    feature_ids_cluster_0 = images_lists[0]
    # print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
    # print('cluster_0: %s' % str(feature_ids_cluster_0))
    # assert len(features) == len(dataset_imgs)
    # for i in feature_ids_cluster_0:
    #     print(i, '---', np.linalg.norm(features[i]), '---', dataset_imgs[i])
    # print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')

    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    d = features.shape[1]  # dimension
    print('features.shape = %s' % (str(features.shape)))
    print('dimension: %d' % d)
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    features_cluster_0 = np.zeros((len(feature_ids_cluster_0), features.shape[1])).astype('float32')
    for j, i in enumerate(feature_ids_cluster_0):
        print(j, '-', i)
        features_cluster_0[j] = features[i]

    print('features_cluster_0.shape = %s' % str(features_cluster_0.shape))
    index.add(features_cluster_0)

    k = args.knn
    print('searching for %d-NN for each feature in cluster...' % k)
    D, I = index.search(features_cluster_0[:1], k + 1)
    print('results I: ')
    print(I)
    print('results D: ')
    print(D)
    print('%d NN images for 1st feature %s: ' % (k, str(dataset_imgs[feature_ids_cluster_0[0]])))
    for i in range(k+1):
        print('index into cluster_0: %d' % I[0][i])
        id_into_dataset = feature_ids_cluster_0[I[0][i]]
        print('index into dataset_imgs: %d' % id_into_dataset)
        print(dataset_imgs[id_into_dataset])