Example #1
    def test_pca_epsilon(self):
        d = 64
        n = 1000
        np.random.seed(123)
        x = np.random.random(size=(n, d)).astype('float32')

        # make sure data is in a sub-space
        x[:, ::2] = 0

        # check division by 0 with default computation
        pca = faiss.PCAMatrix(d, 60, -0.5)
        pca.train(x)
        y = pca.apply(x)
        self.assertFalse(np.all(np.isfinite(y)))

        # check add epsilon
        pca = faiss.PCAMatrix(d, 60, -0.5)
        pca.epsilon = 1e-5
        pca.train(x)
        y = pca.apply(x)
        self.assertTrue(np.all(np.isfinite(y)))

        # check I/O
        index = faiss.index_factory(d, "PCAW60,Flat")
        index = faiss.deserialize_index(faiss.serialize_index(index))
        pca1 = faiss.downcast_VectorTransform(index.chain.at(0))
        pca1.epsilon = 1e-5
        index.train(x)
        pca = faiss.downcast_VectorTransform(index.chain.at(0))
        y = pca.apply(x)
        self.assertTrue(np.all(np.isfinite(y)))
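Note: with a negative eigen_power (-0.5 here) the PCAMatrix whitens the data, scaling each component by a power of its eigenvalue. Because every second input dimension is zeroed out, some eigenvalues are exactly 0, so the default computation divides by 0 and produces non-finite values; setting the epsilon field adds a small constant to the eigenvalues, which keeps the output finite.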
Example #2
    def test_chain(self):

        # generate data
        d = 4
        nt = 1000
        nb = 200
        nq = 200

        # normal distribution
        x = faiss.randn((nt + nb + nq) * d, 1234).reshape(nt + nb + nq, d)

        # make distribution very skewed
        x *= [10, 4, 1, 0.5]
        rr, _ = np.linalg.qr(faiss.randn(d * d).reshape(d, d))
        x = np.dot(x, rr).astype('float32')

        xt = x[:nt]
        xb = x[nt:-nq]
        xq = x[-nq:]

        index = faiss.index_factory(d, "L2norm,PCA2,L2norm,Flat")

        assert index.chain.size() == 3
        l2_1 = faiss.downcast_VectorTransform(index.chain.at(0))
        assert l2_1.norm == 2
        pca = faiss.downcast_VectorTransform(index.chain.at(1))
        assert not pca.is_trained
        index.train(xt)
        assert pca.is_trained

        index.add(xb)
        D, I = index.search(xq, 5)

        # do the computation manually and check if we get the same result
        def manual_trans(x):
            x = x.copy()
            faiss.normalize_L2(x)
            x = pca.apply_py(x)
            faiss.normalize_L2(x)
            return x

        index2 = faiss.IndexFlatL2(2)
        index2.add(manual_trans(xb))
        D2, I2 = index2.search(manual_trans(xq), 5)

        assert np.all(I == I2)
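The factory string "L2norm,PCA2,L2norm,Flat" builds an IndexPreTransform with three chained transforms (L2 normalization, PCA to 2 dimensions, L2 normalization again) in front of a flat L2 index. chain.at(i) returns the base VectorTransform type, so downcast_VectorTransform is needed to reach subclass fields such as l2_1.norm; manual_trans then replays the same chain with numpy to check that the index returns identical neighbors.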
Example #3
    def test_itq_transform(self):
        codec = faiss.index_factory(16, "ITQ8,LSHt")

        itqt = faiss.downcast_VectorTransform(codec.chain.at(0))
        itqt.pca_then_itq  # just check that the field is accessible after the downcast
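The snippet above only touches the pca_then_itq field to check that it is reachable after the downcast. As a minimal sketch of actually using such a codec (the training data and the sa_encode call below are illustrative additions, not part of the original example):

import numpy as np
import faiss

d = 16
x = np.random.random((1000, d)).astype('float32')

codec = faiss.index_factory(d, "ITQ8,LSHt")
codec.train(x)              # trains the ITQ rotation and the LSH thresholds
codes = codec.sa_encode(x)  # 8 ITQ bits per vector -> 1-byte codes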
Example #4
        vec_transform = lambda x: x

else:

    print("build index, key=", args.indexkey)

    index = faiss.index_factory(
        d, args.indexkey, faiss.METRIC_L2 if ds.metric == "L2" else
        faiss.METRIC_INNER_PRODUCT
    )

    index_ivf, vec_transform = unwind_index_ivf(index)
    if vec_transform is None:
        vec_transform = lambda x: x
    else:
        vec_transform = faiss.downcast_VectorTransform(vec_transform)

    if args.by_residual != -1:
        by_residual = args.by_residual == 1
        print("setting by_residual = ", by_residual)
        index_ivf.by_residual   # check if field exists
        index_ivf.by_residual = by_residual

    if index_ivf:
        print("Update add-time parameters")
        # adjust default parameters used at add time for quantizers
        # because otherwise the assignment is inaccurate
        quantizer = faiss.downcast_index(index_ivf.quantizer)
        if isinstance(quantizer, faiss.IndexRefine):
            print("   update quantizer k_factor=", quantizer.k_factor, end=" -> ")
            quantizer.k_factor = 32 if index_ivf.nlist < 1e6 else 64
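unwind_index_ivf is a helper defined in the benchmark script itself, not part of the faiss API. A hedged sketch of what such a helper can look like, assuming it peels single-transform IndexPreTransform and IndexRefine wrappers until it reaches the IVF index:

def unwind_index_ivf(index):
    # peel off a pre-transform: recurse into the wrapped index and
    # hand back the transform alongside the IVF index
    if isinstance(index, faiss.IndexPreTransform):
        assert index.chain.size() == 1  # this sketch only handles one transform
        sub_index, vt = unwind_index_ivf(faiss.downcast_index(index.index))
        assert vt is None
        return sub_index, index.chain.at(0)
    # peel off a refinement wrapper
    if isinstance(index, faiss.IndexRefine):
        return unwind_index_ivf(faiss.downcast_index(index.base_index))
    if isinstance(index, faiss.IndexIVF):
        return index, None
    return None, None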
Example #5
    def __init__(self,
                 phrase_dump_dir,
                 index_path,
                 idx2id_path,
                 cuda=False,
                 logging_level=logging.INFO):
        self.phrase_dump_dir = phrase_dump_dir

        # Read index
        self.index = {}
        logger.info(
            f'Reading {index_path} - could take up to 15 mins depending on the file reading speed of HDD/SSD'
        )
        self.index = faiss.read_index(index_path,
                                      faiss.IO_FLAG_ONDISK_SAME_DIR)
        self.reconst_fn = faiss.downcast_index(self.index.index).reconstruct
        self.R = torch.FloatTensor(
            faiss.vector_to_array(
                faiss.downcast_VectorTransform(
                    self.index.chain.at(0)).A).reshape(self.index.d,
                                                       self.index.d))
        self.max_idx = 1e8 if 'PQ' not in index_path else 1e9
        logger.info(
            f'index ntotal: {self.index.ntotal} | PQ: {"PQ" in index_path}')

        # Read idx2id
        self.idx_f = {}
        logger.info('Load idx2id on memory')
        self.idx_f = self.load_idx_f(idx2id_path)
        self.offset = None
        self.scale = None
        self.doc_groups = None

        # Options
        logger.setLevel(logging_level)
        self.num_docs_list = []
        self.cuda = cuda
        if self.cuda:
            assert torch.cuda.is_available(
            ), f"Cuda availability {torch.cuda.is_available()}"
            self.device = torch.device('cuda')
            logger.info("Load IVF on GPU")
            index_ivf = faiss.extract_index_ivf(self.index)
            index_ivf.nprobe = 256
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
            self.R = self.R.to(self.device)
            logger.info(f"N probe: {index_ivf.nprobe}")
        else:
            self.device = torch.device("cpu")
            index_ivf = faiss.extract_index_ivf(self.index)
            index_ivf.nprobe = 256

        # For sentence split
        self.sentencizer = English()
        self.sentencizer.add_pipe(self.sentencizer.create_pipe('sentencizer'))

        # Load metadata on RAM if possible
        doc_group_path = os.path.join(
            self.phrase_dump_dir[:self.phrase_dump_dir.index('/phrase')],
            'meta_compressed.pkl')
        if os.path.exists(doc_group_path) and ('PQ' in index_path):
            logger.info(
                f"Loading metadata on RAM from {doc_group_path} (for PQ only)")
            self.doc_groups = pickle.load(open(doc_group_path, 'rb'))
        else:
            logger.info(
                f"Will read metadata directly from hdf5 files (requires SSDs for faster inference)"
            )
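The R matrix above is recovered from the first chained transform of the loaded IndexPreTransform. The general pattern for pulling the matrix out of a LinearTransform subclass (PCA, OPQ, random rotation), sketched on an index like self.index here:

vt = faiss.downcast_VectorTransform(index.chain.at(0))
# a LinearTransform stores its matrix A as a flat float vector,
# row-major, with d_out * d_in entries
A = faiss.vector_to_array(vt.A).reshape(vt.d_out, vt.d_in)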