def test_pca_epsilon(self):
    """Check that setting ``epsilon`` keeps PCAMatrix output finite.

    The training data has every other column zeroed. Three scenarios are
    exercised:

    1. ``PCAMatrix(d, 60, -0.5)`` with default settings: the output is
       expected to contain non-finite values.
    2. The same transform with ``epsilon = 1e-5``: the output must be
       entirely finite.
    3. An index built from the "PCAW60,Flat" factory string, round-tripped
       through serialize/deserialize, with ``epsilon`` set on its first
       transform before training: the output must be entirely finite.
    """
    dim, n_points = 64, 1000
    np.random.seed(123)
    data = np.random.random(size=(n_points, dim)).astype('float32')
    # zero every other column so the data lives in a sub-space
    data[:, ::2] = 0

    def all_finite(arr):
        # True only when no entry of arr is NaN or +/-inf
        return np.all(np.isfinite(arr))

    # default computation: expect a division by 0 to leak into the output
    plain_pca = faiss.PCAMatrix(dim, 60, -0.5)
    plain_pca.train(data)
    self.assertFalse(all_finite(plain_pca.apply(data)))

    # with epsilon set before training the output stays finite
    eps_pca = faiss.PCAMatrix(dim, 60, -0.5)
    eps_pca.epsilon = 1e-5
    eps_pca.train(data)
    self.assertTrue(all_finite(eps_pca.apply(data)))

    # I/O: the transform of a serialized + deserialized index
    built = faiss.index_factory(dim, "PCAW60,Flat")
    serialized = faiss.serialize_index(built)
    index = faiss.deserialize_index(serialized)
    untrained_pca = faiss.downcast_VectorTransform(index.chain.at(0))
    untrained_pca.epsilon = 1e-5
    index.train(data)
    # fetch the transform again after training before applying it
    trained_pca = faiss.downcast_VectorTransform(index.chain.at(0))
    self.assertTrue(all_finite(trained_pca.apply(data)))
def test_chain(self):
    """Compare an "L2norm,PCA2,L2norm,Flat" index with a manual pipeline.

    The factory-built index is trained on skewed, rotated Gaussian data and
    searched. The same transform sequence (L2 normalisation, the trained
    PCA, L2 normalisation) is then applied by hand and fed to a plain
    ``IndexFlatL2(2)``; both searches must return identical neighbour ids.
    """
    # generate data
    dim = 4
    n_train, n_base, n_query = 1000, 200, 200
    n_total = n_train + n_base + n_query

    # normal distribution
    data = faiss.randn(n_total * dim, 1234).reshape(n_total, dim)
    # make distribution very skewed
    data *= [10, 4, 1, 0.5]
    # multiply by the Q factor of a QR decomposition of a random matrix
    rotation, _ = np.linalg.qr(faiss.randn(dim * dim).reshape(dim, dim))
    data = np.dot(data, rotation).astype('float32')

    train_set = data[:n_train]
    base_set = data[n_train:-n_query]
    query_set = data[-n_query:]

    index = faiss.index_factory(dim, "L2norm,PCA2,L2norm,Flat")
    assert index.chain.size() == 3

    first_norm = faiss.downcast_VectorTransform(index.chain.at(0))
    assert first_norm.norm == 2

    pca = faiss.downcast_VectorTransform(index.chain.at(1))
    assert not pca.is_trained
    index.train(train_set)
    assert pca.is_trained

    index.add(base_set)
    _, chain_ids = index.search(query_set, 5)

    # do the computation manually and check if we get the same result
    def manual_trans(vectors):
        # work on a copy: normalize_L2 modifies its argument in place
        out = vectors.copy()
        faiss.normalize_L2(out)
        out = pca.apply_py(out)
        faiss.normalize_L2(out)
        return out

    reference = faiss.IndexFlatL2(2)
    reference.add(manual_trans(base_set))
    _, manual_ids = reference.search(manual_trans(query_set), 5)

    assert np.all(chain_ids == manual_ids)
def test_chain(self):
    """Compare an "L2norm,PCA2,L2norm,Flat" index with a manual pipeline.

    The factory-built index is trained on skewed, rotated Gaussian data and
    searched. The same transform sequence (L2 normalisation, the trained
    PCA, L2 normalisation) is then applied by hand and fed to a plain
    ``IndexFlatL2(2)``; both searches must return identical neighbour ids.
    """
    # generate data
    dim = 4
    n_train, n_base, n_query = 1000, 200, 200
    n_total = n_train + n_base + n_query

    # normal distribution
    data = faiss.randn(n_total * dim, 1234).reshape(n_total, dim)
    # make distribution very skewed
    data *= [10, 4, 1, 0.5]
    # multiply by the Q factor of a QR decomposition of a random matrix
    rotation, _ = np.linalg.qr(faiss.randn(dim * dim).reshape(dim, dim))
    data = np.dot(data, rotation).astype('float32')

    train_set = data[:n_train]
    base_set = data[n_train:-n_query]
    query_set = data[-n_query:]

    index = faiss.index_factory(dim, "L2norm,PCA2,L2norm,Flat")
    assert index.chain.size() == 3

    first_norm = faiss.downcast_VectorTransform(index.chain.at(0))
    assert first_norm.norm == 2

    pca = faiss.downcast_VectorTransform(index.chain.at(1))
    assert not pca.is_trained
    index.train(train_set)
    assert pca.is_trained

    index.add(base_set)
    _, chain_ids = index.search(query_set, 5)

    # do the computation manually and check if we get the same result
    def manual_trans(vectors):
        # work on a copy: normalize_L2 modifies its argument in place
        out = vectors.copy()
        faiss.normalize_L2(out)
        out = pca.apply_py(out)
        faiss.normalize_L2(out)
        return out

    reference = faiss.IndexFlatL2(2)
    reference.add(manual_trans(base_set))
    _, manual_ids = reference.search(manual_trans(query_set), 5)

    assert np.all(chain_ids == manual_ids)
def test_itq_transform(self):
    """Check that the first transform of an "ITQ8,LSHt" codec can be
    downcast and exposes a ``pca_then_itq`` attribute."""
    itq_codec = faiss.index_factory(16, "ITQ8,LSHt")
    first_transform = itq_codec.chain.at(0)
    itq = faiss.downcast_VectorTransform(first_transform)
    # Bare attribute access on purpose: the value is not used, the lookup
    # only has to succeed for the test to pass.
    itq.pca_then_itq
vec_transform = lambda x: x else: print("build index, key=", args.indexkey) index = faiss.index_factory( d, args.indexkey, faiss.METRIC_L2 if ds.metric == "L2" else faiss.METRIC_INNER_PRODUCT ) index_ivf, vec_transform = unwind_index_ivf(index) if vec_transform is None: vec_transform = lambda x: x else: vec_transform = faiss.downcast_VectorTransform(vec_transform) if args.by_residual != -1: by_residual = args.by_residual == 1 print("setting by_residual = ", by_residual) index_ivf.by_residual # check if field exists index_ivf.by_residual = by_residual if index_ivf: print("Update add-time parameters") # adjust default parameters used at add time for quantizers # because otherwise the assignment is inaccurate quantizer = faiss.downcast_index(index_ivf.quantizer) if isinstance(quantizer, faiss.IndexRefine): print(" update quantizer k_factor=", quantizer.k_factor, end=" -> ") quantizer.k_factor = 32 if index_ivf.nlist < 1e6 else 64
def __init__(self, phrase_dump_dir, index_path, idx2id_path, cuda=False, logging_level=logging.INFO):
    """Load the faiss index, the idx2id mapping and optional metadata.

    Args:
        phrase_dump_dir: Phrase dump directory. It must contain the
            substring '/phrase'; everything before that substring is used
            as the directory that holds 'meta_compressed.pkl'.
        index_path: Path of the faiss index file to read. When it contains
            'PQ', ``max_idx`` is 1e9 instead of 1e8 and the pickled
            metadata is loaded into RAM if the file exists.
        idx2id_path: Forwarded to ``self.load_idx_f``.
        cuda: When true, the IVF quantizer is replaced by a copy on all
            GPUs and ``self.R`` is moved to the CUDA device.
        logging_level: Level applied to the module-level ``logger``.

    Raises:
        AssertionError: If ``cuda`` is true but CUDA is not available.
        ValueError: If ``phrase_dump_dir`` does not contain '/phrase'
            (raised by ``str.index``).
    """
    self.phrase_dump_dir = phrase_dump_dir
    is_pq = 'PQ' in index_path

    # Read index
    self.index = {}  # placeholder, replaced by the loaded index just below
    logger.info(
        f'Reading {index_path} - could take up to 15 mins depending on the file reading speed of HDD/SSD'
    )
    self.index = faiss.read_index(index_path, faiss.IO_FLAG_ONDISK_SAME_DIR)
    self.reconst_fn = faiss.downcast_index(self.index.index).reconstruct
    # The first transform's ``A`` vector reshaped into a (d, d) matrix
    first_transform = faiss.downcast_VectorTransform(self.index.chain.at(0))
    self.R = torch.FloatTensor(
        faiss.vector_to_array(first_transform.A).reshape(self.index.d, self.index.d))
    self.max_idx = 1e9 if is_pq else 1e8
    logger.info(f'index ntotal: {self.index.ntotal} | PQ: {is_pq}')

    # Read idx2id
    self.idx_f = {}  # kept in case load_idx_f inspects it - TODO confirm
    logger.info('Load idx2id on memory')
    self.idx_f = self.load_idx_f(idx2id_path)
    self.offset = None
    self.scale = None
    self.doc_groups = None

    # Options
    logger.setLevel(logging_level)
    self.num_docs_list = []
    self.cuda = cuda
    if self.cuda:
        assert torch.cuda.is_available(
        ), f"Cuda availability {torch.cuda.is_available()}"
        self.device = torch.device('cuda')
        logger.info("Load IVF on GPU")
        index_ivf = faiss.extract_index_ivf(self.index)
        index_ivf.nprobe = 256
        # Swap the quantizer for a copy living on all GPUs
        quantizer_gpu = faiss.index_cpu_to_all_gpus(index_ivf.quantizer)
        index_ivf.quantizer = quantizer_gpu
        self.R = self.R.to(self.device)
        logger.info(f"N probe: {index_ivf.nprobe}")
    else:
        self.device = torch.device("cpu")
        index_ivf = faiss.extract_index_ivf(self.index)
        index_ivf.nprobe = 256

    # For sentence split
    self.sentencizer = English()
    self.sentencizer.add_pipe(self.sentencizer.create_pipe('sentencizer'))

    # Load metadata on RAM if possible
    dump_root = self.phrase_dump_dir[:self.phrase_dump_dir.index('/phrase')]
    doc_group_path = os.path.join(dump_root, 'meta_compressed.pkl')
    if os.path.exists(doc_group_path) and is_pq:
        logger.info(
            f"Loading metadata on RAM from {doc_group_path} (for PQ only)")
        # NOTE(security): pickle.load can execute arbitrary code; only
        # point this at metadata dumps from a trusted source.
        # The context manager closes the file handle once loading is done.
        with open(doc_group_path, 'rb') as meta_file:
            self.doc_groups = pickle.load(meta_file)
    else:
        logger.info(
            "Will read metadata directly from hdf5 files (requires SSDs for faster inference)"
        )