def test_normalized(self):
    """MatrixStats should report that L2-normalized rows are normalized."""
    rng = np.random.RandomState(123)
    data = rng.rand(40, 20).astype('float32')
    faiss.normalize_L2(data)
    stats = faiss.MatrixStats(data)
    print(stats.comments)
    assert 'vectors are normalized' in stats.comments
def test_dead_dims(self):
    """Columns forced to a single value must be flagged as constant dims."""
    rng = np.random.RandomState(123)
    data = rng.rand(40, 20).astype('float32')
    data[:, 5:10] = 0
    stats = faiss.MatrixStats(data)
    print(stats.comments)
    assert '5 dimensions are constant' in stats.comments
def test_rogue_means(self):
    """Columns with a huge offset relative to their spread must be flagged."""
    rng = np.random.RandomState(123)
    data = rng.rand(40, 20).astype('float32')
    data[:, 5:10] += 12345
    stats = faiss.MatrixStats(data)
    print(stats.comments)
    assert '5 dimensions are too large wrt. their variance' in stats.comments
def test_copies(self):
    """Duplicating every other row should halve the distinct-vector count."""
    rng = np.random.RandomState(123)
    data = rng.rand(40, 20).astype('float32')
    data[::2] = data[1::2]  # each even row becomes a copy of the next odd row
    stats = faiss.MatrixStats(data)
    print(stats.comments)
    assert '20 vectors are distinct' in stats.comments
def index(self):
    """Build the searchable column-embedding index from accumulated vectors.

    Converts ``self.vecs`` into an L2-normalized float32 matrix ``self.means``
    and either persists it via ``self.storage`` or builds an in-memory FAISS
    inner-product index.  Idempotent: returns immediately if already indexed.
    """
    if self.indexed:
        return
    self.means = np.array(self.vecs).astype("float32")
    # Free the raw vector list; only the dense matrix is kept from here on.
    del self.vecs
    self.means = np.ascontiguousarray(self.means)
    # Row-wise L2 normalization, so inner product == cosine similarity.
    # NOTE(review): a zero row would divide by zero here — TODO confirm
    # callers never accumulate all-zero vectors.
    self.means /= np.sqrt((self.means**2).sum(axis=1))[:, None]
    # Map column id -> row index in self.means (vi_tici holds (ti, ci) pairs).
    self.ci_vi = {ci: vi for vi, (_, ci) in enumerate(self.vi_tici)}
    if self.storage:
        # Persist the matrix in long format (index, dim, val) plus the
        # vector-index -> (table, column) mapping; no in-memory index is kept.
        meansdf = (pd.DataFrame(self.means).reset_index().melt(
            id_vars=["index"], var_name="dim", value_name="val"))
        self.storage.save_df(meansdf, "means")
        self.storage.save_pickle(self.vi_tici, "vi_tici")
        self.close()
    else:
        # Create FAISS index
        faissindex = faiss.IndexFlatIP(
            self.means.shape[1])  # build the index
        # add vectors to the index
        faissindex.add(np.array(self.means))  # type: ignore
        log.debug("faiss info: %s", faiss.MatrixStats(self.means).comments)
        self.faissindex = faissindex
    self.indexed = True
def test_0s(self):
    """Zeroed rows should be reported both as copies and as null vectors."""
    rng = np.random.RandomState(123)
    data = rng.rand(40, 20).astype('float32')
    data[5:10] = 0
    stats = faiss.MatrixStats(data)
    print(stats.comments)
    assert 'has 5 copies' in stats.comments
    assert '5 null vectors' in stats.comments
def random_sample(self, index, num_samples, verbose=True):
    """Draw ``num_samples`` vectors uniformly at random from a faiss index.

    Parameters
    ----------
    index : faiss index exposing ``ntotal`` and ``reconstruct(id)``
        Source index; sampling is without replacement.
    num_samples : int
        Number of vectors to draw (capped at ``index.ntotal`` by the slice).
    verbose : bool
        If True, print ``faiss.MatrixStats`` of the sampled matrix.

    Returns
    -------
    np.ndarray of shape ``(num_samples, d)`` — one reconstructed vector
    per row.
    """
    sample_ids = np.random.permutation(index.ntotal)[:num_samples]
    # reconstruct() returns a 1-D array per id.
    rows = [index.reconstruct(int(sid)) for sid in sample_ids]
    # BUG FIX: np.concatenate on 1-D rows flattened everything into a single
    # 1-D array, so faiss.MatrixStats (which needs a 2-D matrix) would fail
    # and callers lost the per-vector row structure.  vstack keeps rows.
    sample = np.vstack(rows)
    if verbose:
        print(faiss.MatrixStats(sample).comments)
    return sample
def prepare_block(self, tableid_colids: Dict[int, Set[int]]):
    """Find candidate table pairs by nearest-neighbor search on column embeddings.

    For every column id in ``tableid_colids`` that has an embedding, queries
    the faiss index for the ``self.topn`` most similar column vectors and
    records, in ``self.ti_block``, each *other* table whose column exceeds
    ``self.threshold`` similarity.
    """
    if self.storage:
        # Create FAISS index
        # Storage-backed mode: the index was not kept in memory (see index()),
        # so rebuild it from self.means on demand.
        faissindex = faiss.IndexFlatIP(
            self.means.shape[1])  # build the index
        # add vectors to the index
        faissindex.add(np.array(self.means))  # type: ignore
        log.debug("faiss info: %s", faiss.MatrixStats(self.means).comments)
    else:
        faissindex = self.faissindex
    # Invert the mapping: column id -> owning table id.
    ci_ti = {ci: ti for ti, cs in tableid_colids.items() for ci in cs}
    # qi_mean: query vectors in order; ci_qi: column id -> row in the query matrix.
    qi_mean, ci_qi = [], {}  # type: ignore
    for ci in ci_ti:
        if ci in self.ci_vi:
            ci_qi[ci] = len(qi_mean)
            qi_mean.append(self.means[self.ci_vi[ci]])
    if not len(qi_mean):
        log.error(
            f"No column embeddings found in {self.name} for any of tables"
            f"{list(tableid_colids)}")
    else:
        xq = np.vstack(qi_mean)  # query vectors
        xq /= np.sqrt((xq**2).sum(axis=1))[:, None]  # L2 normalize
        xq = xq.astype("float32")
        log.debug(
            f"Querying {self.name} faiss index with query matrix of shape {xq.shape}"
        )
        # D: similarities, I: neighbor vector indices, both (n_queries, topn).
        D, I = faissindex.search(xq, self.topn)
        for ci1, qi in ci_qi.items():
            # Clamp negative inner products to 0 before thresholding.
            indexes, similarities = I[qi], np.maximum(D[qi], 0)
            ti1 = ci_ti[ci1]
            # NOTE(review): qi2 values are vector indices into vi_tici, while
            # ci1 is a column id — subtracting {ci1} only excludes self-hits
            # if those id spaces coincide; TODO confirm against index().
            for qi2 in set(indexes[(similarities > self.threshold)]) - set(
                    [ci1]):
                ti2, _ = self.vi_tici[qi2]
                if ti2 != ti1:
                    # Record ti2 as a blocking candidate for ti1.
                    self.ti_block.setdefault(ti1, set()).add(ti2)