Exemple #1
0
 def test_normalized(self):
     """MatrixStats must flag L2-normalized input as normalized."""
     rng = np.random.RandomState(123)
     vectors = rng.rand(40, 20).astype('float32')
     # The return value is not captured: the test relies on the matrix
     # being normalized in place.
     faiss.normalize_L2(vectors)
     stats = faiss.MatrixStats(vectors)
     report = stats.comments
     print(report)
     assert 'vectors are normalized' in report
Exemple #2
0
 def test_dead_dims(self):
     """MatrixStats must report zeroed-out columns as constant dimensions."""
     rng = np.random.RandomState(123)
     data = rng.rand(40, 20).astype('float32')
     data[:, 5:10] = 0  # columns 5..9 hold the same value in every row
     stats = faiss.MatrixStats(data)
     report = stats.comments
     print(report)
     assert '5 dimensions are constant' in report
Exemple #3
0
 def test_rogue_means(self):
     """MatrixStats must flag columns whose offset dwarfs their variance."""
     rng = np.random.RandomState(123)
     data = rng.rand(40, 20).astype('float32')
     data[:, 5:10] += 12345  # shift columns 5..9 by a large constant
     stats = faiss.MatrixStats(data)
     report = stats.comments
     print(report)
     assert '5 dimensions are too large wrt. their variance' in report
Exemple #4
0
 def test_copies(self):
     """MatrixStats must count distinct vectors when rows are duplicated."""
     rng = np.random.RandomState(123)
     data = rng.rand(40, 20).astype('float32')
     # Overwrite every even row with the odd row that follows it, leaving
     # 20 distinct rows out of 40.
     data[::2] = data[1::2]
     stats = faiss.MatrixStats(data)
     report = stats.comments
     print(report)
     assert '20 vectors are distinct' in report
Exemple #5
0
    def index(self):
        """Finalize the collected vectors so they can be searched.

        Does nothing when ``self.indexed`` is already set. Otherwise:

        * converts ``self.vecs`` into a C-contiguous float32 matrix stored in
          ``self.means`` (``self.vecs`` is deleted afterwards) and scales
          every row to unit L2 norm; all-zero rows are left as zeros instead
          of turning into NaN;
        * builds ``self.ci_vi``, mapping the second element of each
          ``self.vi_tici`` entry to that entry's position;
        * if ``self.storage`` is truthy, saves the matrix (melted into
          ``index`` / ``dim`` / ``val`` columns) and ``self.vi_tici`` through
          it and calls ``self.close()``; otherwise builds an in-memory FAISS
          inner-product index and keeps it in ``self.faissindex``.
        """
        if self.indexed:
            return

        self.means = np.ascontiguousarray(
            np.array(self.vecs).astype("float32"))
        del self.vecs

        norms = np.sqrt((self.means**2).sum(axis=1))
        # A zero vector has no direction; dividing by 1 keeps it at zero
        # rather than producing NaN rows (0 / 0) in the index.
        norms[norms == 0] = 1
        self.means /= norms[:, None]

        self.ci_vi = {ci: vi for vi, (_, ci) in enumerate(self.vi_tici)}

        if self.storage:
            meansdf = (pd.DataFrame(self.means).reset_index().melt(
                id_vars=["index"], var_name="dim", value_name="val"))
            self.storage.save_df(meansdf, "means")
            self.storage.save_pickle(self.vi_tici, "vi_tici")
            self.close()
        else:
            # Rows are unit-length, so inner product equals cosine similarity.
            faissindex = faiss.IndexFlatIP(self.means.shape[1])
            # self.means is already contiguous float32: no extra copy needed.
            faissindex.add(self.means)  # type: ignore
            log.debug("faiss info: %s", faiss.MatrixStats(self.means).comments)
            self.faissindex = faissindex

        self.indexed = True
0
 def test_0s(self):
     """MatrixStats must report all-zero rows as null, duplicated vectors."""
     rng = np.random.RandomState(123)
     data = rng.rand(40, 20).astype('float32')
     data[5:10] = 0  # rows 5..9 become identical all-zero vectors
     stats = faiss.MatrixStats(data)
     report = stats.comments
     print(report)
     assert 'has 5 copies' in report
     assert '5 null vectors' in report
Exemple #7
0
    def random_sample(self, index, num_samples, verbose=True):
        """Reconstruct a random subset of the vectors stored in ``index``.

        Args:
            index: object exposing ``ntotal`` and ``reconstruct(int)``.
            num_samples: maximum number of vectors to draw; fewer are
                returned when ``index.ntotal`` is smaller.
            verbose: when true, print the FAISS ``MatrixStats`` comments for
                the sampled matrix.

        Returns:
            A 2-D array with one reconstructed vector per row, drawn without
            replacement in random order.

        Raises:
            ValueError: if no vector is sampled (empty index or
                ``num_samples`` of 0), as there is nothing to stack.
        """
        sample_ids = np.random.permutation(index.ntotal)[:num_samples]

        # Stack rather than concatenate: reconstructed vectors that are 1-D
        # would otherwise be flattened into one long array instead of a
        # (num_samples, dim) matrix. Inputs already shaped (1, dim) stack to
        # the same result as before.
        sample = np.vstack([index.reconstruct(int(sid)) for sid in sample_ids])

        if verbose:
            print(faiss.MatrixStats(sample).comments)

        return sample
Exemple #8
0
    def prepare_block(self, tableid_colids: Dict[int, Set[int]]):
        """Record, per table, the other tables that own similar columns.

        For every column id in ``tableid_colids`` that has an entry in
        ``self.ci_vi``, the column's vector is looked up in ``self.means`` and
        used to query the FAISS index for its ``self.topn`` nearest vectors.
        Neighbours whose similarity (clipped at 0) exceeds ``self.threshold``
        and that belong to a different table are added to
        ``self.ti_block[table_id]``.

        Args:
            tableid_colids: mapping from table id to the column ids of that
                table.

        Returns:
            None. Results are accumulated in ``self.ti_block``; an error is
            logged when none of the columns has an embedding.
        """
        if self.storage:
            # Build a flat inner-product index over self.means for this call.
            xb = np.ascontiguousarray(self.means, dtype="float32")
            faissindex = faiss.IndexFlatIP(xb.shape[1])
            faissindex.add(xb)  # type: ignore
            log.debug("faiss info: %s", faiss.MatrixStats(xb).comments)
        else:
            faissindex = self.faissindex

        ci_ti = {ci: ti for ti, cs in tableid_colids.items() for ci in cs}
        # One query row per requested column that has an embedding; the
        # position in this list is the row number in the query matrix.
        query_cols = [ci for ci in ci_ti if ci in self.ci_vi]

        if not query_cols:
            log.error(
                "No column embeddings found in %s for any of tables %s",
                self.name, list(tableid_colids))
            return

        xq = np.vstack([self.means[self.ci_vi[ci]] for ci in query_cols])
        norms = np.sqrt((xq**2).sum(axis=1))
        norms[norms == 0] = 1  # keep zero vectors at zero instead of NaN
        xq /= norms[:, None]  # L2 normalize
        xq = xq.astype("float32")
        log.debug(
            "Querying %s faiss index with query matrix of shape %s",
            self.name, xq.shape)
        sims, neighbours = faissindex.search(xq, self.topn)

        for qi, ci1 in enumerate(query_cols):
            ti1 = ci_ti[ci1]
            close_enough = np.maximum(sims[qi], 0) > self.threshold
            candidates = {int(vi) for vi in neighbours[qi][close_enough]}
            # The search returns vector indices, so the query's own hit must
            # be removed by its vector index, not by its column id.
            candidates.discard(self.ci_vi[ci1])
            # FAISS pads missing neighbours with label -1, which would
            # otherwise silently address the last entry of vi_tici.
            candidates.discard(-1)
            for vi2 in candidates:
                ti2, _ = self.vi_tici[vi2]
                if ti2 != ti1:
                    self.ti_block.setdefault(ti1, set()).add(ti2)