Esempio n. 1
0
 def test_write_580M(self):
     """Write a 580M-vector IndexBinaryMultiHash to disk and read it back.

     Builds an index over random 8-bit binary codes, serializes it with
     ``faiss.write_index_binary`` and reloads it with
     ``faiss.read_index_binary``. The file lives in a temporary directory
     that is removed when the test finishes, so nothing is left behind.
     """
     import os
     import tempfile

     dim = 8
     nhash = 1
     num_million = 580  # changing to 570 works
     nb = num_million * 10 ** 6
     index1 = faiss.IndexBinaryMultiHash(dim, nhash, dim // nhash)
     # Draw uint8 values directly: going through the default integer dtype
     # and casting afterwards allocates a much larger intermediate array.
     random_hash_codes = np.random.randint(
         0, 256, (nb, dim // 8), dtype="uint8")
     index1.add(random_hash_codes)
     with tempfile.TemporaryDirectory() as tmpdir:
         path = os.path.join(tmpdir, "tmp.faiss")
         faiss.write_index_binary(index1, path)
         index2 = faiss.read_index_binary(path)
     # The reloaded index must hold every vector that was written.
     self.assertEqual(index2.ntotal, index1.ntotal)
Esempio n. 2
0
    def test_hash_and_multihash(self):
        """Compare IndexBinaryHash / IndexBinaryMultiHash with a flat index.

        For each combination of ``nh`` (0 selects IndexBinaryHash, otherwise
        the number of hash tables of IndexBinaryMultiHash) and ``nbit``, the
        test counts how many of the flat index's top-k neighbours the hashed
        index also returns, and checks that:

        * no result row contains duplicate ids;
        * 4-bit hashes find more reference neighbours than 7-bit hashes;
        * a serialize / deserialize round trip returns identical results;
        * IndexBinaryHash and a 1-table multi-hash differ by fewer than 3
          hits at 7 bits, and more tables find strictly more neighbours.
        """
        d = 128
        nq = 100
        nb = 2000

        (_, xb, xq) = make_binary_dataset(d, 0, nb, nq)

        index_ref = faiss.IndexBinaryFlat(d)
        index_ref.add(xb)
        k = 10
        # Only the reference ids are needed; distances are discarded.
        _, Iref = index_ref.search(xq, k)

        nfound = {}
        for nh in 0, 1, 3, 5:

            for nbit in 4, 7:
                if nh == 0:
                    index = faiss.IndexBinaryHash(d, nbit)
                else:
                    index = faiss.IndexBinaryMultiHash(d, nh, nbit)
                index.add(xb)
                index.nflip = 2
                Dnew, Inew = index.search(xq, k)
                nf = 0
                for ref, new in zip(Iref, Inew):
                    snew = set(new)
                    # no duplicates
                    self.assertEqual(len(new), len(snew))
                    nf += len(set(ref) & snew)
                print('nfound', nh, nbit, nf)
                nfound[(nh, nbit)] = nf
            self.assertGreater(nfound[(nh, 4)], nfound[(nh, 7)])

            # test serialization
            # (uses the last index built above, i.e. the nbit == 7 one,
            # together with its Dnew / Inew results)
            index2 = faiss.deserialize_index_binary(
                faiss.serialize_index_binary(index))

            D2, I2 = index2.search(xq, k)
            np.testing.assert_array_equal(Inew, I2)
            np.testing.assert_array_equal(Dnew, D2)

        print('nfound=', nfound)
        self.assertGreater(3, abs(nfound[(0, 7)] - nfound[(1, 7)]))
        self.assertGreater(nfound[(3, 7)], nfound[(1, 7)])
        self.assertGreater(nfound[(5, 7)], nfound[(3, 7)])
Esempio n. 3
0
    def subtest_result_order(self, nh):
        """Check that search distances come back in non-decreasing order.

        ``nh == 0`` exercises IndexBinaryHash; any other value is used as the
        number of hash tables of an IndexBinaryMultiHash.
        """
        dim, n_queries, n_base = 128, 10, 200

        (_, base_vecs, query_vecs) = make_binary_dataset(
            dim, 0, n_base, n_queries)

        hash_bits = 10
        index = (
            faiss.IndexBinaryHash(dim, hash_bits)
            if nh == 0
            else faiss.IndexBinaryMultiHash(dim, nh, hash_bits)
        )
        index.add(base_vecs)
        index.nflip = 5

        n_neighbors = 10
        distances, _ = index.search(query_vecs, n_neighbors)

        # Within every row, each distance must be >= the one to its left.
        later, earlier = distances[:, 1:], distances[:, :-1]
        self.assertTrue(np.all(later >= earlier))
Esempio n. 4
0
    def test_multihash(self):
        """Range-search IndexBinaryMultiHash against a flat reference index.

        For 1, 3 and 5 hash tables, every query's range-search result must
        contain no duplicate ids and be a subset of the flat index's result
        for the same radius. The total number of results must not decrease
        as the number of hash tables grows.
        """
        d = 128
        nq = 100
        nb = 2000

        (_, xb, xq) = make_binary_dataset(d, 0, nb, nq)

        index_ref = faiss.IndexBinaryFlat(d)
        index_ref.add(xb)

        radius = 55

        # Lref holds the per-query offsets into Iref; distances are unused.
        Lref, _, Iref = index_ref.range_search(xq, radius)

        print("nb res: ", Lref[-1])

        nfound = []
        ndis = []

        # Global stats object; looked up once and reset before each search.
        stats = faiss.cvar.indexBinaryHash_stats

        for nh in 1, 3, 5:
            index = faiss.IndexBinaryMultiHash(d, nh, 10)
            index.add(xb)
            index.nflip = 2
            stats.reset()
            Lnew, _, Inew = index.range_search(xq, radius)
            for i in range(nq):
                ref = Iref[Lref[i]:Lref[i + 1]]
                new = Inew[Lnew[i]:Lnew[i + 1]]
                snew = set(new)
                # no duplicates
                self.assertEqual(len(new), len(snew))
                # subset of real results
                self.assertTrue(
                    snew.issubset(ref),
                    "query %d returned ids outside the reference result" % i)
            nfound.append(Lnew[-1])
            ndis.append(stats.ndis)
        print('nfound=', nfound)
        print('ndis=', ndis)
        nfound = np.array(nfound)
        # More hash tables must never find fewer results.
        self.assertTrue(np.all(nfound[1:] >= nfound[:-1]))
    def create(
        hashes: t.Iterable[PDQ_HASH_TYPE],
        nhash: int = 16,
        custom_ids: t.Optional[t.Iterable[int]] = None,
    ) -> "PDQMultiHashIndex":
        """
        Creates a PDQMultiHashIndex for use searching against the provided hashes.

        Parameters
        ----------
        hashes: sequence of PDQ Hashes
            The PDQ hashes to create the index with
        nhash: int (optional)
            Optional number of hashmaps for the underlying faiss index to use for
            the Multi-Index Hashing lookups.
        custom_ids: sequence of custom ids for the PDQ Hashes (optional)
            Optional sequence of custom id values to use for the PDQ hashes for any
            method relating to indexes (e.g., hash_at). If provided, the nth item in
            custom_ids will be used as the id for the nth hash in hashes. If not provided
            then the ids for the hashes will be assumed to be their respective index
            in hashes (i.e., the nth hash would have id n, starting from 0).

        Returns
        -------
        a PDQMultiHashIndex of these hashes

        Notes
        -----
        When ``hashes`` is empty nothing is added, and the underlying index is
        not wrapped in an ID map even if ``custom_ids`` is given.
        """
        # Decode each hex-encoded hash into a uint8 vector.
        vectors = [
            numpy.frombuffer(binascii.unhexlify(pdq_hash), dtype=numpy.uint8)
            for pdq_hash in hashes
        ]
        bits_per_hashmap = BITS_IN_PDQ // nhash
        index = faiss.IndexBinaryMultiHash(BITS_IN_PDQ, nhash, bits_per_hashmap)
        if vectors:
            # Identity test: `!= None` would be evaluated element-wise (and
            # then fail in the `if`) when custom_ids is a numpy array.
            if custom_ids is not None:
                index = faiss.IndexBinaryIDMap2(index)
                # Ids are converted with uint64_to_int64 before being handed
                # to faiss.
                i64_ids = [uint64_to_int64(custom_id) for custom_id in custom_ids]

                index.add_with_ids(numpy.array(vectors), numpy.array(i64_ids))
            else:
                index.add(numpy.array(vectors))
        return PDQMultiHashIndex(index)
 def __init__(self, nhash: int = 16):
     """Set up an empty multi-hash index wrapped in an ID map.

     ``nhash`` is the number of hash tables; each one is keyed on
     ``BITS_IN_PDQ // nhash`` bits. The wrapped faiss index is handed to
     the parent initializer, after which the reverse map is constructed.
     """
     multi_hash = faiss.IndexBinaryMultiHash(
         BITS_IN_PDQ, nhash, BITS_IN_PDQ // nhash
     )
     super().__init__(faiss.IndexBinaryIDMap2(multi_hash))
     self.__construct_index_rev_map()