def test_write_580M(self):
    """Regression test: serialize and deserialize an IndexBinaryMultiHash
    holding 580 million entries.

    Per the original comment, 570M worked while 580M failed — presumably
    an overflow in the binary-index I/O path; TODO confirm root cause.
    """
    import os
    import tempfile

    dim = 8
    nhash = 1
    num_million = 580  # changing to 570 works

    index1 = faiss.IndexBinaryMultiHash(dim, nhash, dim // nhash)
    random_hash_codes = np.random.randint(
        0, 256, (num_million * int(1e6), dim // 8)).astype("uint8")
    index1.add(random_hash_codes)

    # Use a unique temp file rather than a fixed /tmp path so concurrent
    # runs do not clobber each other, and remove the (large) file after.
    fd, fname = tempfile.mkstemp(suffix=".faiss")
    os.close(fd)
    try:
        faiss.write_index_binary(index1, fname)
        index2 = faiss.read_index_binary(fname)
    finally:
        os.remove(fname)

    # The round-tripped index must contain the same number of entries.
    self.assertEqual(index1.ntotal, index2.ntotal)
def test_hash_and_multihash(self):
    """Compare IndexBinaryHash / IndexBinaryMultiHash search results
    against brute-force IndexBinaryFlat ground truth, and verify that
    serialization round-trips both distances and ids exactly."""
    d, nq, nb = 128, 100, 2000
    (_, xb, xq) = make_binary_dataset(d, 0, nb, nq)

    # brute-force reference results
    index_ref = faiss.IndexBinaryFlat(d)
    index_ref.add(xb)
    k = 10
    Dref, Iref = index_ref.search(xq, k)

    nfound = {}
    for nh in 0, 1, 3, 5:
        for nbit in 4, 7:
            # nh == 0 selects the single-table hash index variant
            if nh == 0:
                index = faiss.IndexBinaryHash(d, nbit)
            else:
                index = faiss.IndexBinaryMultiHash(d, nh, nbit)
            index.add(xb)
            index.nflip = 2
            Dnew, Inew = index.search(xq, k)

            nf = 0
            for qi in range(nq):
                gt_ids = Iref[qi]
                got = Inew[qi]
                got_set = set(got)
                # result lists must be free of duplicates
                self.assertTrue(len(got) == len(got_set))
                nf += len(set(gt_ids) & got_set)
            print('nfound', nh, nbit, nf)
            nfound[(nh, nbit)] = nf

        # coarser (4-bit) buckets should retrieve more true neighbors
        self.assertGreater(nfound[(nh, 4)], nfound[(nh, 7)])

        # test serialization
        index2 = faiss.deserialize_index_binary(
            faiss.serialize_index_binary(index))
        D2, I2 = index2.search(xq, k)
        np.testing.assert_array_equal(Inew, I2)
        np.testing.assert_array_equal(Dnew, D2)

    print('nfound=', nfound)
    # single-table and 1-table multi-hash should behave near-identically
    self.assertGreater(3, abs(nfound[(0, 7)] - nfound[(1, 7)]))
    # more tables -> better recall
    self.assertGreater(nfound[(3, 7)], nfound[(1, 7)])
    self.assertGreater(nfound[(5, 7)], nfound[(3, 7)])
def subtest_result_order(self, nh):
    """Verify that search results are returned in non-decreasing distance
    order, for the hash (nh == 0) or multi-hash (nh > 0) index."""
    d, nq, nb = 128, 10, 200
    (_, xb, xq) = make_binary_dataset(d, 0, nb, nq)

    nbit = 10
    index = (faiss.IndexBinaryHash(d, nbit) if nh == 0
             else faiss.IndexBinaryMultiHash(d, nh, nbit))
    index.add(xb)
    index.nflip = 5

    k = 10
    Do, Io = index.search(xq, k)
    # each row of distances must be sorted ascending
    self.assertTrue(np.all(Do[:, 1:] >= Do[:, :-1]))
def test_multihash(self):
    """Range-search with IndexBinaryMultiHash: results must be a
    duplicate-free subset of the brute-force results, and the number of
    results found must not decrease as hash tables are added."""
    d, nq, nb = 128, 100, 2000
    (_, xb, xq) = make_binary_dataset(d, 0, nb, nq)

    # brute-force reference range-search results
    index_ref = faiss.IndexBinaryFlat(d)
    index_ref.add(xb)
    radius = 55
    Lref, Dref, Iref = index_ref.range_search(xq, radius)
    print("nb res: ", Lref[-1])

    nfound = []
    ndis = []
    for nh in 1, 3, 5:
        index = faiss.IndexBinaryMultiHash(d, nh, 10)
        index.add(xb)
        stats = faiss.cvar.indexBinaryHash_stats
        index.nflip = 2
        stats.reset()
        Lnew, Dnew, Inew = index.range_search(xq, radius)
        for q in range(nq):
            expected = set(Iref[Lref[q]:Lref[q + 1]])
            got = Inew[Lnew[q]:Lnew[q + 1]]
            got_set = set(got)
            # no duplicate ids in the returned results
            self.assertTrue(len(got) == len(got_set))
            # every returned id must be a true within-radius result
            self.assertTrue(got_set <= expected)
        nfound.append(Lnew[-1])
        ndis.append(stats.ndis)

    print('nfound=', nfound)
    print('ndis=', ndis)
    # adding hash tables should only ever increase the result count
    nfound = np.array(nfound)
    self.assertTrue(np.all(nfound[1:] >= nfound[:-1]))
def create(
    hashes: t.Iterable[PDQ_HASH_TYPE],
    nhash: int = 16,
    custom_ids: t.Optional[t.Iterable[int]] = None,
) -> "PDQMultiHashIndex":
    """
    Creates a PDQMultiHashIndex for use searching against the provided
    hashes.

    Parameters
    ----------
    hashes: sequence of PDQ Hashes
        The PDQ hashes to create the index with
    nhash: int (optional)
        Optional number of hashmaps for the underlaying faiss index to use
        for the Multi-Index Hashing lookups.
    custom_ids: sequence of custom ids for the PDQ Hashes (optional)
        Optional sequence of custom id values to use for the PDQ hashes for
        any method relating to indexes (e.g., hash_at). If provided, the
        nth item in custom_ids will be used as the id for the nth hash in
        hashes. If not provided then the ids for the hashes will be assumed
        to be their respective index in hashes (i.e., the nth hash would
        have id n, starting from 0).

    Returns
    -------
    a PDQMultiHashIndex of these hashes
    """
    # decode hex-string hashes into raw uint8 vectors faiss can ingest
    # (avoid `hash` as a name -- it shadows the builtin)
    hash_bytes = [binascii.unhexlify(h) for h in hashes]
    vectors = [numpy.frombuffer(b, dtype=numpy.uint8) for b in hash_bytes]

    bits_per_hashmap = BITS_IN_PDQ // nhash
    index = faiss.IndexBinaryMultiHash(BITS_IN_PDQ, nhash, bits_per_hashmap)
    if vectors:
        if custom_ids is not None:  # identity check, not `!= None`
            # wrap in an IDMap2 so faiss stores/returns the caller's ids
            index = faiss.IndexBinaryIDMap2(index)
            i64_ids = [uint64_to_int64(cid) for cid in custom_ids]
            index.add_with_ids(numpy.array(vectors), numpy.array(i64_ids))
        else:
            index.add(numpy.array(vectors))
    return PDQMultiHashIndex(index)
def __init__(self, nhash: int = 16):
    """Create an empty index backed by an IndexBinaryMultiHash with
    `nhash` hashmaps, wrapped in an IndexBinaryIDMap2 so entries can
    carry custom ids."""
    bits_per_hashmap = BITS_IN_PDQ // nhash
    inner = faiss.IndexBinaryMultiHash(BITS_IN_PDQ, nhash, bits_per_hashmap)
    super().__init__(faiss.IndexBinaryIDMap2(inner))
    self.__construct_index_rev_map()