def test_buffer_dam_add_or_update(tmpdir):
    """Exercise ``BufferPool.add_or_update``: update-in-place, append, LRU
    eviction when full, and reuse of a freed slot after deletion."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=6)
    docs = list(random_docs(8))
    dam.extend(docs[:5])  # buffer now holds docs[0..4], one free slot left
    doc1 = docs[0]
    doc1.content = 'new'
    # doc1 already exists => update (buffer size must not grow)
    dam.buffer_pool.add_or_update(doc1.id, doc1)
    assert dam[0].content == doc1.content
    assert len(dam.buffer_pool.buffer) == 5
    # doc does not exist => add to buffer (fills the 6th and last slot)
    dam.buffer_pool.add_or_update(docs[5].id, docs[5])
    assert len(dam.buffer_pool.buffer) == 6
    # buffer is full => remove the LRU (docs[1], because docs[0] was used before)
    dam.buffer_pool.add_or_update(docs[6].id, docs[6])
    assert docs[6].id in dam.buffer_pool
    assert docs[1].id not in dam.buffer_pool
    del dam.buffer_pool[docs[4].id]
    # spot number 4 becomes empty
    assert 4 in dam.buffer_pool._empty
    # a subsequent add must be placed into the freed slot (index 4)
    dam.buffer_pool.add_or_update(docs[7].id, docs[7])
    assert dam.buffer_pool.doc_map[docs[7].id][0] == 4
class MyIndexer(Executor):
    """Simple indexer that stores documents in a ``DocumentArrayMemmap``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # persist indexed documents under this executor's workspace
        self._docs = DocumentArrayMemmap(self.workspace + '/indexer')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add ``docs`` to the internal memmap storage.

        :param docs: documents to index
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        """Append the single best match to each document in docs

        :param docs: documents that are searched
        :param kwargs: other keyword arguments
        """
        # fixed limit=1: only the top cosine match is attached
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=1,
        )
def test_texts_getter_dam(tmpdir):
    """The ``texts`` property must mirror ``get_attributes('text')``."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document(text='hello') for _ in range(100)])
    assert len(dam.texts) == 100
    via_property = dam.texts
    via_getter = dam.get_attributes('text')
    assert via_property == via_getter
def test_embeddings_wrong_len(tmpdir):
    """Assigning embeddings whose row count mismatches the array must raise."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    mismatched = np.ones((2, 10, 10))
    with pytest.raises(ValueError, match='the number of rows in the'):
        dam.embeddings = mismatched
def test_setter_wrong_len(tmpdir):
    """Assigning fewer tags than documents must raise ``ValueError``."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    single_tag = [{'1': 2}]
    with pytest.raises(ValueError, match='the number of tags in the'):
        dam.tags = single_tag
def test_memmap_save_reload(tmpdir):
    """In-memory edits are invisible on disk until ``flush``; a second view
    only sees them after ``reload``."""
    docs = list(random_docs(100))
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=100)
    dam.extend(docs)
    second_view = DocumentArrayMemmap(tmpdir)
    # mutate the buffered copies only — nothing persisted yet
    for d in docs:
        d.content = 'new'
    for d in dam:
        assert d.content == 'new'  # served from the in-memory buffer
        # the on-disk copy is still the original
        assert dam._get_doc_by_key(d.id).content == 'hello world'
    # second_view reads from disk (empty memory buffer + dam not persisted)
    for d in second_view:
        assert d.content == 'hello world'
    dam.flush()
    second_view.reload()
    # disk now carries the edits
    for d in dam:
        assert dam._get_doc_by_key(d.id).content == 'new'
    # and second_view is up-to-date after reload
    for d in second_view:
        assert d.content == 'new'
def test_texts_wrong_len(tmpdir):
    """Assigning fewer texts than documents must raise ``ValueError``."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    with pytest.raises(ValueError):
        dam.texts = ['hello']
def test_blobs_wrong_len(tmpdir):
    """Assigning fewer blobs than documents must raise ``ValueError``."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    with pytest.raises(ValueError):
        dam.blobs = np.ones((2, 10, 10))
class MyIndexer(Executor):
    """Executor providing exact nearest-neighbour search via cosine distance."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # documents are persisted in a memmap under this executor's workspace
        self._docs = DocumentArrayMemmap(self.workspace + '/indexer')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Extend self._docs

        :param docs: DocumentArray containing Documents
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on=['/search', '/eval'])
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Append best matches to each document in docs

        :param docs: documents that are searched
        :param parameters: dictionary of pairs (parameter,value)
        :param kwargs: other keyword arguments
        """
        # the request's top_k parameter bounds how many matches are attached
        top_k = int(parameters['top_k'])
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=top_k,
        )
def test_match_handle_different_limit(get_two_docarray, limit, tmpdir):
    """``match`` against a memmap honours ``limit``; None/-1 mean 'all'."""
    query_da, index_da = get_two_docarray
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(index_da)
    query_da.match(dam, limit=limit)
    if limit in (None, -1):
        expected_length = len(index_da)
    else:
        expected_length = limit
    assert len(query_da[0].matches) == expected_length
def test_scipy_dist(docarrays_for_embedding_distance_computation, normalization, metric, tmpdir, only_id):
    """Scipy-backed matching must yield identical scores whether the index is
    a ``DocumentArray`` or a ``DocumentArrayMemmap``."""
    D1, D2 = docarrays_for_embedding_distance_computation
    # deep copies so both match runs start from identical, unmutated inputs
    query_copy = copy.deepcopy(D1)
    index_copy = copy.deepcopy(D2)
    D1.match(D2, metric=metric, limit=3, normalization=normalization, use_scipy=True)
    scores_in_memory = [m.scores[metric].value for d in D1 for m in d.matches]
    memmap_index = DocumentArrayMemmap(tmpdir)
    memmap_index.extend(index_copy)
    query_copy.match(
        memmap_index,
        metric=metric,
        limit=3,
        normalization=normalization,
        use_scipy=True,
        only_id=only_id,
    )
    scores_memmap = [
        m.scores[metric].value for d in query_copy for m in d.matches
    ]
    np.testing.assert_equal(scores_in_memory, scores_memmap)
def test_persist(tmpdir):
    """Writes (extend/del/clear) are visible to a second memmap view only
    after it calls ``reload``; scores and evaluations survive persistence."""
    dam = DocumentArrayMemmap(tmpdir)
    docs = list(random_docs(100))
    for d in docs:
        d.scores['score'] = 50
        d.evaluations['eval'] = 100
    dam.extend(docs)
    reopened = DocumentArrayMemmap(tmpdir)
    assert len(reopened) == 100
    assert dam == reopened
    for original, persisted in zip(dam, reopened):
        assert original.proto == persisted.proto
    assert '1' in dam
    del dam['1']
    # the deletion is not visible until the other view reloads
    assert len(reopened) == 100
    reopened.reload()
    assert len(reopened) == 99
    for persisted in reopened:
        assert persisted.scores['score'].value == 50
        assert persisted.evaluations['eval'].value == 100
    dam.clear()
    # again: stale until reload
    assert len(reopened) == 99
    reopened.reload()
    assert len(reopened) == 0
def test_traverse(tmpdir, mocker):
    """``traverse_flat(['c'])`` yields chunks (granularity 1) and is non-empty."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(random_docs(100))
    visited = mocker.Mock()
    for chunk in dam.traverse_flat(['c']):
        assert chunk.granularity == 1
        visited()
    # guard against the loop body never running
    visited.assert_called()
def test_buffer_dam_clear(tmpdir):
    """``buffer_pool.clear`` evicts every cached document."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=5)
    cached_docs = list(random_docs(5))
    dam.extend(cached_docs)
    dam.buffer_pool.clear()
    for d in cached_docs:
        assert d.id not in dam.buffer_pool
def test_blobs_setter_dam(tmpdir):
    """Bulk-assigning ``blobs`` stores each row on the matching document."""
    expected = np.random.random((100, 10, 10))
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(len(expected))])
    dam.blobs = expected
    np.testing.assert_almost_equal(dam.blobs, expected)
    for row, doc in zip(expected, dam):
        np.testing.assert_almost_equal(row, doc.blob)
def test_memmap_update_in_memory(tmpdir):
    """With a buffer large enough to hold everything, mutating the original
    documents is reflected when iterating the memmap (shared buffer refs)."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=100)
    originals = list(random_docs(100))
    dam.extend(originals)
    for i, original in enumerate(originals):
        original.content = f'new content {i}'
    for i, stored in enumerate(dam):
        assert stored.content == f'new content {i}'
def test_sample(tmpdir):
    """``sample(k)`` returns a ``DocumentArray`` of size k and rejects k > len."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(list(random_docs(100)))
    subset = dam.sample(5)
    assert isinstance(subset, DocumentArray)
    assert len(subset) == 5
    # oversampling is an error
    with pytest.raises(ValueError):
        dam.sample(101)
def test_texts_setter_dam(tmpdir):
    """Bulk-assigning ``texts`` stores each text on the matching document."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    expected = ['text'] * 100
    dam.texts = expected
    assert dam.texts == expected
    for text, doc in zip(expected, dam):
        assert doc.text == text
def test_embeddings_setter_dam(tmpdir):
    """Bulk-assigning ``embeddings`` stores each row on the matching document."""
    expected = np.random.random((100, 128))
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    dam.embeddings = expected
    np.testing.assert_almost_equal(dam.embeddings, expected)
    for row, doc in zip(expected, dam):
        np.testing.assert_almost_equal(doc.embedding, row)
def test_tags_setter_dam(tmpdir):
    """Bulk-assigning ``tags`` stores each dict on the matching document."""
    dam = DocumentArrayMemmap(tmpdir)
    expected = [{'a': 2, 'c': 'd'} for _ in range(100)]
    dam.extend([Document() for _ in range(100)])
    dam.tags = expected
    assert dam.tags == expected
    for tag, doc in zip(expected, dam):
        assert doc.tags == tag
def test_shuffle_with_seed(tmpdir):
    """Equal seeds give identical shuffles; different seeds differ."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(list(random_docs(100)))
    first = dam.shuffle(seed=1)
    second = dam.shuffle(seed=1)
    third = dam.shuffle(seed=2)
    assert len(first) == len(second) == len(third) == len(dam)
    assert first == second
    assert first != third
def test_memmap_update_document(tmpdir):
    """Item assignment (``dam[i] = doc``) replaces the stored document."""
    dam = DocumentArrayMemmap(tmpdir)
    replacements = list(random_docs(100))
    dam.extend(replacements)
    for i, replacement in enumerate(replacements):
        replacement.content = f'new content {i}'
        dam[i] = replacement
    for i, stored in enumerate(dam):
        assert stored.content == f'new content {i}'
def test_memmap_buffer_synched(tmpdir):
    """After item assignment the buffer pool and the memmap stay in sync,
    and later mutations of the assigned doc are visible via key lookup."""
    docs = list(random_docs(100))
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(docs[:50])
    for position, replacement in enumerate(docs[50:]):
        dam[position] = replacement
        assert dam._buffer_pool[replacement.id].id == dam[position].id
        replacement.content = 'new'
        assert dam[replacement.id].content == 'new'
def test_prune_save_space(tmpdir):
    """``prune`` after a deletion shrinks both header and body files."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(random_docs(100))
    header_path = os.path.join(tmpdir, 'header.bin')
    body_path = os.path.join(tmpdir, 'body.bin')
    header_before = os.stat(header_path).st_size
    body_before = os.stat(body_path).st_size
    del dam['2']
    dam.prune()
    assert os.stat(body_path).st_size < body_before
    assert os.stat(header_path).st_size < header_before
def test_shuffle(tmpdir):
    """``shuffle`` permutes the order but keeps the same set of documents."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(list(random_docs(100)))
    shuffled = dam.shuffle()
    assert isinstance(shuffled, DocumentArray)
    assert len(shuffled) == len(dam)
    original_ids = [d.id for d in dam]
    shuffled_ids = [d.id for d in shuffled]
    assert original_ids != shuffled_ids
    # same multiset of ids, different order
    assert sorted(original_ids) == sorted(shuffled_ids)
def test_convert_dm_to_dam(tmpdir, mocker):
    """Extending a memmap copies the documents: clearing the source
    ``DocumentArray`` must not affect the memmap."""
    dam = DocumentArrayMemmap(tmpdir)
    source = DocumentArray(random_docs(100))
    dam.extend(source)
    source.clear()
    visited = mocker.Mock()
    for doc in dam:
        assert doc
        visited()
    # guard against the loop body never running
    visited.assert_called()
    assert len(source) == 0
    assert len(dam) == 100
def test_buffer_dam_delete(tmpdir):
    """Deleting a missing key raises ``KeyError``; ``delete_if_exists`` does not."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=5)
    docs = list(random_docs(6))
    dam.extend(docs)
    evicted = docs[0]
    # the first element should be out of buffer, so plain delete fails
    with pytest.raises(KeyError):
        del dam.buffer_pool[evicted.id]
    # the tolerant variant raises no exception
    dam.buffer_pool.delete_if_exists(evicted.id)
def test_memmap_append_extend(tmpdir):
    """``append`` and ``extend`` both store documents byte-identically."""
    dam = DocumentArrayMemmap(tmpdir)
    docs = list(random_docs(100))
    assert len(dam) == 0
    for doc in docs[:40]:
        dam.append(doc)
    assert len(dam) == 40
    for expected, stored in zip(docs[:40], dam):
        assert expected.proto == stored.proto
    dam.extend(docs[40:])
    assert len(dam) == 100
    for expected, stored in zip(docs, dam):
        assert expected.proto == stored.proto
def test_buffer_dam_getitem(tmpdir):
    """Buffer-pool lookup works by document id only; slices and ints raise."""
    dam = DocumentArrayMemmap(tmpdir)
    docs = list(random_docs(10))
    dam.extend(docs)
    for doc in docs:
        # same doc when getting by key
        assert dam.buffer_pool[doc.id].content_hash == doc.content_hash
        assert dam.buffer_pool[doc.id].id == doc.id
    with pytest.raises(TypeError):
        dam.buffer_pool[1:5]
    with pytest.raises(TypeError):
        dam.buffer_pool[0]
def test_buffers_getter_setter(tmpdir):
    """``buffers`` round-trips bytes; wrong length or non-bytes elements raise."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(
        [
            Document(buffer=b'aa'),
            Document(buffer=b'bb'),
            Document(buffer=b'cc'),
        ]
    )
    assert dam.buffers == [b'aa', b'bb', b'cc']
    dam.buffers = [b'cc', b'bb', b'aa']
    assert dam.buffers == [b'cc', b'bb', b'aa']
    # one element too many
    with pytest.raises(ValueError):
        dam.buffers = [b'cc', b'bb', b'aa', b'dd']
    # str instead of bytes
    with pytest.raises(TypeError):
        dam.buffers = ['aa', 'bb', 'cc']