def test_buffer_dam_add_or_update(tmpdir):
    """Verify add_or_update semantics of the memmap buffer pool: update in
    place, append when absent, LRU-evict when full, reuse freed slots."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=6)
    docs = list(random_docs(8))
    dam.extend(docs[:5])

    first = docs[0]
    first.content = 'new'
    # updating an id that is already buffered must not grow the buffer
    dam.buffer_pool.add_or_update(first.id, first)
    assert dam[0].content == first.content
    assert len(dam.buffer_pool.buffer) == 5

    # an unknown id is appended to the buffer
    dam.buffer_pool.add_or_update(docs[5].id, docs[5])
    assert len(dam.buffer_pool.buffer) == 6

    # buffer is now full: the least-recently-used entry is evicted
    # (docs[1] goes, since docs[0] was touched above)
    dam.buffer_pool.add_or_update(docs[6].id, docs[6])
    assert docs[6].id in dam.buffer_pool
    assert docs[1].id not in dam.buffer_pool

    # deleting an entry frees its slot, which the next insertion reuses
    del dam.buffer_pool[docs[4].id]
    assert 4 in dam.buffer_pool._empty
    dam.buffer_pool.add_or_update(docs[7].id, docs[7])
    assert dam.buffer_pool.doc_map[docs[7].id][0] == 4
class MyIndexer(Executor):
    """Simple indexer that stores documents in a memmapped array and serves
    exact nearest-neighbour search (cosine) over them."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # persisted under the executor workspace so the index survives restarts
        self._docs = DocumentArrayMemmap(self.workspace + '/indexer')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Store the incoming documents in the index.

        :param docs: documents to add to the index
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        """Append the single best match to each document in docs.

        :param docs: documents that are searched
        :param kwargs: other keyword arguments
        """
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=1,
        )
def test_setter_wrong_len(tmpdir):
    """Assigning fewer tag dicts than there are docs must raise."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    # one tag dict for 100 documents -> length mismatch
    with pytest.raises(ValueError, match='the number of tags in the'):
        dam.tags = [{'1': 2}]
def test_blobs_wrong_len(tmpdir):
    """Assigning fewer blobs than there are docs must raise."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    # only 2 blobs for 100 documents -> length mismatch
    wrong_blobs = np.ones((2, 10, 10))
    with pytest.raises(ValueError):
        dam.blobs = wrong_blobs
def test_texts_wrong_len(tmpdir):
    """Assigning fewer texts than there are docs must raise."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    # one text for 100 documents -> length mismatch
    with pytest.raises(ValueError):
        dam.texts = ['hello']
class MyIndexer(Executor):
    """Executor with basic exact search using cosine distance."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # index is memmapped inside the executor workspace
        self._docs = DocumentArrayMemmap(self.workspace + '/indexer')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Extend self._docs

        :param docs: DocumentArray containing Documents
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on=['/search', '/eval'])
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Append best matches to each document in docs

        :param docs: documents that are searched
        :param parameters: dictionary of pairs (parameter,value)
        :param kwargs: other keyword arguments
        """
        top_k = int(parameters['top_k'])
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=top_k,
        )
def test_scipy_dist(
    docarrays_for_embedding_distance_computation,
    normalization,
    metric,
    tmpdir,
    only_id,
):
    """Match scores computed against a DocumentArray and against a
    DocumentArrayMemmap (scipy backend) must be identical."""
    da1, da2 = docarrays_for_embedding_distance_computation
    da1_copy = copy.deepcopy(da1)
    da2_copy = copy.deepcopy(da2)

    # reference: in-memory DocumentArray
    da1.match(da2, metric=metric, limit=3, normalization=normalization, use_scipy=True)
    values_docarray = [m.scores[metric].value for d in da1 for m in d.matches]

    # same query against a memmap-backed copy of the index
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(da2_copy)
    da1_copy.match(
        dam,
        metric=metric,
        limit=3,
        normalization=normalization,
        use_scipy=True,
        only_id=only_id,
    )
    values_memmap = [m.scores[metric].value for d in da1_copy for m in d.matches]

    np.testing.assert_equal(values_docarray, values_memmap)
def test_match_handle_different_limit(get_two_docarray, limit, tmpdir):
    """`limit=None` and `limit=-1` mean "all matches"; otherwise exactly
    `limit` matches are attached."""
    da1, da2 = get_two_docarray
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(da2)

    da1.match(dam, limit=limit)

    if limit in (None, -1):
        expected_length = len(da2)
    else:
        expected_length = limit
    assert len(da1[0].matches) == expected_length
def test_embeddings_wrong_len(tmpdir):
    """Assigning fewer embedding rows than there are docs must raise."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    # only 2 rows for 100 documents -> length mismatch
    wrong_embeddings = np.ones((2, 10, 10))
    with pytest.raises(ValueError, match='the number of rows in the'):
        dam.embeddings = wrong_embeddings
def test_texts_getter_dam(tmpdir):
    """`.texts` must agree with `get_attributes('text')`."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document(text='hello') for _ in range(100)])

    via_property = dam.texts
    via_attributes = dam.get_attributes('text')

    assert len(via_property) == 100
    assert via_property == via_attributes
def test_blobs_setter_dam(tmpdir):
    """Bulk-assigned blobs must round-trip through the memmap, both via the
    `.blobs` getter and per-document `.blob`."""
    blobs = np.random.random((100, 10, 10))
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in blobs])

    dam.blobs = blobs

    np.testing.assert_almost_equal(dam.blobs, blobs)
    for expected, doc in zip(blobs, dam):
        np.testing.assert_almost_equal(expected, doc.blob)
def test_buffer_dam_clear(tmpdir):
    """After clearing the buffer pool no previously extended doc remains in it."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=5)
    docs = list(random_docs(5))
    dam.extend(docs)

    dam.buffer_pool.clear()

    assert all(doc.id not in dam.buffer_pool for doc in docs)
def test_traverse(tmpdir, mocker):
    """Traversing the 'c' (chunk) path yields granularity-1 docs; the mock
    guarantees the loop body actually ran at least once."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(random_docs(100))

    visited = mocker.Mock()
    for chunk in dam.traverse_flat(['c']):
        assert chunk.granularity == 1
        visited()
    visited.assert_called()
def test_embeddings_setter_dam(tmpdir):
    """Bulk-assigned embeddings must round-trip through the memmap, both via
    the `.embeddings` getter and per-document `.embedding`."""
    embeddings = np.random.random((100, 128))
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])

    dam.embeddings = embeddings

    np.testing.assert_almost_equal(dam.embeddings, embeddings)
    for expected, doc in zip(embeddings, dam):
        np.testing.assert_almost_equal(expected, doc.embedding)
def test_tags_setter_dam(tmpdir):
    """Bulk-assigned tags must round-trip, both via the `.tags` getter and
    per-document `.tags`."""
    dam = DocumentArrayMemmap(tmpdir)
    all_tags = [{'a': 2, 'c': 'd'} for _ in range(100)]
    dam.extend([Document() for _ in range(100)])

    dam.tags = all_tags

    assert dam.tags == all_tags
    for expected, doc in zip(all_tags, dam):
        assert expected == doc.tags
def test_memmap_update_in_memory(tmpdir):
    """With a buffer pool large enough to hold every doc, mutating the
    original Document objects is reflected when reading back from the dam."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=100)
    originals = list(random_docs(100))
    dam.extend(originals)

    for idx, original in enumerate(originals):
        original.content = f'new content {idx}'

    for idx, stored in enumerate(dam):
        assert stored.content == f'new content {idx}'
def test_texts_setter_dam(tmpdir):
    """Bulk-assigned texts must round-trip, both via the `.texts` getter and
    per-document `.text`."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])

    all_texts = ['text' for _ in range(100)]
    dam.texts = all_texts

    assert dam.texts == all_texts
    for expected, doc in zip(all_texts, dam):
        assert expected == doc.text
def test_memmap_update_document(tmpdir):
    """Explicitly reassigning `dam[idx]` persists the updated content."""
    dam = DocumentArrayMemmap(tmpdir)
    originals = list(random_docs(100))
    dam.extend(originals)

    for idx, original in enumerate(originals):
        original.content = f'new content {idx}'
        dam[idx] = original

    for idx, stored in enumerate(dam):
        assert stored.content == f'new content {idx}'
def test_memmap_buffer_synched(tmpdir):
    """Setting `dam[i] = doc` keeps the buffer pool in sync, and later
    in-place mutation of `doc` is visible when reading back by id."""
    docs = list(random_docs(100))
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(docs[:50])

    for i, replacement in enumerate(docs[50:]):
        dam[i] = replacement
        # buffer pool holds the same doc that the dam serves at index i
        assert dam._buffer_pool[replacement.id].id == dam[i].id
        replacement.content = 'new'
        assert dam[replacement.id].content == 'new'
def test_error(tmpdir):
    """On an empty dam: string keys raise KeyError, int indices IndexError,
    for both read and delete."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.clear()

    with pytest.raises(KeyError):
        dam['12']
    with pytest.raises(IndexError):
        dam[1]
    with pytest.raises(IndexError):
        del dam[1]
    with pytest.raises(KeyError):
        del dam['12']
def test_shuffle(tmpdir):
    """`shuffle` returns a DocumentArray that is a permutation of the dam:
    same ids, same length, different order."""
    da = DocumentArrayMemmap(tmpdir)
    da.extend(list(random_docs(100)))

    shuffled = da.shuffle()

    assert isinstance(shuffled, DocumentArray)
    assert len(shuffled) == len(da)

    original_ids = [d.id for d in da]
    shuffled_ids = [d.id for d in shuffled]
    # order changed but the multiset of ids is preserved
    assert original_ids != shuffled_ids
    assert sorted(original_ids) == sorted(shuffled_ids)
def test_convert_dm_to_dam(tmpdir, mocker):
    """A dam extended from a DocumentArray owns its data: clearing the
    source does not affect the dam's contents."""
    dam = DocumentArrayMemmap(tmpdir)
    da = DocumentArray(random_docs(100))
    dam.extend(da)
    da.clear()

    iterated = mocker.Mock()
    for doc in dam:
        assert doc
        iterated()
    iterated.assert_called()

    assert len(da) == 0
    assert len(dam) == 100
def test_buffer_dam_delete(tmpdir):
    """Deleting an evicted doc from the buffer raises KeyError, while
    `delete_if_exists` is a silent no-op."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=5)
    docs = list(random_docs(6))
    dam.extend(docs)

    evicted = docs[0]  # pool holds 5, so the first doc was pushed out
    with pytest.raises(KeyError):
        del dam.buffer_pool[evicted.id]

    # must not raise for a missing key
    dam.buffer_pool.delete_if_exists(evicted.id)
def doc_lists_to_doc_arrays(
    doc_lists, tmpdir, first_memmap, second_memmap, buffer_pool_size
):
    """Build two document collections from `doc_lists`, each either an
    in-memory DocumentArray or a memmap-backed one, per the flags."""
    doc_list1, doc_list2 = doc_lists

    if first_memmap:
        d1 = DocumentArrayMemmap(tmpdir / '1', buffer_pool_size=buffer_pool_size)
    else:
        d1 = DocumentArray()
    d1.extend(doc_list1)

    if second_memmap:
        d2 = DocumentArrayMemmap(tmpdir / '2', buffer_pool_size=buffer_pool_size)
    else:
        d2 = DocumentArray()
    d2.extend(doc_list2)

    return d1, d2
def test_batch_iterator_dam(tmpdir):
    """Batches of size 2 over 4 docs visit every doc exactly once, in order."""
    dam = DocumentArrayMemmap(tmpdir)
    for i in range(4):
        dam.append(Document(id=i))

    expected_ids = iter(range(4))
    for batch in batch_iterator(dam, 2):
        for doc in batch:
            assert int(doc.id) == next(expected_ids)

    # every expected id must have been consumed
    with pytest.raises(StopIteration):
        next(expected_ids)
def test_buffer_dam_getitem(tmpdir):
    """Buffer-pool lookup works by string id only; slices and ints raise."""
    dam = DocumentArrayMemmap(tmpdir)
    docs = list(random_docs(10))
    dam.extend(docs)

    # lookup by id returns the same document
    for doc in docs:
        buffered = dam.buffer_pool[doc.id]
        assert buffered.content_hash == doc.content_hash
        assert buffered.id == doc.id

    with pytest.raises(TypeError):
        dam.buffer_pool[1:5]
    with pytest.raises(TypeError):
        dam.buffer_pool[0]
class KeyValueIndexer(Executor):
    """Key-value store mapping document ids to full documents, used to
    enrich matches with their stored content."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # persisted under the executor workspace
        self._docs = DocumentArrayMemmap(self.workspace + '/kv-idx')

    @requests(on='/index')
    def index(self, docs: DocumentArray, **kwargs):
        """Store the incoming documents.

        :param docs: documents to store
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def query(self, docs: DocumentArray, **kwargs):
        """Replace each match's payload with the stored parent document.

        :param docs: documents whose matches are enriched
        :param kwargs: other keyword arguments
        """
        for doc in docs:
            for match in doc.matches:
                stored = self._docs[match.parent_id]
                match.update(stored)
def memmap_for_split(tmpdir):
    """Fixture helper: a dam with five docs tagged c, c, b, a, a (in order)."""
    da = DocumentArrayMemmap(tmpdir)
    for category in ('c', 'c', 'b', 'a', 'a'):
        da.append(Document(tags={'category': category}))
    return da
def test_memmap_delete_by_slice(tmpdir):
    """Slice deletion removes exactly the tail/head docs and nothing else."""
    dam = DocumentArrayMemmap(tmpdir)
    originals = list(random_docs(100))
    for doc in originals:
        doc.id = f'id_{doc.id}'
    dam.extend(originals)
    assert len(dam) == 100

    del dam[-5:]
    assert len(dam) == 95
    del dam[:5]
    assert len(dam) == 90

    # none of the ten deleted docs survives in the dam
    removed = originals[:5] + originals[-5:]
    remaining_ids = {d.id for d in dam}
    for doc in removed:
        assert doc.id not in remaining_ids
class DocVectorIndexer(Executor):
    """Vector indexer storing docs in a memmapped array named by
    `index_file_name`, serving cosine top-k search."""

    def __init__(self, index_file_name: str, **kwargs):
        super().__init__(**kwargs)
        # index lives in a per-name file inside the executor workspace
        self._docs = DocumentArrayMemmap(self.workspace + f'/{index_file_name}')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Store the incoming documents in the index.

        :param docs: documents to add
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Attach the top-k cosine matches to each query document.

        :param docs: query documents
        :param parameters: must contain 'top_k'
        :param kwargs: other keyword arguments
        """
        top_k = int(parameters['top_k'])
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=top_k,
        )