def test_dbms_keyvalue(tmpdir, test_metas):
    docs = list(get_documents(chunks=False, nr=10, same_content=True))
    ids, vecs, meta = zip(*[
        (doc.id, doc.embedding, _doc_without_embedding(doc).SerializeToString())
        for doc in docs
    ])

    save_path = None
    with KeyValueDBMSIndexer(index_filename='dbms', metas=test_metas) as indexer:
        indexer.add(ids, vecs, meta)
        assert indexer.size == len(docs)
        save_path = indexer.save_abspath

    new_docs = list(get_documents(chunks=False, nr=10, same_content=False))
    ids, vecs, meta = zip(*[
        (doc.id, doc.embedding, _doc_without_embedding(doc).SerializeToString())
        for doc in new_docs
    ])

    # assert contents update
    with BaseDBMSIndexer.load(save_path) as indexer:
        indexer.update(ids, vecs, meta)
        assert indexer.size == len(docs)

    # assert contents delete
    with BaseDBMSIndexer.load(save_path) as indexer:
        indexer.delete([d.id for d in docs])
        assert indexer.size == 0
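# The test above calls a `_doc_without_embedding` helper that is not defined in
# this excerpt. A minimal sketch of what it is assumed to do, mirroring the
# `DBMSIndexDriver._doc_without_embedding` call used further below: copy the
# document, drop its embedding, and return the remaining proto-backed object so
# only the metadata payload is serialized into the key-value store. The
# `from jina import Document` import and the `ClearField` delegation to the
# underlying proto are assumptions about the surrounding test file.
from jina import Document


def _doc_without_embedding(d):
    # copy the document and strip the vector, keeping id/content/tags
    new_doc = Document(d, copy=True)
    new_doc.ClearField('embedding')
    return new_doc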
def test_cache_driver_multiple_fields(test_metas):
    docs1 = list(get_documents(0, same_content=True, same_tag_content=False, index_start=0))
    docs2 = list(get_documents(0, same_content=True, same_tag_content=False, index_start=0))
    filename = 'cache'
    test_metas['name'] = filename
    driver = MockBaseCacheDriver()

    with DocCache(filename, metas=test_metas, fields=(CONTENT_HASH_KEY, 'tags__tag_field')) as executor:
        driver.attach(executor=executor, runtime=None)
        driver._apply_all(docs1)
        with pytest.raises(NotImplementedError):
            driver._apply_all(docs2)
        assert executor.size == len(docs1)

    with BaseExecutor.load(executor.save_abspath) as executor:
        driver.attach(executor=executor, runtime=None)
        with pytest.raises(NotImplementedError):
            driver._apply_all(docs1)
        assert executor.size == len(docs1)

    # switching the order of the fields doesn't matter
    with DocCache(metas=test_metas, fields=('tags__tag_field', CONTENT_HASH_KEY)) as executor:
        driver.attach(executor=executor, runtime=None)
        with pytest.raises(NotImplementedError):
            driver._apply_all(docs1)
        with pytest.raises(AssertionError):
            # TODO(cristian): size should be loaded if there is an existing cache?
            assert executor.size == len(docs1)
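# `MockBaseCacheDriver` is defined outside this excerpt. A minimal sketch of
# what the test above assumes: a `BaseCacheDriver` subclass whose `on_hit`
# raises `NotImplementedError`, so a cache hit surfaces as the error asserted
# with `pytest.raises` instead of being silently skipped. The import path for
# `BaseCacheDriver` is an assumption about the library layout.
from jina.drivers.cache import BaseCacheDriver


class MockBaseCacheDriver(BaseCacheDriver):
    @property
    def exec_fn(self):
        return self._exec_fn

    def on_hit(self, req_doc, hit_result):
        # any hit in the cache makes the test fail loudly
        raise NotImplementedError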
def test_dbms_keyvalue(tmpdir, test_metas):
    docs = list(get_documents(chunks=False, nr=10, same_content=True))
    ids, vecs, metas = _get_ids_vecs_meta(docs)

    save_path = None
    with KeyValueDBMSIndexer(index_filename='dbms', metas=test_metas) as indexer:
        indexer.add(ids, vecs, metas)
        assert indexer.size == len(docs)
        save_path = indexer.save_abspath
        indexer.dump(os.path.join(tmpdir, 'dump1'), 2)

        # we can index and dump again in the same context
        docs2 = list(get_documents(chunks=False, nr=10, same_content=True, index_start=len(docs)))
        ids, vecs, metas = _get_ids_vecs_meta(docs2)
        indexer.add(ids, vecs, metas)
        assert indexer.size == 2 * len(docs)
        indexer.dump(os.path.join(tmpdir, 'dump2'), 3)

    new_docs = list(get_documents(chunks=False, nr=10, same_content=False))
    ids, vecs, meta = zip(*[
        (
            doc.id,
            doc.embedding,
            DBMSIndexDriver._doc_without_embedding(doc).SerializeToString(),
        )
        for doc in new_docs
    ])

    # assert contents update
    with BaseDBMSIndexer.load(save_path) as indexer:
        indexer.update(ids, vecs, meta)
        assert indexer.size == 2 * len(docs)

    # assert contents delete
    with BaseDBMSIndexer.load(save_path) as indexer:
        indexer.delete([d.id for d in docs])
        assert indexer.size == len(docs)
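# `_get_ids_vecs_meta` is not shown in this excerpt. Judging from the inline
# `zip(...)` over `new_docs` in the same test, it is assumed to unpack a list
# of documents into parallel tuples of ids, embeddings, and serialized
# metadata payloads:
def _get_ids_vecs_meta(docs):
    return zip(*[
        (
            doc.id,
            doc.embedding,
            DBMSIndexDriver._doc_without_embedding(doc).SerializeToString(),
        )
        for doc in docs
    ])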
def test_cache_crud(tmp_path, mocker, indexers, field, shards, chunks, same_content):
    flow_index = get_index_flow(field=field, tmp_path=tmp_path, shards=shards, indexers=indexers)
    flow_query = get_query_flow(field=field, tmp_path=tmp_path, shards=shards)
    flow_delete = get_delete_flow(field=field, tmp_path=tmp_path, shards=shards, indexers=indexers)

    def validate_result_factory(num_matches):
        def validate_results(resp):
            assert len(resp.docs) == DOCS_TO_SEARCH
            for d in resp.docs:
                matches = list(d.matches)
                # the expected number of matches depends on the cache settings:
                # with content-hash caching and identical content it is lower
                if num_matches != 0:
                    if field == 'content_hash' and same_content:
                        if chunks:
                            assert len(matches) == 2
                        else:
                            assert len(matches) == 1
                    else:
                        assert len(matches) == num_matches

        return validate_results

    docs = list(get_documents(chunks=chunks, same_content=same_content, nr=DOCS_TO_INDEX))
    # use a high index_start so the search docs have no matches in the KV indexer
    search_docs = list(get_documents(chunks=0, same_content=False, nr=DOCS_TO_SEARCH, index_start=9999))

    # INDEX
    with flow_index as f:
        f.index(docs, request_size=REQUEST_SIZE)
    check_indexers_size(chunks, len(docs), field, tmp_path, same_content, shards, 'index')

    # INDEX (with new documents)
    chunks_ids = np.concatenate([d.chunks for d in docs])
    index_start_new_docs = 1 + len(docs) + len(chunks_ids)

    new_docs = list(get_documents(chunks=chunks, same_content=same_content, index_start=index_start_new_docs))
    with flow_index as f:
        f.index(new_docs, request_size=REQUEST_SIZE)
    check_indexers_size(chunks, len(docs), field, tmp_path, same_content, shards, 'index2')

    # QUERY
    mock = mocker.Mock()
    with flow_query as f:
        f.search(search_docs, on_done=mock)
    mock.assert_called_once()
    validate_callback(mock, validate_result_factory(TOP_K))

    # UPDATE
    docs.extend(new_docs)
    del new_docs

    # the ids stay the same, only the content changes
    for d in docs:
        d_content_hash_before = d.content_hash
        d.content = f'this is some new content for doc {d.id}'
        d.update_content_hash()
        assert d.content_hash != d_content_hash_before
        for chunk in d.chunks:
            c_content_hash_before = chunk.content_hash
            chunk.content = f'this is some new content for chunk {chunk.id}'
            chunk.update_content_hash()
            assert chunk.content_hash != c_content_hash_before

    with flow_index as f:
        f.update(docs)
    check_indexers_size(chunks, len(docs) / 2, field, tmp_path, same_content, shards, 'index2')

    # QUERY
    mock = mocker.Mock()
    with flow_query as f:
        f.search(search_docs, on_done=mock)
    mock.assert_called_once()
    validate_callback(mock, validate_result_factory(TOP_K))

    # DELETE
    delete_ids = []
    for d in docs:
        delete_ids.append(d.id)
        for c in d.chunks:
            delete_ids.append(c.id)

    with flow_delete as f:
        f.delete(delete_ids)
    check_indexers_size(chunks, 0, field, tmp_path, same_content, shards, 'delete')

    # QUERY
    mock = mocker.Mock()
    with flow_query as f:
        f.search(search_docs, on_done=mock)
    mock.assert_called_once()
    validate_callback(mock, validate_result_factory(0))
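# `validate_callback` also lives outside this excerpt. A minimal sketch of what
# the test above assumes it does: replay every call recorded by the
# `mocker.Mock` that was passed as `on_done` through the validation function,
# so assertion errors raised inside `validate_results` fail the test rather
# than being swallowed by the flow's callback machinery.
def validate_callback(mock, validate_func):
    # each recorded call holds one response object from the flow
    for args, kwargs in mock.call_args_list:
        validate_func(*args, **kwargs)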