def test_multimodal_driver_with_shuffled_order(
    simple_multimodal_driver, mock_multimodal_encoder_shuffled, doc_with_multimodal_chunks
):
    """The doc embedding concatenates chunk embeddings in modality order, not storage order."""
    # NOTE(review): sibling tests pass ``runtime=None`` to attach(); ``pea=None`` may
    # stem from an older driver API — confirm against the attach() signature in use.
    simple_multimodal_driver.attach(executor=mock_multimodal_encoder_shuffled, pea=None)
    simple_multimodal_driver._apply_all(DocumentSet([doc_with_multimodal_chunks]))
    doc = doc_with_multimodal_chunks
    assert len(doc.chunks) == 3
    # chunks are stored shuffled: [visual2, textual, visual1]
    first_visual = doc.chunks[2]
    second_visual = doc.chunks[0]
    text_chunk = doc.chunks[1]
    expected = np.concatenate(
        [second_visual.embedding, text_chunk.embedding, first_visual.embedding]
    )
    np.testing.assert_array_equal(expected, doc.embedding)
def test_multimodal_driver(
    simple_multimodal_driver, mock_multimodal_encoder, doc_with_multimodal_chunks
):
    """The doc embedding length equals the sum of its three chunk embedding lengths."""
    simple_multimodal_driver.attach(executor=mock_multimodal_encoder, runtime=None)
    simple_multimodal_driver._apply_all(DocumentSet([doc_with_multimodal_chunks]))
    doc = doc_with_multimodal_chunks
    assert len(doc.chunks) == 3
    # sum over all three chunks (visual1, visual2, textual)
    total_chunk_dim = sum(chunk.embedding.shape[0] for chunk in doc.chunks)
    assert doc.embedding.shape[0] == total_chunk_dim
def test_cache_driver_from_file(tmpdir, test_metas):
    """Load a pre-seeded DocCache from disk and verify duplicate detection + persistence.

    Fix: the two ``pickle.dump`` calls previously used bare ``open(...)`` expressions
    whose file handles were never closed; they are now wrapped in ``with`` blocks so
    the seed files are flushed and closed before the executor reads them.
    """
    filename = 'cache'
    test_metas['name'] = filename
    folder = os.path.join(test_metas['workspace'], 'cache-0')
    os.makedirs(folder)
    bin_full_path = os.path.join(folder, filename)
    docs = DocumentSet(list(random_docs(10, embedding=False)))
    # pre-seed the id->hash and hash->id maps the executor loads on startup
    with open(f'{bin_full_path}.bin.ids', 'wb') as fp:
        pickle.dump(
            {doc.id: BaseCacheDriver.hash_doc(doc, ['content_hash']) for doc in docs},
            fp,
        )
    with open(f'{bin_full_path}.bin.cache', 'wb') as fp:
        pickle.dump(
            {BaseCacheDriver.hash_doc(doc, ['content_hash']): doc.id for doc in docs},
            fp,
        )
    driver = MockCacheDriver()
    with DocCache(metas=test_metas, fields=(CONTENT_HASH_KEY,)) as executor:
        assert not executor.handler_mutex
        driver.attach(executor=executor, runtime=None)
        with pytest.raises(NotImplementedError):
            # duplicate docs
            driver._apply_all(docs)
        # new docs
        docs = DocumentSet(list(random_docs(10, start_id=100)))
        driver._apply_all(docs)
    # check persistence
    assert os.path.exists(executor.save_abspath)
def test_vectorsearch_driver_mock_indexer_with_matches_on_chunks(document_with_matches_on_chunks):
    """Traversal path 'cm' fills embeddings on the matches of the document's chunks."""
    driver = SimpleKVSearchDriver(traversal_paths=('cm',))
    driver.attach(executor=MockIndexer(), runtime=None)
    driver._traverse_apply(DocumentSet([document_with_matches_on_chunks]))
    chunks = list(document_with_matches_on_chunks.chunks)
    assert len(chunks) == 1
    matches = list(chunks[0].matches)
    assert len(matches) == 3
    for match in matches:
        # the mock indexer stores np.array([id]) as each match embedding
        embedding = NdArray(match.embedding).value
        assert embedding is not None
        np.testing.assert_equal(embedding, np.array([match.id]))
def test_extract_bad_fields_no_strict_args(mocker):
    """With strict_method_args disabled, unknown encode() arg names are silently skipped."""
    encode_mock = mocker.Mock()

    class MyExecutor(BaseEncoder):
        def encode(self, hello):
            encode_mock()

    encoder = MyExecutor()
    driver = EncodeDriver(strict_method_args=False)
    driver.attach(encoder, runtime=None)
    ds = DocumentSet(list(random_docs(10)))
    driver._apply_all(ds)
    # the invalid argument name means encode() is never invoked
    encode_mock.assert_not_called()
def test_extract_bad_fields(mocker):
    """Strict EncodeDriver rejects encode() signatures with invalid argument names.

    Three failure modes are exercised: a deprecated legacy name, a name that is
    not a Document attribute at all, and a CamelCase variant of a real attribute.
    """
    encode_mock = mocker.Mock()

    class MyExecutor(BaseEncoder):
        def encode(self, data):
            encode_mock()

    driver = EncodeDriver()
    driver.attach(MyExecutor(), runtime=None)
    ds = DocumentSet(list(random_docs(10)))
    # 'data' is the deprecated legacy argument name
    with pytest.raises(
        AttributeError, match='is now deprecated and not a valid argument'
    ):
        driver._apply_all(ds)
    encode_mock.assert_not_called()

    class MyExecutor(BaseEncoder):
        def encode(self, hello):
            encode_mock()

    driver = EncodeDriver()
    driver.attach(MyExecutor(), runtime=None)
    # 'hello' is not a Document attribute at all
    with pytest.raises(AttributeError, match='are invalid Document attributes'):
        driver._apply_all(ds)
    encode_mock.assert_not_called()

    class MyExecutor(BaseEncoder):
        def encode(self, mimeType):
            encode_mock()

    driver = EncodeDriver()
    driver.attach(MyExecutor(), runtime=None)
    # the attribute exists only in snake_case form; the error hints at CamelCase
    with pytest.raises(AttributeError, match='you give them in CamelCase'):
        driver._apply_all(ds)
    encode_mock.assert_not_called()
def test_cache_driver_twice(tmp_path):
    """A second pass over the same docs raises on duplicates; the cache file persists."""
    # NOTE(review): another ``test_cache_driver_twice`` appears later in this file;
    # if both live at the same module scope, pytest collects only the last
    # definition — confirm these come from different test modules.
    filename = tmp_path / 'test-tmp.bin'
    docs = DocumentSet(list(random_docs(10)))
    driver = MockCacheDriver()
    with DocIDCache(filename) as executor:
        assert not executor.handler_mutex
        # NOTE(review): sibling tests use ``runtime=None`` here — confirm the
        # ``pea`` kwarg matches the attach() signature of this API version.
        driver.attach(executor=executor, pea=None)
        driver._traverse_apply(docs)
        with pytest.raises(NotImplementedError):
            # the very same docs again -> duplicates are detected
            driver._traverse_apply(docs)
        # a fresh batch passes through
        docs = list(random_docs(10))
        driver._traverse_apply(docs)
    # check persistence
    assert Path(filename).exists()
def test_exec_fn_arbitrary_name(mocker):
    """EncodeDriver(method=...) routes field extraction to a custom executor method."""
    encode_mock = mocker.Mock()

    class MyExecutor(BaseEncoder):
        # param must be named 'id': the driver extracts Document attributes by arg name
        def foo(self, id):
            assert isinstance(id, list)
            assert isinstance(id[0], str)
            encode_mock()

    driver = EncodeDriver(method='foo')
    driver.attach(MyExecutor(), runtime=None)
    ds = DocumentSet(list(random_docs(10)))
    driver._apply_all(ds)
    encode_mock.assert_called()
def test_cache_driver_twice(tmpdir, test_metas):
    """Duplicate docs raise on the second traversal; the cache persists on close."""
    docs = DocumentSet(list(random_docs(10)))
    driver = MockCacheDriver()
    # FIXME DocIdCache doesn't use tmpdir, it saves in curdir
    with DocIDCache(tmpdir, metas=test_metas) as executor:
        assert not executor.handler_mutex
        driver.attach(executor=executor, runtime=None)
        driver._traverse_apply(docs)
        with pytest.raises(NotImplementedError):
            # duplicate docs
            driver._traverse_apply(docs)
        # new docs with non-overlapping ids
        docs = list(random_docs(10, start_id=100))
        driver._traverse_apply(docs)
        saved_path = executor.save_abspath
    # check persistence
    assert os.path.exists(saved_path)
def test_extract_multi_fields(mocker):
    """encode(id, embedding) receives both attributes as parallel lists."""
    encode_mock = mocker.Mock()

    class MyExecutor(BaseEncoder):
        # param names must match Document attributes: 'id' and 'embedding'
        def encode(self, id, embedding):
            encode_mock()
            assert isinstance(id, list)
            assert isinstance(embedding, list)
            assert isinstance(id[0], str)
            assert isinstance(embedding[0], np.ndarray)

    driver = EncodeDriver()
    driver.attach(MyExecutor(), runtime=None)
    ds = DocumentSet(list(random_docs(10)))
    driver._apply_all(ds)
    encode_mock.assert_called()
def test_exec_fn_return_doc(mocker):
    """Documents returned by encode() replace the originals in the set."""
    encode_mock = mocker.Mock()

    class MyExecutor(BaseEncoder):
        def encode(self, id):
            encode_mock()
            return [Document(mime_type='image/png')] * len(id)

    driver = EncodeDriver()
    driver.attach(MyExecutor(), runtime=None)
    ds = DocumentSet(list(random_docs(10)))
    driver._apply_all(ds)
    encode_mock.assert_called()
    # every doc in the set now carries the mime type set by the executor
    assert all(d.mime_type == 'image/png' for d in ds)
def test_vectorsearch_driver_mock_indexer_apply_all(document):
    """_apply_all fills chunk embeddings and drops chunks missing from the index."""
    driver = SimpleKVSearchDriver()
    driver.attach(executor=MockIndexer(), runtime=None)
    chunks = list(document.chunks)
    assert len(chunks) == 5
    assert all(chunk.embedding is None for chunk in chunks)
    driver._apply_all([DocumentSet(document.chunks)])
    chunks = list(document.chunks)
    # chunk idx 5 had no match in the index and is removed as a missing id
    assert len(chunks) == 4
    for chunk in chunks:
        assert chunk.embedding is not None
        np.testing.assert_equal(chunk.embedding, np.array([chunk.id]))
def random_docs_with_chunks(num_docs):
    """Build ``num_docs`` protobuf documents, each with a 10x10 nested chunk tree.

    Every root doc (granularity 0) gets 10 chunks (granularity 1), and every
    chunk gets 10 sub-chunks (granularity 2).
    """
    docs = []
    for doc_idx in range(num_docs):
        doc = jina_pb2.DocumentProto()
        doc.granularity = 0
        doc.tags['id'] = doc_idx
        doc.text = 'hello world'
        doc.uri = 'doc://'
        for chunk_idx in range(10):
            chunk = doc.chunks.add()
            chunk.text = 'chunk to hello world'
            chunk.granularity = 1
            chunk.uri = 'doc://chunk'
            chunk.tags['id'] = chunk_idx
            for sub_idx in range(10):
                sub_chunk = chunk.chunks.add()
                sub_chunk.text = 'nested chunk to chunk'
                sub_chunk.uri = 'doc://chunk/chunk'
                sub_chunk.tags['id'] = sub_idx
                sub_chunk.granularity = 2
        docs.append(doc)
    return DocumentSet(docs)
def build_docs():
    """Build a chunk-match structure, two levels deep in both directions.

    Each node recursively receives DOCUMENTS_PER_LEVEL chunks while its
    granularity is below the cap, and DOCUMENTS_PER_LEVEL matches while its
    adjacency is below the cap.
    """
    max_granularity = 2
    max_adjacency = 2

    def _expand(node, granularity, adjacency):
        # grow downward via chunks
        if granularity < max_granularity:
            for _ in range(DOCUMENTS_PER_LEVEL):
                chunk = add_chunk(node)
                _expand(chunk, chunk.granularity, chunk.adjacency)
        # grow sideways via matches
        if adjacency < max_adjacency:
            for _ in range(DOCUMENTS_PER_LEVEL):
                match = add_match(node)
                _expand(match, match.granularity, match.adjacency)

    roots = []
    for _ in range(DOCUMENTS_PER_LEVEL):
        root = Document()
        root.granularity = 0
        root.adjacency = 0
        roots.append(root)
        _expand(root, 0, 0)
    return DocumentSet(roots)
def docs_to_encode(num_docs):
    """Return a DocumentSet of ``num_docs`` docs whose content is ``np.array([idx])``, idx = 1..num_docs."""
    return DocumentSet(
        [Document(content=np.array([idx])) for idx in range(1, num_docs + 1)]
    )
def docs(self):
    # a fresh set of 10 random documents
    random_batch = list(random_docs(10))
    return DocumentSet(random_batch)
def test_invalid_document(craft_driver):
    """Crafting a doc that lacks the expected field surfaces a clear AttributeError."""
    docs = DocumentSet([Document(content='invalid')])
    with pytest.raises(AttributeError) as error:
        craft_driver._apply_all(docs)
    assert str(error.value) == '\'non_existing_key\' is not recognized'