def _iter_doc(self, content) -> Generator['Document', None, None]:
    from docarray import Document

    self._return_plain = True

    for c in content:
        if isinstance(c, str):
            self._return_plain = True
            _mime = mimetypes.guess_type(c)[0]
            if _mime and _mime.startswith('image'):
                yield Document(uri=c).load_uri_to_blob()
            else:
                yield Document(text=c)
        elif isinstance(c, Document):
            if c.content_type in ('text', 'blob'):
                self._return_plain = False
                yield c
            elif not c.blob and c.uri:
                c.load_uri_to_blob()
                self._return_plain = False
                yield c
            else:
                raise TypeError(f'unsupported input type {c!r} {c.content_type}')
        else:
            raise TypeError(f'unsupported input type {c!r}')

def test_conditions_filtering(tmpdir, flow):
    with flow:
        ret = flow.post(
            on='index',
            inputs=DocumentArray(
                [
                    Document(text='type1', tags={'type': 1}),
                    Document(text='type2', tags={'type': 2}),
                ]
            ),
        )
        assert len(ret) == 2
        types_set = set()
        for doc in ret:
            if doc.tags['type'] == 1:
                assert doc.text == 'type1 processed by exec1'
            else:
                assert doc.tags['type'] == 2
                assert doc.text == 'type2 processed by exec2'
            types_set.add(doc.tags['type'])
        assert types_set == {1, 2}

    with open(os.path.join(str(tmpdir), 'exec1', '0', 'exec1.txt'), 'r') as fp:
        assert fp.read() == 'type1'
    with open(os.path.join(str(tmpdir), 'exec2', '0', 'exec2.txt'), 'r') as fp:
        assert fp.read() == 'type2'

def random_docs(num_docs):
    for j in range(1, num_docs + 1):
        doc = Document()
        doc.text = f'i\'m dummy doc {j}'
        doc.offset = 1000
        doc.tags['id'] = 1000  # this will be ignored
        yield doc

def _new_doc_from_data(
    data, data_type: DataInputType, **kwargs
) -> Tuple['Document', 'DataInputType']:
    def _build_doc_from_content():
        return Document(content=data, **kwargs), DataInputType.CONTENT

    if data_type == DataInputType.DICT:
        doc = Document.from_dict(data)
        return doc, DataInputType.DICT
    if data_type == DataInputType.AUTO or data_type == DataInputType.DOCUMENT:
        if isinstance(data, Document):
            # if incoming is already primitive type Document, then all good, best practice!
            return data, DataInputType.DOCUMENT
        elif isinstance(data, dict):
            return Document.from_dict(data), DataInputType.DICT
        try:
            d = Document(data, **kwargs)
            return d, DataInputType.DOCUMENT
        except ValueError:
            # AUTO has a fallback, now reconsider it as content
            if data_type == DataInputType.AUTO:
                return _build_doc_from_content()
            else:
                raise
    elif data_type == DataInputType.CONTENT:
        return _build_doc_from_content()

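# --- Hedged usage sketch (not part of the original module) ---------------------
# Illustrates the two branches that follow directly from the helper above.
# Assumption: `Document` comes from docarray and `DataInputType` from jina.enums;
# the helper itself is in scope because it is defined right above.
from docarray import Document
from jina.enums import DataInputType

d = Document(text='hello')
same_doc, dtype = _new_doc_from_data(d, DataInputType.DOCUMENT)
assert same_doc is d and dtype == DataInputType.DOCUMENT  # passed through untouched

dict_doc, dtype = _new_doc_from_data({'text': 'hello'}, DataInputType.DICT)
assert dict_doc.text == 'hello' and dtype == DataInputType.DICT  # built via from_dict
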
def segment(self, docs: DocumentArray, **kwargs):
    for doc in docs:
        text = doc.tags['caption']
        uri = f'{os.environ["HW_WORKDIR"]}/people-img/{doc.tags["image"]}'
        chunk_text = Document(text=text, mime_type='text/plain')
        chunk_uri = Document(uri=uri, mime_type='image/jpeg')
        doc.chunks = DocumentArray([chunk_text, chunk_uri])
        doc.uri = uri
        doc.convert_uri_to_datauri()

def test_data_type_builder_doc(builder, input_data_type, output_data_type):
    a = Document()
    a.id = 'a236cbb0eda62d58'
    a.text = 'text test'
    d, t = _new_doc_from_data(builder(a), input_data_type)
    if input_data_type != DataInputType.CONTENT:
        assert d.id == a.id
    assert d.text == a.text
    assert t == output_data_type

def test_set_workspace(tmpdir):
    complete_workspace = os.path.abspath(os.path.join(tmpdir, 'WorkspaceExec', '0'))
    with Flow().add(uses=WorkspaceExec, workspace=str(tmpdir)) as f:
        resp = f.post(on='/foo', inputs=Document())
        assert resp[0].text == complete_workspace
    with Flow().add(uses=WorkspaceExec, uses_metas={'workspace': str(tmpdir)}) as f:
        resp = f.post(on='/foo', inputs=Document())
        assert resp[0].text == complete_workspace

def test_flow_default_polling_endpoints(polling):
    f = Flow().add(uses=DynamicPollingExecutorDefaultNames, shards=2, polling=polling)

    with f:
        docs_index = f.post(on='/index', inputs=[Document(text='1')])
        docs_search = f.post(on='/search', inputs=[Document(text='1')])
        docs_custom = f.post(on='/custom', inputs=[Document(text='1')])
        assert len(docs_index) == 2
        assert len(docs_search) == 3
        assert len(docs_custom) == (3 if polling == 'all' else 2)

def test_reducer_executor(n_shards, n_matches, n_chunks):
    reducer_executor = ReducerExecutor()
    query = DocumentArray([Document() for _ in range(5)])
    docs_matrix = [deepcopy(query) for _ in range(n_shards)]
    for da in docs_matrix:
        for doc in da:
            doc.matches.extend([Document() for _ in range(n_matches)])
            doc.chunks.extend([Document() for _ in range(n_chunks)])

    reduced_da = reducer_executor.reduce(docs_matrix=docs_matrix)
    for doc in reduced_da:
        assert len(doc.matches) == n_shards * n_matches
        assert len(doc.chunks) == n_shards * n_chunks

def documents(start_index, end_index):
    for i in range(start_index, end_index):
        doc = Document()
        doc.text = 'this is text'
        doc.tags['id'] = 'id in tags'
        doc.tags['inner_dict'] = {'id': 'id in inner_dict'}
        chunk = Document()
        chunk.text = 'text in chunk'
        chunk.tags['id'] = 'id in chunk tags'
        doc.chunks.append(chunk)
        yield doc

def test_flow_default_custom_polling_endpoints(polling):
    custom_polling_config = {'/custom': 'ALL', '/search': 'ANY', '*': polling}
    f = Flow().add(
        uses=DynamicPollingExecutorDefaultNames,
        shards=2,
        polling=custom_polling_config,
    )

    with f:
        docs_index = f.post(on='/index', inputs=[Document(text='1')])
        docs_search = f.post(on='/search', inputs=[Document(text='1')])
        docs_custom = f.post(on='/custom', inputs=[Document(text='1')])
        assert len(docs_index) == 2
        assert len(docs_search) == 2
        assert len(docs_custom) == 3

def get_all_docs(client, doc_class='Document', attribute_container='serialized_doc'):
    s_docs = client.query.get(doc_class, [attribute_container]).do()
    sdocs = [s[attribute_container] for s in s_docs['data']['Get'][doc_class]]
    return [Document.from_base64(sdoc) for sdoc in sdocs]

def test_volumes_in_flow(
    tmpdir, source, destination, workspace, filewriter_exec_docker_image_built
):
    with mock.patch.dict(
        os.environ,
        {'JINA_DEFAULT_WORKSPACE_BASE': str(os.path.join(tmpdir, 'default'))},
    ):
        if source:  # test manually set volume and workspace
            source = os.path.join(tmpdir, source)
            volumes = [str(source) + ':' + destination]
        else:  # test auto volume and workspace
            volumes = None
            source = os.path.join(tmpdir, 'default')

        f = Flow().add(
            uses='docker://filewriter-exec', volumes=volumes, workspace=workspace
        )
        with f:
            f.post(inputs=[Document()], on='/foo')
        assert os.path.exists(source)

        found_output_file = False  # workspace has random element, so we search for it
        for cur_path, dirs, files in os.walk(source):
            if 'out.txt' in files:
                with open(os.path.join(cur_path, 'out.txt'), 'r') as fp:
                    if fp.read() == 'Filewriter was here':
                        found_output_file = True
        assert found_output_file

def index_generator(num_docs: int, target: dict):
    """
    Generate the index data.

    :param num_docs: Number of documents to be indexed.
    :param target: Dictionary which stores the data paths
    :yields: index data
    """
    for internal_doc_id in range(num_docs):
        # x_blackwhite.shape is (28,28)
        x_blackwhite = 255 - target['index']['data'][internal_doc_id]
        # x_color.shape is (28,28,3)
        x_color = np.stack((x_blackwhite,) * 3, axis=-1)
        d = Document(content=x_color)
        d.tags['id'] = internal_doc_id
        yield d

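# --- Hedged usage sketch (not part of the original module) ---------------------
# Feeds the generator a tiny stand-in for the MNIST-style `target` dict it
# expects (uint8 arrays under target['index']['data']); the shapes and keys
# used here are assumptions made only for illustration.
import numpy as np

_fake_target = {'index': {'data': np.zeros((2, 28, 28), dtype=np.uint8)}}
_docs = list(index_generator(num_docs=2, target=_fake_target))
assert len(_docs) == 2
assert _docs[0].content.shape == (28, 28, 3)  # grayscale stacked into 3 channels
assert _docs[0].tags['id'] == 0
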
def test_complex_flow(disable_reduce):
    f = (
        Flow()
        .add(name='first', uses=SimpleAddExecutor, needs=['gateway'])
        .add(name='forth', uses=SimpleAddExecutor, needs=['first'], shards=2)
        .add(
            name='second_shards_needs',
            uses=SimpleAddExecutor,
            needs=['gateway'],
            shards=2,
        )
        .add(
            name='third',
            uses=SimpleAddExecutor,
            shards=3,
            needs=['second_shards_needs'],
        )
        .add(
            name='merger',
            uses=MergeDocsExecutor,
            needs=['forth', 'third'],
            disable_reduce=disable_reduce,
        )
    )
    with f:
        docs = f.post(on='/index', inputs=[Document(text='1')])
        assert len(docs) == (6 if disable_reduce else 5)

def test_executor_load_from_hub():
    executor = Executor.from_hub(
        'jinahub://DummyHubExecutor', uses_metas={'name': 'hello123'}
    )
    da = DocumentArray([Document()])
    executor.foo(da)
    assert da.texts == ['hello']
    assert executor.metas.name == 'hello123'

def test_shards():
    f = Flow().add(uses=SimpleAddExecutor, shards=2)
    with f:
        docs = f.post(on='/index', inputs=[Document(text='1')], return_results=True)
        assert len(docs) == 2

def test_grpc_compression(compression_client, compression_gateway):
    with Flow(grpc_compression=compression_gateway).add().add() as f:
        ret = f.post(
            on='/',
            inputs=DocumentArray([Document()]),
            grpc_compression=compression_client,
        )
    assert len(ret) == 1

def test_blob_transmission(decode, protocol):
    f = Flow(protocol=protocol).add(uses=MyExec)
    with f:
        c = Client(port=f.port, protocol=protocol)
        d = c.post('/', Document(blob=b'hello'), parameters={'decode': decode})[0]
    if decode:
        # test that the Executor gets the correct data
        assert d.text == 'hello'
    else:
        # test that the response contains the correct data
        assert d.blob == b'hello'

def test_status():
    r = DataRequest()
    r.docs.extend([Document()])
    r.add_exception(ValueError('intentional_error'))
    byte_array = DataRequestProto.SerializeToString(r)

    deserialized_request = DataRequestProto.FromString(byte_array)
    assert not deserialized_request.is_decompressed
    assert deserialized_request.status.code == jina_pb2.StatusProto.ERROR
    assert deserialized_request.is_decompressed

def test_conditions_filtering_on_joiner(tmpdir):
    flow = (
        Flow()
        .add(name='first')
        .add(
            uses=ConditionDumpExecutor,
            uses_metas={'name': 'joiner_test_exec1'},
            workspace=str(tmpdir),
            name='joiner_test_exec1',
            needs=['first'],
        )
        .add(
            uses=ConditionDumpExecutor,
            workspace=str(tmpdir),
            uses_metas={'name': 'joiner_test_exec2'},
            name='joiner_test_exec2',
            needs='first',
        )
        .needs_all('joiner', when={'tags__type': {'$eq': 3}})
    )
    with flow:
        ret = flow.post(
            on='index',
            inputs=DocumentArray(
                [
                    Document(text='type1', tags={'type': 1}),
                    Document(text='type2', tags={'type': 2}),
                ]
            ),
        )
        assert len(ret) == 0

    with open(
        os.path.join(str(tmpdir), 'joiner_test_exec1', '0', 'joiner_test_exec1.txt'),
        'r',
    ) as fp:
        assert fp.read() == 'type1type2'
    with open(
        os.path.join(str(tmpdir), 'joiner_test_exec2', '0', 'joiner_test_exec2.txt'),
        'r',
    ) as fp:
        assert fp.read() == 'type1type2'

def get_doc_by_id(client, doc_id):
    result_dict = client.data_object.get_by_id(doc_id)
    if result_dict is None:
        return None
    serialized_doc = result_dict['properties']['serialized_doc']
    return Document.from_base64(serialized_doc)

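# --- Hedged companion sketch (not from the original source) --------------------
# Shows how a Document could plausibly be stored so that `get_doc_by_id` and
# `get_all_docs` above can read it back. Assumes a weaviate-client v3 style
# `client` and an existing 'Document' class whose schema has a text property
# called `serialized_doc`; the returned UUID would serve as `doc_id`.
def store_doc(client, doc):
    return client.data_object.create(
        {'serialized_doc': doc.to_base64()},  # round-trips via Document.from_base64
        'Document',
    )
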
def test_grpc_ssl_with_flow(cert_pem, key_pem, error_log_level):
    with Flow(
        protocol='grpc',
        ssl_certfile=cert_pem,
        ssl_keyfile=key_pem,
    ) as f:
        with pytest.raises(grpc.aio._call.AioRpcError):
            Client(protocol='grpc', port=f.port, tls=True).index([Document()])
    # the openssl error from above seems to take a bit to actually terminate and
    # may cause the next test to seg fault
    time.sleep(1.0)

def test_uvicorn_ssl_with_flow(cert_pem, key_pem, protocol, capsys, error_log_level):
    with Flow(
        protocol=protocol,
        uvicorn_kwargs=['ssl_keyfile_password: abcd'],
        ssl_certfile=cert_pem,
        ssl_keyfile=key_pem,
    ) as f:
        with pytest.raises(aiohttp.ClientConnectorCertificateError):
            Client(protocol=protocol, port=f.port, tls=True).index([Document()])

def test_expected_messages_routing(disable_reduce):
    f = (
        Flow()
        .add(name='foo', uses=SimplExecutor)
        .add(
            name='bar',
            uses=MergeExecutor,
            needs=['foo', 'gateway'],
            disable_reduce=disable_reduce,
        )
    )

    with f:
        docs = f.post(on='/index', inputs=[Document(text='1')])
        # the merge executor actually does not merge despite its name
        assert len(docs) == (2 if disable_reduce else 1)
        assert docs[0].text == ('merged' if disable_reduce else '1')

async def test_aync_data_request_handler_new_docs(logger):
    args = set_pod_parser().parse_args(['--uses', 'AsyncNewDocsExecutor'])
    handler = DataRequestHandler(args, logger)
    req = list(
        request_generator(
            '/', DocumentArray([Document(text='input document') for _ in range(10)])
        )
    )[0]
    assert len(req.docs) == 10

    response = await handler.handle(requests=[req])
    assert len(response.docs) == 1
    assert response.docs[0].text == 'new document'

def test_expected_messages_routing():
    f = (
        Flow()
        .add(name='foo', uses=SimplExecutor)
        .add(name='bar', uses=MergeExecutor, needs=['foo', 'gateway'])
    )

    with f:
        docs = f.post(on='/index', inputs=[Document(text='1')], return_results=True)
        # the merge executor actually does not merge despite its name
        assert len(docs) == 2
        assert docs[0].text == 'merged'

def test_chained_conditions(tmpdir, temp_workspace):
    f = (
        Flow()
        .add(name='first')
        .add(
            uses=ConditionDumpExecutor,
            uses_metas={'name': 'exec1'},
            workspace=os.environ['TEMP_WORKSPACE'],
            name='exec1',
            needs=['first'],
            when={'tags__type': {'$gte': 2}},
        )
        .add(
            uses=ConditionDumpExecutor,
            workspace=os.environ['TEMP_WORKSPACE'],
            uses_metas={'name': 'exec2'},
            name='exec2',
            needs='exec1',
            when={'tags__type': {'$lte': 1}},
        )
        .needs_all('joiner')
    )
    with f:
        ret = f.post(
            on='index',
            inputs=DocumentArray(
                [
                    Document(text='type1', tags={'type': 1}),
                    Document(text='type2', tags={'type': 2}),
                    Document(text='type2', tags={'type': 3}),
                ]
            ),
        )
        assert len(ret) == 0

def rank(
    self, docs_matrix: List['DocumentArray'], parameters: Dict, **kwargs
) -> 'DocumentArray':
    """
    :param docs_matrix: list of :class:`DocumentArray` on multiple requests to
        get bubbled up matches.
    :param parameters: the parameters passed into the ranker, in this case stores
        :attr:`top_k` to filter k results based on score.
    :param kwargs: not used (kept to maintain interface)
    """
    result_da = DocumentArray()  # length: 1 as every time there is only one query
    for d_mod1, d_mod2 in zip(*docs_matrix):
        final_matches = {}  # type: Dict[str, Document]

        for m in d_mod1.matches:
            relevance_score = m.scores['cosine'].value * d_mod1.weight
            m.scores['relevance'].value = relevance_score
            final_matches[m.parent_id] = Document(m, copy=True)

        for m in d_mod2.matches:
            if m.parent_id in final_matches:
                final_matches[m.parent_id].scores['relevance'].value += (
                    m.scores['cosine'].value * d_mod2.weight
                )
            else:
                m.scores['relevance'].value = m.scores['cosine'].value * d_mod2.weight
                final_matches[m.parent_id] = Document(m, copy=True)

        da = DocumentArray(list(final_matches.values()))
        da = sorted(da, key=lambda ma: ma.scores['relevance'].value, reverse=True)
        d = Document(matches=da[: int(parameters['top_k'])])
        result_da.append(d)
    return result_da

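# --- Hedged usage sketch (not part of the original source) ---------------------
# Builds a minimal two-modality docs_matrix and exercises the weighted score
# fusion above. All names and numbers here are illustrative; since `rank`
# never reads `self`, it is invoked with `None` purely for demonstration.
from docarray import Document, DocumentArray


def _query(weight, scored_parents):
    q = Document()
    q.weight = weight
    for parent_id, cosine in scored_parents:
        m = Document()
        m.parent_id = parent_id
        m.scores['cosine'].value = cosine
        q.matches.append(m)
    return q


docs_matrix = [
    DocumentArray([_query(0.5, [('a', 0.9), ('b', 0.1)])]),
    DocumentArray([_query(0.5, [('a', 0.4)])]),
]
result = rank(None, docs_matrix, parameters={'top_k': 1})
best = result[0].matches[0]
assert best.parent_id == 'a'
assert abs(best.scores['relevance'].value - 0.65) < 1e-6  # 0.9*0.5 + 0.4*0.5
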
def test_lazy_serialization():
    doc_count = 1000
    r = DataRequest()
    da = r.docs
    da.extend([Document(text='534534534er5yr5y645745675675675345')] * doc_count)
    r.data.docs = da
    byte_array = DataRequestProto.SerializeToString(r)

    deserialized_request = DataRequestProto.FromString(byte_array)
    assert not deserialized_request.is_decompressed
    assert len(deserialized_request.docs) == doc_count
    assert deserialized_request.docs == r.docs
    assert deserialized_request.is_decompressed