Example #1
    def _iter_doc(self, content) -> Generator['Document', None, None]:
        from docarray import Document

        self._return_plain = True

        for c in content:
            if isinstance(c, str):
                self._return_plain = True
                _mime = mimetypes.guess_type(c)[0]
                if _mime and _mime.startswith('image'):
                    yield Document(uri=c).load_uri_to_blob()
                else:
                    yield Document(text=c)
            elif isinstance(c, Document):
                if c.content_type in ('text', 'blob'):
                    self._return_plain = False
                    yield c
                elif not c.blob and c.uri:
                    c.load_uri_to_blob()
                    self._return_plain = False
                    yield c
                else:
                    raise TypeError(
                        f'unsupported input type {c!r} {c.content_type}')
            else:
                raise TypeError(f'unsupported input type {c!r}')
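For orientation, a minimal sketch (assuming docarray is installed; the names below are illustrative and not part of the snippet above) of how the string branch classifies input: a path whose extension maps to an image MIME type takes the blob branch, anything else becomes a plain text Document.

import mimetypes

from docarray import Document

# image-looking strings are routed to the blob branch
assert mimetypes.guess_type('photo.jpg')[0] == 'image/jpeg'

# any other string is wrapped as a plain text Document
assert Document(text='hello world').content_type == 'text'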
Example #2
def test_conditions_filtering(tmpdir, flow):
    with flow:
        ret = flow.post(
            on='index',
            inputs=DocumentArray([
                Document(text='type1', tags={'type': 1}),
                Document(text='type2', tags={'type': 2}),
            ]),
        )
        assert len(ret) == 2
        types_set = set()
        for doc in ret:
            if doc.tags['type'] == 1:
                assert doc.text == 'type1 processed by exec1'
            else:
                assert doc.tags['type'] == 2
                assert doc.text == 'type2 processed by exec2'
            types_set.add(doc.tags['type'])

        assert types_set == {1, 2}

    with open(os.path.join(str(tmpdir), 'exec1', '0', 'exec1.txt'),
              'r') as fp:
        assert fp.read() == 'type1'

    with open(os.path.join(str(tmpdir), 'exec2', '0', 'exec2.txt'),
              'r') as fp:
        assert fp.read() == 'type2'
Example #3
    def segment(self, docs: DocumentArray, **kwargs):
        for doc in docs:
            text = doc.tags['caption']
            uri = f'{os.environ["HW_WORKDIR"]}/people-img/{doc.tags["image"]}'
            chunk_text = Document(text=text, mime_type='text/plain')
            chunk_uri = Document(uri=uri, mime_type='image/jpeg')
            doc.chunks = DocumentArray([chunk_text, chunk_uri])
            doc.uri = uri
            doc.convert_uri_to_datauri()
Example #4
def test_set_workspace(tmpdir):
    complete_workspace = os.path.abspath(
        os.path.join(tmpdir, 'WorkspaceExec', '0'))
    with Flow().add(uses=WorkspaceExec, workspace=str(tmpdir)) as f:
        resp = f.post(on='/foo', inputs=Document())
    assert resp[0].text == complete_workspace
    with Flow().add(uses=WorkspaceExec, uses_metas={'workspace':
                                                    str(tmpdir)}) as f:
        resp = f.post(on='/foo', inputs=Document())
    assert resp[0].text == complete_workspace
Example #5
def documents(start_index, end_index):
    for i in range(start_index, end_index):
        doc = Document()
        doc.text = 'this is text'
        doc.tags['id'] = 'id in tags'
        doc.tags['inner_dict'] = {'id': 'id in inner_dict'}
        chunk = Document()
        chunk.text = 'text in chunk'
        chunk.tags['id'] = 'id in chunk tags'
        doc.chunks.append(chunk)
        yield doc
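A hypothetical way to consume such a generator (a small sketch, not part of the original test): DocumentArray accepts any iterable of Documents, so the nested tags and the chunk survive the conversion.

from docarray import DocumentArray

da = DocumentArray(documents(0, 3))
assert len(da) == 3
assert da[0].tags['inner_dict']['id'] == 'id in inner_dict'
assert len(da[0].chunks) == 1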
Example #6
def test_flow_default_polling_endpoints(polling):
    f = Flow().add(uses=DynamicPollingExecutorDefaultNames,
                   shards=2,
                   polling=polling)

    with f:
        docs_index = f.post(on='/index', inputs=[Document(text='1')])
        docs_search = f.post(on='/search', inputs=[Document(text='1')])
        docs_custom = f.post(on='/custom', inputs=[Document(text='1')])
    assert len(docs_index) == 2
    assert len(docs_search) == 3
    assert len(docs_custom) == (3 if polling == 'all' else 2)
Example #7
def test_reducer_executor(n_shards, n_matches, n_chunks):
    reducer_executor = ReducerExecutor()
    query = DocumentArray([Document() for _ in range(5)])
    docs_matrix = [deepcopy(query) for _ in range(n_shards)]
    for da in docs_matrix:
        for doc in da:
            doc.matches.extend([Document() for _ in range(n_matches)])
            doc.chunks.extend([Document() for _ in range(n_chunks)])

    reduced_da = reducer_executor.reduce(docs_matrix=docs_matrix)
    for doc in reduced_da:
        assert len(doc.matches) == n_shards * n_matches
        assert len(doc.chunks) == n_shards * n_chunks
Example #8
def test_tag_update():
    port = random_port()

    f = Flow(port=port, protocol='http').add(uses=TestExecutor)
    d1 = Document(id='1', prop1='val')
    d2 = Document(id='2', prop2='val')
    with f:
        d1 = {'data': [d1.to_dict()]}
        d2 = {'data': [d2.to_dict()]}
        r1 = req.post(f'http://localhost:{port}/index', json=d1)
        r2 = req.post(f'http://localhost:{port}/index', json=d2)
    assert r1.json()['data'][0]['tags'] == {'prop1': 'val'}
    assert r2.json()['data'][0]['tags'] == {'prop2': 'val'}
Example #9
def test_flow_default_custom_polling_endpoints(polling):
    custom_polling_config = {'/custom': 'ALL', '/search': 'ANY', '*': polling}
    f = Flow().add(
        uses=DynamicPollingExecutorDefaultNames,
        shards=2,
        polling=custom_polling_config,
    )

    with f:
        docs_index = f.post(on='/index', inputs=[Document(text='1')])
        docs_search = f.post(on='/search', inputs=[Document(text='1')])
        docs_custom = f.post(on='/custom', inputs=[Document(text='1')])
    assert len(docs_index) == 2
    assert len(docs_search) == 2
    assert len(docs_custom) == 3
Example #10
def test_volumes_in_flow(tmpdir, source, destination, workspace,
                         filewriter_exec_docker_image_built):
    with mock.patch.dict(
            os.environ,
        {'JINA_DEFAULT_WORKSPACE_BASE': str(os.path.join(tmpdir, 'default'))},
    ):
        if source:  # test manually set volume and workspace
            source = os.path.join(tmpdir, source)
            volumes = [str(source) + ':' + destination]
        else:  # test auto volume and workspace
            volumes = None
            source = os.path.join(tmpdir, 'default')

        f = Flow().add(uses='docker://filewriter-exec',
                       volumes=volumes,
                       workspace=workspace)
        with f:
            f.post(inputs=[Document()], on='/foo')

        assert os.path.exists(source)

        found_output_file = False  # workspace has random element, so we search for it
        for cur_path, dirs, files in os.walk(source):
            if 'out.txt' in files:
                with open(os.path.join(cur_path, 'out.txt'), 'r') as f:
                    if f.read() == 'Filewriter was here':
                        found_output_file = True
        assert found_output_file
Example #11
def random_docs(num_docs):
    for j in range(1, num_docs + 1):
        doc = Document()
        doc.text = f"i'm dummy doc {j}"
        doc.offset = 1000
        doc.tags['id'] = 1000  # this will be ignored
        yield doc
Example #12
def test_complex_flow(disable_reduce):
    f = (
        Flow()
        .add(name='first', uses=SimpleAddExecutor, needs=['gateway'])
        .add(name='forth', uses=SimpleAddExecutor, needs=['first'], shards=2)
        .add(
            name='second_shards_needs',
            uses=SimpleAddExecutor,
            needs=['gateway'],
            shards=2,
        )
        .add(
            name='third',
            uses=SimpleAddExecutor,
            shards=3,
            needs=['second_shards_needs'],
        )
        .add(
            name='merger',
            uses=MergeDocsExecutor,
            needs=['forth', 'third'],
            disable_reduce=disable_reduce,
        )
    )

    with f:
        docs = f.post(on='/index', inputs=[Document(text='1')])
    assert len(docs) == (6 if disable_reduce else 5)
Example #13
def test_executor_load_from_hub():
    executor = Executor.from_hub(
        'jinahub://DummyHubExecutor', uses_metas={'name': 'hello123'}
    )
    da = DocumentArray([Document()])
    executor.foo(da)
    assert da.texts == ['hello']
    assert executor.metas.name == 'hello123'
Example #14
def _new_doc_from_data(data, data_type: DataInputType,
                       **kwargs) -> Tuple['Document', 'DataInputType']:
    def _build_doc_from_content():
        return Document(content=data, **kwargs), DataInputType.CONTENT

    if data_type == DataInputType.DICT:
        doc = Document.from_dict(data)
        return doc, DataInputType.DICT
    if data_type == DataInputType.AUTO or data_type == DataInputType.DOCUMENT:
        if isinstance(data, Document):
            # if incoming is already primitive type Document, then all good, best practice!
            return data, DataInputType.DOCUMENT
        elif isinstance(data, dict):
            return Document.from_dict(data), DataInputType.DICT
        try:
            d = Document(data, **kwargs)
            return d, DataInputType.DOCUMENT
        except ValueError:
            # AUTO has a fallback, now reconsider it as content
            if data_type == DataInputType.AUTO:
                return _build_doc_from_content()
            else:
                raise
    elif data_type == DataInputType.CONTENT:
        return _build_doc_from_content()
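A small illustrative call, assuming DataInputType is importable from jina.enums, showing the DICT branch round-tripping a plain dict through Document.from_dict:

from jina.enums import DataInputType

doc, dtype = _new_doc_from_data({'text': 'hello'}, DataInputType.DICT)
assert doc.text == 'hello'
assert dtype == DataInputType.DICT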
Example #15
def test_grpc_compression(compression_client, compression_gateway):
    with Flow(grpc_compression=compression_gateway).add().add() as f:
        ret = f.post(
            on='/',
            inputs=DocumentArray([Document()]),
            grpc_compression=compression_client,
        )
    assert len(ret) == 1
Example #16
def test_shards():
    f = Flow().add(uses=SimpleAddExecutor, shards=2)

    with f:
        docs = f.post(on='/index',
                      inputs=[Document(text='1')],
                      return_results=True)
        assert len(docs) == 2
Example #17
def test_data_type_builder_doc(builder, input_data_type, output_data_type):
    a = Document()
    a.id = 'a236cbb0eda62d58'
    a.text = 'text test'
    d, t = _new_doc_from_data(builder(a), input_data_type)
    if input_data_type != DataInputType.CONTENT:
        assert d.id == a.id
    assert d.text == a.text
    assert t == output_data_type
Example #18
def test_status():
    r = DataRequest()
    r.docs.extend([Document()])
    r.add_exception(ValueError('intentional_error'))
    byte_array = DataRequestProto.SerializeToString(r)

    deserialized_request = DataRequestProto.FromString(byte_array)
    assert not deserialized_request.is_decompressed
    assert deserialized_request.status.code == jina_pb2.StatusProto.ERROR
    assert deserialized_request.is_decompressed
Example #19
def test_conditions_filtering_on_joiner(tmpdir):
    flow = (
        Flow()
        .add(name='first')
        .add(
            uses=ConditionDumpExecutor,
            uses_metas={'name': 'joiner_test_exec1'},
            workspace=str(tmpdir),
            name='joiner_test_exec1',
            needs=['first'],
        )
        .add(
            uses=ConditionDumpExecutor,
            workspace=str(tmpdir),
            uses_metas={'name': 'joiner_test_exec2'},
            name='joiner_test_exec2',
            needs='first',
        )
        .needs_all('joiner', when={'tags__type': {'$eq': 3}})
    )
    with flow:
        ret = flow.post(
            on='index',
            inputs=DocumentArray([
                Document(text='type1', tags={'type': 1}),
                Document(text='type2', tags={'type': 2}),
            ]),
        )
        assert len(ret) == 0

    with open(
        os.path.join(str(tmpdir), 'joiner_test_exec1', '0', 'joiner_test_exec1.txt'),
        'r',
    ) as fp:
        assert fp.read() == 'type1type2'

    with open(
        os.path.join(str(tmpdir), 'joiner_test_exec2', '0', 'joiner_test_exec2.txt'),
        'r',
    ) as fp:
        assert fp.read() == 'type1type2'
Example #20
def test_blob_transmission(decode, protocol):
    f = Flow(protocol=protocol).add(uses=MyExec)
    with f:
        c = Client(port=f.port, protocol=protocol)
        d = c.post('/', Document(blob=b'hello'), parameters={'decode': decode})[0]
    if decode:  # test that the Executor gets the correct data
        assert d.text == 'hello'
    else:  # test that the response contains the correct data
        assert d.blob == b'hello'
Example #21
def test_grpc_ssl_with_flow(cert_pem, key_pem, error_log_level):
    with Flow(
            protocol='grpc',
            ssl_certfile=cert_pem,
            ssl_keyfile=key_pem,
    ) as f:

        with pytest.raises(grpc.aio._call.AioRpcError):
            Client(protocol='grpc', port=f.port, tls=True).index([Document()])
    # the openssl error from above seems to take a bit to actually terminate and may cause the next test to seg fault
    time.sleep(1.0)
Example #22
def test_uvicorn_ssl_with_flow(cert_pem, key_pem, protocol, capsys, error_log_level):
    with Flow(
        protocol=protocol,
        uvicorn_kwargs=[
            'ssl_keyfile_password: abcd',
        ],
        ssl_certfile=cert_pem,
        ssl_keyfile=key_pem,
    ) as f:

        with pytest.raises(aiohttp.ClientConnectorCertificateError):
            Client(protocol=protocol, port=f.port, tls=True).index([Document()])
Example #23
def test_chained_conditions(tmpdir, temp_workspace):
    f = (
        Flow()
        .add(name='first')
        .add(
            uses=ConditionDumpExecutor,
            uses_metas={'name': 'exec1'},
            workspace=os.environ['TEMP_WORKSPACE'],
            name='exec1',
            needs=['first'],
            when={'tags__type': {'$gte': 2}},
        )
        .add(
            uses=ConditionDumpExecutor,
            workspace=os.environ['TEMP_WORKSPACE'],
            uses_metas={'name': 'exec2'},
            name='exec2',
            needs='exec1',
            when={'tags__type': {'$lte': 1}},
        )
        .needs_all('joiner')
    )

    with f:
        ret = f.post(
            on='index',
            inputs=DocumentArray([
                Document(text='type1', tags={'type': 1}),
                Document(text='type2', tags={'type': 2}),
                Document(text='type2', tags={'type': 3}),
            ]),
        )
        assert len(ret) == 0
Example #24
def test_expected_messages_routing():
    f = (
        Flow()
        .add(name='foo', uses=SimplExecutor)
        .add(name='bar', uses=MergeExecutor, needs=['foo', 'gateway'])
    )

    with f:
        docs = f.post(on='/index',
                      inputs=[Document(text='1')],
                      return_results=True)
        # the merge executor does not actually merge, despite its name
        assert len(docs) == 2
        assert docs[0].text == 'merged'
Example #25
async def test_aync_data_request_handler_new_docs(logger):
    args = set_pod_parser().parse_args(['--uses', 'AsyncNewDocsExecutor'])
    handler = DataRequestHandler(args, logger)
    req = list(
        request_generator(
            '/',
            DocumentArray([Document(text='input document')
                           for _ in range(10)])))[0]
    assert len(req.docs) == 10
    response = await handler.handle(requests=[req])

    assert len(response.docs) == 1
    assert response.docs[0].text == 'new document'
Example #26
def test_expected_messages_routing(disable_reduce):
    f = (Flow().add(name='foo', uses=SimplExecutor).add(
        name='bar',
        uses=MergeExecutor,
        needs=['foo', 'gateway'],
        disable_reduce=disable_reduce,
    ))

    with f:
        docs = f.post(on='/index', inputs=[Document(text='1')])
        # the merge executor does not actually merge, despite its name
        assert len(docs) == (2 if disable_reduce else 1)
        assert docs[0].text == ('merged' if disable_reduce else '1')
Example #27
    def rank(
        self, docs_matrix: List['DocumentArray'], parameters: Dict, **kwargs
    ) -> 'DocumentArray':
        """
        :param docs_matrix: list of :class:`DocumentArray` on multiple requests to
          get bubbled up matches.
        :param parameters: the parameters passed into the ranker, in this case stores :attr`top_k`
          to filter k results based on score.
        :param kwargs: not used (kept to maintain interface)
        """

        result_da = DocumentArray()  # length: 1 as every time there is only one query
        for d_mod1, d_mod2 in zip(*docs_matrix):

            final_matches = {}  # type: Dict[str, Document]
            for m in d_mod1.matches:
                relevance_score = m.scores['cosine'].value * d_mod1.weight
                m.scores['relevance'].value = relevance_score
                final_matches[m.parent_id] = Document(m, copy=True)

            for m in d_mod2.matches:
                if m.parent_id in final_matches:
                    final_matches[m.parent_id].scores[
                        'relevance'
                    ].value = final_matches[m.parent_id].scores['relevance'].value + (
                        m.scores['cosine'].value * d_mod2.weight
                    )
                else:
                    m.scores['relevance'].value = (
                        m.scores['cosine'].value * d_mod2.weight
                    )
                    final_matches[m.parent_id] = Document(m, copy=True)

            da = DocumentArray(list(final_matches.values()))
            da = sorted(da, key=lambda ma: ma.scores['relevance'].value, reverse=True)
            d = Document(matches=da[: int(parameters['top_k'])])
            result_da.append(d)
        return result_da
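To make the weighting concrete, a toy input (ids, weights and scores assumed) where a match seen by both modalities outranks one seen only by the first:

from docarray import Document, DocumentArray

q1, q2 = Document(weight=0.6), Document(weight=0.4)
q1.matches.extend([Document(parent_id='a'), Document(parent_id='b')])
for m in q1.matches:
    m.scores['cosine'].value = 0.9
q2.matches.append(Document(parent_id='a'))
q2.matches[0].scores['cosine'].value = 0.5

# 'a' scores 0.9 * 0.6 + 0.5 * 0.4 = 0.74 while 'b' scores only 0.54, so
# rank([DocumentArray([q1]), DocumentArray([q2])], {'top_k': 1}) would keep
# 'a' as the single match of the returned result Document.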
Example #28
def test_lazy_serialization():
    doc_count = 1000
    r = DataRequest()
    da = r.docs
    da.extend([Document(text='534534534er5yr5y645745675675675345')] *
              doc_count)
    r.data.docs = da
    byte_array = DataRequestProto.SerializeToString(r)

    deserialized_request = DataRequestProto.FromString(byte_array)
    assert not deserialized_request.is_decompressed
    assert len(deserialized_request.docs) == doc_count
    assert deserialized_request.docs == r.docs
    assert deserialized_request.is_decompressed
Example #29
def get_groundtruths(target, pseudo_match=False):
    # group doc_ids by their labels
    a = np.squeeze(target['index-labels']['data'])
    a = np.stack([a, np.arange(len(a))], axis=1)
    a = a[a[:, 0].argsort()]
    lbl_group = np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])

    # each label has one groundtruth, i.e. all docs that have the same label are considered as matches
    groundtruths = {lbl: Document(tags={'id': -1}) for lbl in range(10)}
    for lbl, doc_ids in enumerate(lbl_group):
        if not pseudo_match:
            # full-match, each doc has 6K matches
            for doc_id in doc_ids:
                match = Document()
                match.tags['id'] = int(doc_id)
                groundtruths[lbl].matches.append(match)
        else:
            # pseudo-match, each doc has only one match, but this match's id is a list of 6k elements
            match = Document()
            match.tags['id'] = doc_ids.tolist()
            groundtruths[lbl].matches.append(match)

    return groundtruths
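The sort-and-split grouping above is compact but cryptic; a tiny worked example on toy labels (not the real label file) shows what ends up in lbl_group:

import numpy as np

labels = np.array([1, 0, 1, 0])  # label per doc id 0..3
a = np.stack([labels, np.arange(len(labels))], axis=1)
a = a[a[:, 0].argsort()]  # sort rows by label
groups = np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])
# groups == [array([1, 3]), array([0, 2])]: docs 1 and 3 share label 0, docs 0 and 2 share label 1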
Example #30
def test_uvicorn_ssl_with_flow(cert_pem, key_pem, protocol, capsys):
    with Flow(
        protocol=protocol,
        uvicorn_kwargs=[
            f'ssl_certfile: {cert_pem}',
            f'ssl_keyfile: {key_pem}',
            'ssl_keyfile_password: abcd',
        ],
    ) as f:
        os.environ['JINA_LOG_LEVEL'] = 'ERROR'
        Client(protocol=protocol, port=f.port_expose, https=True).index([Document()])
        assert (
            '''certificate verify failed: self signed certificate'''
            in capsys.readouterr().out
        )