Beispiel #1
0
class MyIndexer(Executor):
    """Indexer that keeps its documents in a SQLite-backed DocumentArray."""

    def __init__(self, **kwargs):
        """Create the document store inside the executor workspace.

        :param kwargs: keyword arguments forwarded to the parent constructor
        """
        super().__init__(**kwargs)
        self.table_name = 'qabot_docs'
        db_path = os.path.join(self.workspace, 'indexer.db')
        store_config = {'connection': db_path, 'table_name': self.table_name}
        self._docs = DocumentArray(storage='sqlite', config=store_config)

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add the incoming documents to the store.

        :param docs: documents to be indexed
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        """Match each query document against the stored documents.

        The match uses the cosine metric and is limited to one result per
        query document.

        :param docs: documents that are searched
        :param kwargs: other keyword arguments
        """
        match_options = {
            'metric': 'cosine',
            'normalization': (1, 0),
            'limit': 1,
        }
        docs.match(self._docs, **match_options)
Beispiel #2
0
def test_document_processed_total(port_generator, executor):
    """Check that ``jina_document_processed_total`` is counted per endpoint.

    Documents are posted to ``/foo`` and then to ``/bar``; after each post the
    metrics page of the executor is fetched again, because a response fetched
    before a post cannot reflect the documents processed by that post.

    :param port_generator: fixture returning a fresh port on every call
    :param executor: fixture providing the executor used by the Flow
    """
    port0 = port_generator()
    port1 = port_generator()

    def _metric(endpoint):
        # Metric name plus label set for one endpoint, without the value.
        return (
            'jina_document_processed_total{executor="DummyExecutor",'
            f'executor_endpoint="{endpoint}",runtime_name="executor0/rep-0"}}'
        )

    def _scrape():
        # The executor exposes its metrics on its own monitoring port.
        resp = req.get(f'http://localhost:{port1}/')
        assert resp.status_code == 200
        return str(resp.content)

    with Flow(monitoring=True,
              port_monitoring=port0).add(uses=executor,
                                         port_monitoring=port1) as f:

        _scrape()  # the metrics endpoint must be reachable before any request

        f.post(
            '/foo',
            inputs=DocumentArray.empty(size=4))  # process 4 documents on foo

        metrics = _scrape()
        # 4 documents have been counted on foo
        assert f'{_metric("/foo")} 4.0' in metrics
        # bar has not been called yet, so no counter exists for it
        assert _metric('/bar') not in metrics

        f.post(
            '/bar',
            inputs=DocumentArray.empty(size=5))  # process 5 documents on bar

        # Fetch again: the previous page was produced before the post on bar.
        metrics = _scrape()
        # 5 documents have been counted on bar
        assert f'{_metric("/bar")} 5.0' in metrics
        # the count on foo is unchanged
        assert f'{_metric("/foo")} 4.0' in metrics
Beispiel #3
0
        def foo(self, docs: DocumentArray, **kwargs):
            """Set the text of the documents in ``docs`` to ``'hello'``.

            :param docs: documents to update through ``docs.apply``
            :param kwargs: other keyword arguments
            :return: the ``docs`` object that was passed in
            """

            def _say_hello(doc: Document):
                doc.text = 'hello'
                return doc

            docs.apply(_say_hello)
            return docs
Beispiel #4
0
def test_requests_size(port_generator, executor):
    """Check that the request-size metrics are exposed and grow with requests.

    :param port_generator: fixture returning a fresh port on every call
    :param executor: fixture providing the executor used by the Flow
    """
    port0 = port_generator()
    port1 = port_generator()

    # Metrics are read from the executor's monitoring port.
    metrics_url = f'http://localhost:{port1}/'

    with Flow(monitoring=True,
              port_monitoring=port0).add(uses=executor,
                                         port_monitoring=port1) as f:

        f.post('/foo', inputs=DocumentArray.empty(size=1))

        resp = req.get(metrics_url)
        assert resp.status_code == 200

        assert (
            f'jina_request_size_bytes_count{{executor="DummyExecutor",executor_endpoint="/foo",runtime_name="executor0/rep-0"}} 1.0'
            in str(resp.content))

        def _get_request_bytes_size():
            """Return the current value of ``jina_request_size_bytes_sum``."""
            resp = req.get(metrics_url)

            # ``str(bytes)`` escapes newlines, hence the literal backslash-n.
            resp_lines = str(resp.content).split('\\n')
            byte_line = [
                line for line in resp_lines if
                'jina_request_size_bytes_sum{executor="DummyExecutor"' in line
            ]

            # The value is the last space-separated field of the metric line;
            # a fixed-width slice would truncate values with more digits.
            return float(byte_line[0].rsplit(' ', 1)[-1])

        measured_request_bytes_sum_init = _get_request_bytes_size()
        f.post('/foo', inputs=DocumentArray.empty(size=1))
        measured_request_bytes_sum = _get_request_bytes_size()

        assert measured_request_bytes_sum > measured_request_bytes_sum_init
Beispiel #5
0
class KeyValueIndexer(Executor):
    """Executor that stores documents and resolves matches by ``parent_id``."""

    def __init__(self, *args, **kwargs):
        """Load the stored documents from the workspace if they exist.

        :param args: positional arguments forwarded to the parent constructor
        :param kwargs: keyword arguments forwarded to the parent constructor
        """
        super().__init__(*args, **kwargs)
        index_path = self.workspace + '/kv-idx'
        self._docs = (
            DocumentArray.load(index_path)
            if os.path.exists(index_path)
            else DocumentArray()
        )

    @requests(on='/index')
    def index(self, docs: DocumentArray, **kwargs):
        """Add the incoming documents to the store.

        :param docs: documents to be stored
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def query(self, docs: DocumentArray, **kwargs):
        """Replace the matches of each document with the stored documents.

        Each match is looked up in the store by its ``parent_id``; the stored
        document receives the scores of the match and takes its place.

        :param docs: documents whose matches are resolved
        :param kwargs: other keyword arguments
        """
        for query_doc in docs:
            resolved = DocumentArray()
            for hit in query_doc.matches:
                stored = self._docs[hit.parent_id]
                stored.scores = hit.scores
                resolved.append(stored)
            query_doc.matches = resolved

    def close(self):
        """Save the stored documents to the workspace."""
        target = self.workspace + '/kv-idx'
        self._docs.save(target)
Beispiel #6
0
class MyIndexer(Executor):
    """Indexer that keeps its documents in memory and saves them on close."""

    def __init__(self, **kwargs):
        """Load the previously saved documents from the workspace, if any.

        :param kwargs: keyword arguments forwarded to the parent constructor
        """
        super().__init__(**kwargs)
        index_path = self.workspace + '/indexer'
        if not os.path.exists(index_path):
            self._docs = DocumentArray()
        else:
            self._docs = DocumentArray.load(index_path)

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add the incoming documents to the index.

        :param docs: documents to be indexed
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        """Match each query document against the indexed documents.

        The match uses the cosine metric and is limited to one result per
        query document.

        :param docs: documents that are searched
        :param kwargs: other keyword arguments
        """
        match_options = {
            'metric': 'cosine',
            'normalization': (1, 0),
            'limit': 1,
        }
        docs.match(self._docs, **match_options)

    def close(self):
        """Save the indexed documents to the workspace."""
        target = self.workspace + '/indexer'
        self._docs.save(target)
Beispiel #7
0
 def __init__(self, index_file_name: str, **kwargs):
     """Load the documents saved under ``index_file_name``, if the file exists.

     :param index_file_name: name of the index file inside the workspace
     :param kwargs: keyword arguments forwarded to the parent constructor
     """
     super().__init__(**kwargs)
     self._index_file_name = index_file_name
     index_path = self.workspace + f'/{index_file_name}'
     if not os.path.exists(index_path):
         self._docs = DocumentArray()
     else:
         self._docs = DocumentArray.load(index_path)
Beispiel #8
0
class DocVectorIndexer(Executor):
    """Indexer that matches query documents against stored documents."""

    def __init__(self, index_file_name: str, **kwargs):
        """Load the documents saved under ``index_file_name``, if the file exists.

        :param index_file_name: name of the index file inside the workspace
        :param kwargs: keyword arguments forwarded to the parent constructor
        """
        super().__init__(**kwargs)
        self._index_file_name = index_file_name
        index_path = self.workspace + f'/{index_file_name}'
        self._docs = (
            DocumentArray.load(index_path)
            if os.path.exists(index_path)
            else DocumentArray()
        )

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add the incoming documents to the index.

        :param docs: documents to be indexed
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Match each query document against the indexed documents.

        :param docs: documents that are searched
        :param parameters: request parameters; ``top_k`` sets the match limit
        :param kwargs: other keyword arguments
        """
        top_k = int(parameters['top_k'])
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=top_k,
        )

    def close(self):
        """Save the indexed documents to the workspace."""
        target = self.workspace + f'/{self._index_file_name}'
        self._docs.save(target)
Beispiel #9
0
    async def aencode(self, content, **kwargs):
        """Post ``content`` with the async client and collect the responses.

        :param content: content handed to ``_get_post_payload``
        :param kwargs: extra options handed to ``_get_post_payload``
        :return: the embeddings of the collected documents when
            ``_return_plain`` is set, otherwise the collected documents
        """
        from docarray import DocumentArray

        collected = DocumentArray()
        payload = self._get_post_payload(content, kwargs)
        async for batch in self._async_client.post(**payload):
            collected.extend(batch)

        if self._return_plain:
            return collected.embeddings
        return collected
Beispiel #10
0
 def query(self, docs: DocumentArray, **kwargs):
     """Replace the matches of each document with the stored documents.

     Each match is looked up in ``self._docs`` by its ``parent_id``; the
     stored document receives the scores of the match and takes its place.

     :param docs: documents whose matches are resolved
     :param kwargs: other keyword arguments
     """
     for query_doc in docs:
         resolved = DocumentArray()
         for hit in query_doc.matches:
             stored = self._docs[hit.parent_id]
             stored.scores = hit.scores
             resolved.append(stored)
         query_doc.matches = resolved
Beispiel #11
0
 def __init__(self, **kwargs):
     """Create the SQLite-backed document store inside the workspace.

     :param kwargs: keyword arguments forwarded to the parent constructor
     """
     super().__init__(**kwargs)
     self.table_name = 'qabot_docs'
     db_path = os.path.join(self.workspace, 'indexer.db')
     store_config = {'connection': db_path, 'table_name': self.table_name}
     self._docs = DocumentArray(storage='sqlite', config=store_config)
Beispiel #12
0
 def craft(self, docs: DocumentArray, **kwargs):
     """Load and reshape the JPEG documents found under traversal path ``'c'``.

     :param docs: documents whose ``'c'`` traversal is filtered by MIME type
     :param kwargs: other keyword arguments
     :return: the documents with MIME type ``image/jpeg``, after their image
         tensors were loaded, resized and had their channel axis changed
     """
     target_size = 224
     shape = (target_size, target_size)
     jpeg_docs = DocumentArray(
         candidate
         for candidate in docs.traverse_flat('c')
         if candidate.mime_type == 'image/jpeg'
     )
     for image_doc in jpeg_docs:
         image_doc.load_uri_to_image_tensor()
         image_doc.set_image_tensor_shape(shape=shape)
         image_doc.set_image_tensor_channel_axis(-1, 0)
     return jpeg_docs
Beispiel #13
0
        def docs(self) -> 'DocumentArray':
            """Return the documents of the request content as a DocumentArray.

            The array is decoded from ``docs_bytes`` or from the ``docs``
            protobuf field, whichever the ``documents`` oneof holds, and the
            result is kept in ``_loaded_doc_array``.

            .. # noqa: DAR201"""
            if self._loaded_doc_array:
                return self._loaded_doc_array

            if self._content.WhichOneof('documents') == 'docs_bytes':
                loaded = DocumentArray.from_bytes(self._content.docs_bytes)
            else:
                loaded = DocumentArray.from_protobuf(self._content.docs)
            self._loaded_doc_array = loaded

            return self._loaded_doc_array
Beispiel #14
0
    async def dry_run(self, empty, context) -> jina_pb2.StatusProto:
        """
        Stream a dry-run request built from an empty DocumentArray and report the outcome

        :param empty: The service expects an empty protobuf message
        :param context: grpc context
        :returns: a status proto with the SUCCESS code, or one carrying the raised exception
        """
        from docarray import DocumentArray
        from jina.clients.request import request_generator
        from jina.enums import DataInputType
        from jina.serve.executors import __dry_run_endpoint__

        empty_docs = DocumentArray()

        try:
            dry_run_requests = request_generator(
                exec_endpoint=__dry_run_endpoint__,
                data=empty_docs,
                data_type=DataInputType.DOCUMENT,
            )
            # The responses are not needed; the stream only has to complete.
            async for _response in self.streamer.stream(
                    request_iterator=dry_run_requests):
                pass
            success = StatusMessage()
            success.set_code(jina_pb2.StatusProto.SUCCESS)
            return success.proto
        except Exception as error:
            failure = StatusMessage()
            failure.set_exception(error)
            return failure.proto
Beispiel #15
0
 def encode(self, docs: DocumentArray, **kwargs):
     """Compute embeddings from ``docs.tensors`` and store them on ``docs``.

     :param docs: documents whose tensors are encoded
     :param kwargs: other keyword arguments
     """
     with torch.inference_mode():
         batch = torch.from_numpy(docs.tensors.astype('float32'))
         raw_features = self._get_features(batch).detach().numpy()
         docs.embeddings = self._get_pooling(raw_features)
Beispiel #16
0
def test_decorator_interface(port_generator):
    """Check that methods decorated with ``monitor`` show up in the metrics.

    :param port_generator: fixture returning a fresh port on every call
    """

    class DummyExecutor(Executor):
        @requests(on='/foo')
        def foo(self, docs, **kwargs):
            self._proces(docs)
            self.proces_2(docs)

        @monitor(name='metrics_name', documentation='metrics description')
        def _proces(self, docs):
            ...

        @monitor()
        def proces_2(self, docs):
            ...

    port = port_generator()
    flow = Flow(monitoring=True, port_monitoring=port_generator()).add(
        uses=DummyExecutor,
        monitoring=True,
        port_monitoring=port,
    )
    with flow as f:
        f.post('/foo', inputs=DocumentArray.empty(4))

        resp = req.get(f'http://localhost:{port}/')
        expected_lines = (
            'jina_metrics_name_count{runtime_name="executor0/rep-0"} 1.0',
            'jina_proces_2_seconds_count{runtime_name="executor0/rep-0"} 1.0',
        )
        for expected in expected_lines:
            assert expected in str(resp.content)
Beispiel #17
0
def test_app_models_acceptance(docs_input):
    """Post ``docs_input`` as JSON to an HTTP Flow and check the returned text.

    :param docs_input: fixture providing the JSON payload
    """
    flow = Flow(protocol='http').add()

    with flow:
        index_url = f'http://localhost:{flow.port}/index'
        response = req.post(index_url, json=docs_input)

    returned_docs = DocumentArray.from_dict(response.json()['data'])
    assert returned_docs[0].text == 'text_input'
Beispiel #18
0
def test_conditions_filtering(tmpdir, flow):
    """Check that each document is processed by the executor for its type.

    :param tmpdir: temporary directory holding the executors' output files
    :param flow: fixture providing the Flow under test
    """
    request_docs = DocumentArray([
        Document(text='type1', tags={'type': 1}),
        Document(text='type2', tags={'type': 2}),
    ])

    with flow:
        ret = flow.post(on='index', inputs=request_docs)
        assert len(ret) == 2

        seen_types = set()
        for doc in ret:
            doc_type = doc.tags['type']
            if doc_type == 1:
                assert doc.text == 'type1 processed by exec1'
            else:
                assert doc.tags['type'] == 2
                assert doc.text == 'type2 processed by exec2'
            seen_types.add(doc.tags['type'])

        assert seen_types == {1, 2}

    # Each executor is expected to have written the text it received.
    written_by_executor = (('exec1', 'type1'), ('exec2', 'type2'))
    for exec_name, expected_text in written_by_executor:
        output_path = os.path.join(
            str(tmpdir), exec_name, '0', f'{exec_name}.txt')
        with open(output_path, 'r') as fp:
            assert fp.read() == expected_text
Beispiel #19
0
def test_client_on_error_call(protocol, exception):
    """Check that posting to port 12345 raises the expected exception.

    :param protocol: protocol used by the client
    :param exception: exception type expected to be raised
    """
    with pytest.raises(exception):
        client = Client(host='0.0.0.0', protocol=protocol, port=12345)
        client.post('/blah', inputs=DocumentArray.empty(10))
Beispiel #20
0
def test_executor_load_from_hub():
    """Load ``DummyHubExecutor`` from the hub and check its output and name."""
    hub_executor = Executor.from_hub(
        'jinahub://DummyHubExecutor',
        uses_metas={'name': 'hello123'},
    )
    single_doc = DocumentArray([Document()])
    hub_executor.foo(single_doc)
    assert single_doc.texts == ['hello']
    assert hub_executor.metas.name == 'hello123'
Beispiel #21
0
def test_client_host_scheme(protocol):
    """Check that a client can be built from a ``scheme://host:port`` string.

    :param protocol: scheme used in the client host string
    """
    port = random_port()
    # The Flow is given 'websocket' where the host scheme is 'ws'.
    flow_protocol = protocol
    if protocol == 'ws':
        flow_protocol = 'websocket'

    with Flow(protocol=flow_protocol, port=port).add():
        client = Client(host=f'{protocol}://localhost:{port}')
        client.post('/', inputs=DocumentArray.empty(2))
Beispiel #22
0
def test_healthcheck_logs_websocket_with_env(capfd, health_check_env):
    """Check that a GET on ``/`` is not logged with the health-check env set.

    :param capfd: pytest fixture capturing output
    :param health_check_env: fixture preparing the environment
    """
    with Flow(protocol='websocket', port=12345).add() as flow:
        flow.post('/', inputs=DocumentArray.empty())
        req.get('http://localhost:12345/')

    captured_out, _ = capfd.readouterr()
    assert '"GET / HTTP/1.1" 200 OK' not in captured_out
Beispiel #23
0
def test_grpc_compression(compression_client, compression_gateway):
    """Post one document with the given client and gateway compression.

    :param compression_client: compression passed to the ``post`` call
    :param compression_gateway: compression passed to the Flow
    """
    flow = Flow(grpc_compression=compression_gateway).add().add()
    with flow:
        result = flow.post(
            on='/',
            inputs=DocumentArray([Document()]),
            grpc_compression=compression_client,
        )
    assert len(result) == 1
Beispiel #24
0
        def docs(self, value: DocumentArray):
            """Override the DocumentArray with the provided one

            An empty DocumentArray is a valid value and replaces the current
            content; only ``None`` leaves the content untouched.

            :param value: a DocumentArray
            """
            # Compare against None: a truthiness test would silently ignore
            # an empty DocumentArray.
            if value is not None:
                # Drop the cached array so it cannot go stale.
                self._loaded_doc_array = None
                self._content.docs.CopyFrom(value.to_protobuf())
Beispiel #25
0
 def segment(self, docs: DocumentArray, **kwargs):
     """Split each document into a caption chunk and an image chunk.

     The caption comes from ``doc.tags['caption']``; the image URI is built
     from the ``HW_WORKDIR`` environment variable and ``doc.tags['image']``.
     The document's own URI is set to the image URI and converted to a data
     URI.

     :param docs: documents to segment
     :param kwargs: other keyword arguments
     """
     for doc in docs:
         caption = doc.tags['caption']
         image_uri = f'{os.environ["HW_WORKDIR"]}/people-img/{doc.tags["image"]}'
         caption_chunk = Document(text=caption, mime_type='text/plain')
         image_chunk = Document(uri=image_uri, mime_type='image/jpeg')
         doc.chunks = DocumentArray([caption_chunk, image_chunk])
         doc.uri = image_uri
         doc.convert_uri_to_datauri()
Beispiel #26
0
def hello_world(args):
    """
    Execute the chatbot example.

    Creates the working directory, downloads the CSV dataset, indexes it with
    the Flow returned by ``_get_flow`` and tries to open the demo page in a
    browser. Unless ``args.unblock_query_flow`` is set, the Flow then blocks.

    :param args: arguments passed from CLI
    """
    Path(args.workdir).mkdir(parents=True, exist_ok=True)

    with ImportExtensions(
            required=True,
            help_text=
            'this demo requires Pytorch and Transformers to be installed, '
            'if you haven\'t, please do `pip install jina[torch,transformers]`',
    ):
        import torch
        import transformers

        assert [torch,
                transformers]  #: prevent pycharm auto remove the above line

    targets = {
        'covid-csv': {
            'url': args.index_data_url,
            'filename': os.path.join(args.workdir, 'dataset.csv'),
        }
    }

    # download the data
    download_data(targets, args.download_proxy, task_name='download csv data')

    # now comes the real work
    # load index flow from a YAML file

    f = _get_flow(args)

    # index it!
    with f:
        f.index(
            DocumentArray.from_csv(targets['covid-csv']['filename'],
                                   field_resolver={'question': 'text'}),
            show_progress=True,
        )

        url_html_path = 'file://' + os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         'static/index.html'))
        try:
            webbrowser.open(url_html_path, new=2)
        except Exception:
            # Opening a browser is best-effort because browser support isn't
            # cross-platform. Only ordinary errors are ignored, so that
            # KeyboardInterrupt and SystemExit still propagate.
            pass
        finally:
            default_logger.info(
                f'You should see a demo page opened in your browser, '
                f'if not, you may open {url_html_path} manually')

        if not args.unblock_query_flow:
            f.block()
Beispiel #27
0
            def _sort_response_docs(response):
                """Sort the response docs according to their order in the initial request.

                Documents whose id is not in ``request_doc_ids`` are placed
                after all known ones. The sorted documents are written back
                to ``response.data.docs`` as a new DocumentArray.

                :param response: response whose ``data.docs`` are reordered
                """
                # Record the first position of every id once, instead of
                # scanning ``request_doc_ids`` for each document while sorting.
                first_position = {}
                for position, doc_id in enumerate(request_doc_ids):
                    first_position.setdefault(doc_id, position)
                # put new/unknown docs at the end
                unknown_position = len(request_doc_ids)

                def sort_by_request_order(doc):
                    return first_position.get(doc.id, unknown_position)

                sorted_docs = sorted(response.data.docs, key=sort_by_request_order)
                response.data.docs = DocumentArray(sorted_docs)
Beispiel #28
0
def test_empty_arrays(linear_flow):
    """Check that all four check tags are falsy for empty documents.

    :param linear_flow: fixture providing the Flow under test
    """
    empty_docs = DocumentArray.empty(5)

    with linear_flow as f:
        resp = f.post(on='/foo', inputs=empty_docs)

    checked_tags = (
        'listcheck_embedding',
        'listcheck_tensor',
        'nparraycheck_embedding',
        'nparraycheck_tensor',
    )
    for doc in resp:
        for tag_name in checked_tags:
            assert not doc.tags[tag_name]
Beispiel #29
0
        def set_docs_convert_arrays(self,
                                    value: DocumentArray,
                                    ndarray_type: Optional[str] = None):
            """Convert embedding and tensor to the given type, then set the DocumentArray

            Nothing happens when ``value`` is ``None``.

            :param value: a DocumentArray
            :param ndarray_type: type embedding and tensor will be converted to
            """
            if value is None:
                return

            # Drop the cached array before the content is replaced.
            self._loaded_doc_array = None
            converted = value.to_protobuf(ndarray_type=ndarray_type)
            self._content.docs.CopyFrom(converted)
Beispiel #30
0
async def test_async_apply():
    """Check that an async executor endpoint can be awaited directly."""

    class AsyncExecutor(Executor):
        @requests
        async def foo(self, docs: DocumentArray, **kwargs):
            docs.apply(set_hello)
            return docs

    doc_count = 2
    empty_docs = DocumentArray.empty(doc_count)
    executor = AsyncExecutor()
    processed = await executor.foo(empty_docs)
    assert processed.texts == ['hello'] * doc_count