Exemple #1
0
def test_compression(compress_algo, low_bytes, high_ratio):
    """Compare dumped message sizes with and without ``compress_algo``.

    Each generated request is dumped once with ``CompressAlgo.NONE`` and once
    with ``compress_algo``. The compressed total must be strictly smaller
    than the baseline unless the algorithm is ``NONE`` or one of
    ``low_bytes`` / ``high_ratio`` is truthy, in which case it must not be
    smaller.
    """
    docs = list(random_docs(100, embed_dim=100))
    no_comp_sizes = []
    sizes = []

    def _message_kwargs():
        # Reads ``no_comp_sizes`` at call time, so the byte threshold depends
        # on how many baseline sizes have been recorded so far.
        min_bytes = 2 * sum(no_comp_sizes) if low_bytes else 0
        min_ratio = 10 if high_ratio else 1
        return {
            'identity': 'gateway',
            'pod_name': '123',
            'compress_min_bytes': min_bytes,
            'compress_min_ratio': min_ratio,
        }

    # No sizes are recorded yet, so the byte threshold evaluates to 0 here.
    baseline_kwargs = _message_kwargs()
    with TimeContext('no compress'):
        for req in _generate(docs):
            msg = Message(None,
                          req,
                          compress=CompressAlgo.NONE,
                          **baseline_kwargs)
            msg.dump()
            no_comp_sizes.append(msg.size)

    # Rebuilt now that the baseline sizes are known.
    compress_kwargs = _message_kwargs()
    with TimeContext(f'compressing with {str(compress_algo)}') as tc:
        for req in _generate(docs):
            msg = Message(None, req, compress=compress_algo, **compress_kwargs)
            msg.dump()
            sizes.append(msg.size)

    total = sum(sizes)
    baseline_total = sum(no_comp_sizes)
    expect_smaller = not (compress_algo == CompressAlgo.NONE or low_bytes
                          or high_ratio)
    if expect_smaller:
        assert total < baseline_total
    else:
        assert total >= baseline_total

    mean_size = total / len(sizes)
    ratio = baseline_total / total
    print(
        f'{str(compress_algo)}: size {mean_size} (ratio: {ratio:.2f}) with {tc.duration:.2f}s'
    )
Exemple #2
0
def test_request_generate_dict():
    """Check that dict inputs to ``_generate`` surface as documents with matching fields."""

    def random_docs(num_docs):
        # Yield ``num_docs`` dict documents numbered from 1, each with two chunks.
        for doc_no in range(1, num_docs + 1):
            chunks = [
                {'text': "i'm chunk 1", 'modality': 'text'},
                {'text': "i'm chunk 2", 'modality': 'image'},
            ]
            yield {
                'text': f"i'm dummy doc {doc_no}",
                'offset': 1000,
                'tags': {'id': 1000},
                'chunks': chunks,
            }

    request_iter = _generate(data=random_docs(100), request_size=100)
    first_request = next(request_iter)
    docs = first_request.index.docs
    assert len(docs) == 100

    # (modality, text) expected for each chunk, in order.
    expected_chunks = [('text', "i'm chunk 1"), ('image', "i'm chunk 2")]
    for position, doc in enumerate(docs, start=1):
        assert doc.text == f"i'm dummy doc {position}"
        assert doc.offset == 1000
        assert doc.tags['id'] == 1000
        assert len(doc.chunks) == 2
        for chunk, (modality, text) in zip(doc.chunks, expected_chunks):
            assert chunk.modality == modality
            assert chunk.text == text
Exemple #3
0
def test_lazy_msg_access():
    """Check when ``Message.request.is_used`` flips for different kinds of access.

    Reading ``envelope`` or ``request`` and calling ``dump()`` must leave
    ``request.is_used`` falsy; reading ``request.index.docs`` must make it
    truthy. ``dump()`` is expected to return three parts every time.
    """
    messages = []
    for req in _generate(random_docs(10)):
        messages.append(
            Message(None,
                    req.SerializeToString(),
                    'test',
                    '123',
                    request_id='123',
                    request_type='IndexRequest'))

    # First pass touches ``envelope`` on every message, second pass touches
    # ``request``; neither may mark the request as used.
    for attr in ('envelope', 'request'):
        for msg in messages:
            assert not msg.request.is_used
            assert getattr(msg, attr)
            assert len(msg.dump()) == 3
            assert not msg.request.is_used

    # Reaching into the request body is what flips the flag.
    for msg in messages:
        assert not msg.request.is_used
        assert msg.request.index.docs
        assert len(msg.dump()) == 3
        assert msg.request.is_used
Exemple #4
0
def test_lazy_append_access():
    """Appending to ``docs`` must turn ``is_used`` from falsy to truthy."""
    for raw in _generate(random_docs(10)):
        request = Request(raw.SerializeToString(), EnvelopeProto())
        assert not request.is_used
        # write through the ``docs`` accessor
        request.docs.append(Document())
        # the request now reports itself as used
        assert request.is_used
Exemple #5
0
def test_lazy_nested_clear_access():
    """Clearing ``index.docs`` must turn ``is_used`` from falsy to truthy."""
    for raw in _generate(random_docs(10)):
        request = Request(raw.SerializeToString(), EnvelopeProto())
        assert not request.is_used
        # write access on the nested ``index`` body
        request.index.ClearField('docs')
        # the request now reports itself as used
        assert request.is_used
Exemple #6
0
def test_lazy_change_message_type():
    """Setting ``control.command`` must mark the request used and leave ``index.docs`` empty."""
    idle = jina_pb2.RequestProto.ControlRequestProto.IDLE
    for raw in _generate(random_docs(10)):
        request = Request(raw.SerializeToString(), EnvelopeProto())
        assert not request.is_used
        # write to the ``control`` body instead of ``index``
        request.control.command = idle
        # the request now reports itself as used
        assert request.is_used
        assert len(request.index.docs) == 0
Exemple #7
0
def test_lazy_nest_access():
    """Writing ``docs[0].id`` must mark the request used and be visible via ``index.docs``."""
    new_id = '1' * 16
    for raw in _generate(random_docs(10)):
        request = Request(raw.SerializeToString(), EnvelopeProto())
        assert not request.is_used
        # nested write through the ``docs`` accessor
        request.docs[0].id = new_id
        # the request now reports itself as used
        assert request.is_used
        assert request.index.docs[0].id == new_id
Exemple #8
0
def test_lazy_access(field):
    """Reading the attribute named ``field`` must turn ``is_used`` from falsy to truthy.

    ``field`` is presumably supplied by a pytest parametrization that is not
    visible in this excerpt.
    """
    for raw in _generate(random_docs(10)):
        request = Request(raw.SerializeToString(), EnvelopeProto())
        assert not request.is_used

        # read access to the named attribute
        value = getattr(request, field)
        print(value)

        # the request now reports itself as used
        assert request.is_used
Exemple #9
0
def test_message_size():
    """Check ``Message.size`` is 0 before ``dump()`` and exceeds the parts' sizes afterwards."""
    messages = []
    for req in _generate(random_docs(10)):
        messages.append(Message(None, req, 'test', '123'))

    for msg in messages:
        assert msg.size == 0
        assert sys.getsizeof(msg.envelope.SerializeToString())
        assert sys.getsizeof(msg.request.SerializeToString())
        assert len(msg.dump()) == 3

        # Read ``size`` first, then re-serialize both parts after the dump.
        dumped_size = msg.size
        envelope_bytes = sys.getsizeof(msg.envelope.SerializeToString())
        request_bytes = sys.getsizeof(msg.request.SerializeToString())
        assert dumped_size > envelope_bytes + request_bytes
Exemple #10
0
def test_request_generate_lines_from_list():
    """Check that a list of text lines becomes one request of plain-text documents."""
    num_lines = 100
    lines = [f"i'm dummy doc {line_no}" for line_no in range(1, num_lines + 1)]

    request_iter = _generate(data=lines, request_size=100)
    first_request = next(request_iter)
    docs = first_request.index.docs

    assert len(docs) == 100
    for position, doc in enumerate(docs, start=1):
        assert doc.length == 100
        assert doc.mime_type == 'text/plain'
        assert doc.text == f"i'm dummy doc {position}"
Exemple #11
0
def test_request_generate_lines_with_fake_url():
    """Check that lines starting with a URL followed by text are kept as plain text."""
    # Lazily produced lines, numbered from 1.
    lines = (f"https://github.com i'm dummy doc {line_no}"
             for line_no in range(1, 100 + 1))

    request_iter = _generate(data=lines, request_size=100)
    first_request = next(request_iter)
    docs = first_request.index.docs

    assert len(docs) == 100
    for position, doc in enumerate(docs, start=1):
        assert doc.length == 100
        assert doc.mime_type == 'text/plain'
        assert doc.text == f"https://github.com i'm dummy doc {position}"
Exemple #12
0
def test_request_generate_bytes():
    """Check that generated text lines become one request of plain-text documents."""

    def random_lines(num_lines):
        # Yield ``num_lines`` strings numbered from 1.
        yield from (f"i'm dummy doc {line_no}"
                    for line_no in range(1, num_lines + 1))

    # NOTE(review): other tests in this file pass ``request_size``; confirm
    # that ``batch_size`` is the keyword ``_generate`` expects here.
    request_iter = _generate(data=random_lines(100), batch_size=100)
    first_request = next(request_iter)
    docs = first_request.index.docs

    assert len(docs) == 100
    for position, doc in enumerate(docs, start=1):
        assert doc.length == 100
        assert doc.text == f"i'm dummy doc {position}"
        assert doc.mime_type == 'text/plain'
Exemple #13
0
def test_multiple_access():
    """Truth-testing a request must not mark it used; reading ``index`` must."""
    requests = []
    for raw in _generate(random_docs(10)):
        requests.append(Request(raw.SerializeToString(), EnvelopeProto()))

    # Pass 1: truthiness of the request itself leaves the flag untouched.
    for request in requests:
        assert not request.is_used
        assert request
        assert not request.is_used

    # Pass 2: reading the ``index`` body flips the flag.
    for request in requests:
        assert not request.is_used
        assert request.index
        assert request.is_used
Exemple #14
0
def test_request_generate_numpy_arrays():
    """Check that a 10x10 array is split into two requests of five row-documents each."""
    input_array = np.random.random([10, 10])
    request_iter = _generate(data=input_array, request_size=5)

    # Both requests are checked the same way.
    for _ in range(2):
        request = next(request_iter)
        docs = request.index.docs
        assert len(docs) == 5
        for doc in docs:
            assert doc.length == 5
            assert NdArray(doc.blob).value.shape == (10,)
Exemple #15
0
def test_request_generate_docs():
    """Check that ``DocumentProto`` inputs keep their text, offset and mime type."""

    def random_docs(num_docs):
        # Yield ``num_docs`` prebuilt ``DocumentProto`` objects numbered from 1.
        for doc_no in range(1, num_docs + 1):
            proto = jina_pb2.DocumentProto()
            proto.text = f"i'm dummy doc {doc_no}"
            proto.offset = 1000
            proto.tags['id'] = 1000  # this will be ignored
            proto.mime_type = 'mime_type'
            yield proto

    request_iter = _generate(data=random_docs(100), request_size=100)
    first_request = next(request_iter)
    docs = first_request.index.docs

    assert len(docs) == 100
    for position, doc in enumerate(docs, start=1):
        assert doc.length == 100
        assert doc.mime_type == 'mime_type'
        assert doc.text == f"i'm dummy doc {position}"
        assert doc.offset == 1000
Exemple #16
0
def test_request_generate_numpy_arrays_iterator():
    """Check that rows yielded from a generator are batched into two requests of five."""
    input_array = np.random.random([10, 10])

    def generator():
        # Yield the array's rows one at a time.
        yield from input_array

    # NOTE(review): other tests in this file pass ``request_size``; confirm
    # that ``batch_size`` is the keyword ``_generate`` expects here.
    request_iter = _generate(data=generator(), batch_size=5)

    # Both requests are checked the same way.
    for _ in range(2):
        request = next(request_iter)
        docs = request.index.docs
        assert len(docs) == 5
        for doc in docs:
            assert doc.length == 5
            assert NdArray(doc.blob).value.shape == (10, )
Exemple #17
0
def test_lazy_request_fields():
    """Check that each request exposes a non-empty ``DESCRIPTOR.fields_by_name``."""
    for raw in _generate(random_docs(10)):
        request = Request(raw.SerializeToString(), EnvelopeProto())
        field_names = list(request.DESCRIPTOR.fields_by_name.keys())
        assert field_names