Example No. 1
def test_generic():
    import numpy as np
    import pytest

    from jina.types.ndarray.generic import NdArray
    from scipy.sparse import coo_matrix

    row = np.array([0, 3, 1, 0])
    col = np.array([0, 3, 1, 2])
    data = np.array([4, 5, 7, 9])
    a = coo_matrix((data, (row, col)), shape=(4, 4))
    dense_a = a.toarray()

    b = NdArray(a, is_sparse=True)
    assert b.is_sparse
    dense_b = b.value.toarray()
    assert b.is_sparse
    np.testing.assert_equal(dense_b, dense_a)

    c = np.random.random([10, 3, 4])

    # without changing `is_sparse` first, assigning a dense array should raise an error
    with pytest.raises(AttributeError):
        b.value = c
    b.is_sparse = False
    b.value = c

    np.testing.assert_equal(b.value, c)
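The test above relies on the wrapper rejecting a dense assignment while `is_sparse` is still set. A minimal sketch of that toggle, reusing only the imports already shown (the identity matrix and the random array are arbitrary placeholders):

import numpy as np
from scipy.sparse import coo_matrix
from jina.types.ndarray.generic import NdArray

nd = NdArray(coo_matrix(np.eye(3)), is_sparse=True)  # wrap a scipy COO matrix
dense = nd.value.toarray()                           # read it back as a dense 3x3 array
nd.is_sparse = False                                 # switch mode before assigning dense data
nd.value = np.random.random([2, 4])                  # now plain ndarrays round-trip via .value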
Example No. 2
def validate_response(resp):
    assert len(resp.index.docs) == NUM_DOCS
    for i, doc in enumerate(resp.index.docs):
        np.testing.assert_equal(
            NdArray(doc.blob).value, np.array([[i] * 5, [i] * 5]))
        np.testing.assert_equal(
            NdArray(doc.embedding).value, np.array([i] * 5))
Example No. 3
def multimodal_all_types_documents():
    docs = []
    for idx in range(0, NUM_DOCS):
        """
        doc - idx
            |
            | - chunk - embedding [idx, idx] - modality1
            | - chunk - blob [idx, idx, idx] - modality2
            | - chunk - text 'modality3' - modality3 -> encoded into [3, 3] by the multimodal encoder
            | - chunk - buffer b'modality4' - modality4 -> encoded into [4, 4] by the multimodal encoder
        Result:
            doc - idx - embedding [idx, idx, idx, idx, idx, 3, 3, 4, 4]
        """
        doc = jina_pb2.DocumentProto()
        doc.text = f'{idx}'

        for modality in ['modality1', 'modality2', 'modality3', 'modality4']:
            chunk = doc.chunks.add()
            chunk.modality = modality
            if modality == 'modality1':
                NdArray(chunk.embedding).value = np.array([idx, idx])
            elif modality == 'modality2':
                NdArray(chunk.blob).value = np.array([idx, idx, idx])
            elif modality == 'modality3':
                chunk.text = 'modality3'
            elif modality == 'modality4':
                chunk.buffer = 'modality4'.encode()
        docs.append(doc)
    return docs
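The docstring above describes one document with four single-modality chunks. A rough sketch of reading those chunks back, assuming `NUM_DOCS`, `jina_pb2`, `NdArray` and `np` are defined at module level as in the original test file:

for doc in multimodal_all_types_documents():
    for chunk in doc.chunks:
        if chunk.modality == 'modality1':
            print(chunk.modality, NdArray(chunk.embedding).value)  # [idx, idx]
        elif chunk.modality == 'modality2':
            print(chunk.modality, NdArray(chunk.blob).value)       # [idx, idx, idx]
        elif chunk.modality == 'modality3':
            print(chunk.modality, chunk.text)                      # 'modality3'
        else:
            print(chunk.modality, chunk.buffer)                    # b'modality4'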
Example No. 4
def test_segment_driver():
    valid_doc = jina_pb2.DocumentProto()
    valid_doc.id = uid.new_doc_id(valid_doc)
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    driver = SimpleSegmentDriver()
    executor = MockSegmenter()
    driver.attach(executor=executor, pea=None)
    driver._apply_all([valid_doc])

    assert valid_doc.length == 2

    assert valid_doc.chunks[0].tags['id'] == 3
    assert valid_doc.chunks[0].parent_id == valid_doc.id
    np.testing.assert_equal(NdArray(valid_doc.chunks[0].blob).value, np.array([0.0, 0.0, 0.0]))
    assert valid_doc.chunks[0].weight == 0
    assert valid_doc.chunks[0].length == 3
    assert valid_doc.chunks[0].mime_type == 'text/plain'

    assert valid_doc.chunks[1].tags['id'] == 4
    assert valid_doc.chunks[1].parent_id == valid_doc.id
    np.testing.assert_equal(NdArray(valid_doc.chunks[1].blob).value, np.array([1.0, 1.0, 1.0]))
    assert valid_doc.chunks[1].weight == 1
    assert valid_doc.chunks[1].length == 3
    assert valid_doc.chunks[1].mime_type == 'image/png'

    assert valid_doc.chunks[2].tags['id'] == 5
    assert valid_doc.chunks[2].parent_id == valid_doc.id
    np.testing.assert_equal(NdArray(valid_doc.chunks[2].blob).value, np.array([2.0, 2.0, 2.0]))
    assert valid_doc.chunks[2].weight == 2
    assert valid_doc.chunks[2].length == 3
    assert valid_doc.chunks[2].mime_type == 'image/png'
Example No. 5
def index_documents():
    """Index Documents:
        doc: tag__id = 0
             tag__dummy_score = 0
             embedding = 0
        doc: tag__id = 1
             tag__dummy_score = -1
             embedding = 1
        doc: tag__id = 2
             tag__dummy_score = -2
             embedding = 2
    """
    doc0 = jina_pb2.DocumentProto()
    doc0.tags['id'] = '0'
    doc0.tags['dummy_score'] = 0
    NdArray(doc0.embedding).value = np.array([0])
    doc1 = jina_pb2.DocumentProto()
    doc1.tags['id'] = '1'
    doc1.tags['dummy_score'] = -1
    NdArray(doc1.embedding).value = np.array([1])
    doc2 = jina_pb2.DocumentProto()
    doc2.tags['id'] = '2'
    doc2.tags['dummy_score'] = -2
    NdArray(doc2.embedding).value = np.array([2])
    return [doc0, doc1, doc2]
Example No. 6
def random_docs(num_docs,
                chunks_per_doc=5,
                embed_dim=10,
                jitter=1) -> Iterator['DocumentProto']:
    warnings.warn(
        'since 0.7.11 and the introduction of the Document primitive type, this '
        'fake-doc generator has been deprecated. Use "random_docs_new_api" instead',
        DeprecationWarning)
    c_id = 3 * num_docs  # avoid collision with docs
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.tags['id'] = j
        d.text = 'hello world'
        NdArray(d.embedding).value = np.random.random(
            [embed_dim + np.random.randint(0, jitter)])
        d.id = uid.new_doc_id(d)
        for k in range(chunks_per_doc):
            c = d.chunks.add()
            c.text = 'i\'m chunk %d from doc %d' % (c_id, j)
            NdArray(c.embedding).value = np.random.random(
                [embed_dim + np.random.randint(0, jitter)])
            c.tags['id'] = c_id
            c.tags['parent_id'] = j
            c_id += 1
            c.parent_id = d.id
            c.id = uid.new_doc_id(c)
        yield d
Example No. 7
def validate_chunks_and_matches_fn(resp):
    assert len(resp.search.docs) == 1
    doc = resp.search.docs[0]
    assert int(doc.tags['id']) == 1
    assert len(doc.chunks) == 3

    chunk0 = doc.chunks[0]
    assert int(chunk0.tags['id']) == 10
    assert chunk0.text == text
    np.testing.assert_almost_equal(random_np_array, NdArray(chunk0.embedding).value)

    chunk1 = doc.chunks[1]
    assert int(chunk1.tags['id']) == 20
    np.testing.assert_almost_equal(random_np_array, NdArray(chunk1.blob).value)

    chunk2 = doc.chunks[2]
    assert int(chunk2.tags['id']) == 30
    assert chunk2.buffer == buffer

    assert len(doc.matches) == 3

    match0 = doc.matches[0]
    assert int(match0.tags['id']) == 10
    assert match0.text == text
    np.testing.assert_almost_equal(random_np_array, NdArray(match0.embedding).value)

    match1 = doc.matches[1]
    assert int(match1.tags['id']) == 20
    np.testing.assert_almost_equal(random_np_array, NdArray(match1.blob).value)

    match2 = doc.matches[2]
    assert int(match2.tags['id']) == 30
    assert match2.buffer == buffer
Example No. 8
def extract_docs(docs: Iterable['jina_pb2.DocumentProto'],
                 embedding: bool) -> Tuple:
    """Iterate over a list of protobuf documents and extract chunk-level information from them

    :param docs: an iterable of protobuf documents
    :param embedding: an indicator of extracting embedding or not.
                    If ``True`` then all doc-level embedding are extracted.
                    If ``False`` then ``text``, ``buffer``, ``blob`` info of each doc are extracted
    :return: A tuple of 3 pieces:

            - a numpy ndarray of extracted info
            - the corresponding doc references
            - the doc_id list where the doc has no contents, useful for debugging
    """
    contents = []
    docs_pts = []
    bad_doc_ids = []

    if embedding:
        _extract_fn = lambda doc: NdArray(doc.embedding).value
    else:
        _extract_fn = lambda doc: doc.text or doc.buffer or NdArray(doc.blob).value

    for doc in docs:
        content = _extract_fn(doc)

        if content is not None:
            contents.append(content)
            docs_pts.append(doc)
        else:
            bad_doc_ids.append((doc.id, doc.parent_id))

    contents = np.stack(contents) if contents else None
    return contents, docs_pts, bad_doc_ids
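A minimal usage sketch of `extract_docs`, assuming the same `np` / `jina_pb2` / `NdArray` imports as the surrounding examples, with three embedded docs and one empty doc:

docs = []
for i in range(3):
    d = jina_pb2.DocumentProto()
    NdArray(d.embedding).value = np.array([i, i])
    docs.append(d)
docs.append(jina_pb2.DocumentProto())  # a doc with no content at all

contents, doc_refs, bad_ids = extract_docs(docs, embedding=True)
# contents is a (3, 2) ndarray stacked from the embeddings,
# doc_refs holds the three docs that had an embedding,
# bad_ids holds the (id, parent_id) pair of the empty doc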
Example No. 9
def validate(req):
    mock()
    assert len(docs) == len(req.docs)
    for d, d0 in zip(req.docs, docs):
        np.testing.assert_almost_equal(
            NdArray(d.embedding).value,
            NdArray(d0.embedding).value)
Example No. 10
def validate(req):
    assert len(req.docs) == 2
    assert NdArray(req.docs[0].embedding).value.shape == (e1.shape[0] * 2,)
    assert NdArray(req.docs[1].embedding).value.shape == (e3.shape[0] * 2,)
    # assert NdArray(req.docs[0].chunks[0].embedding).value.shape == (e2.shape[0] * 2,)
    # assert NdArray(req.docs[1].chunks[0].embedding).value.shape == (e4.shape[0] * 2,)
    np.testing.assert_almost_equal(NdArray(req.docs[0].embedding).value, np.concatenate([e1, e1], axis=0),
                                   decimal=4)
Example No. 11
def ground_truth_pairs():
    num_docs = 10
    pairs = []
    for idx in range(num_docs):
        doc = jina_pb2.DocumentProto()
        gt = jina_pb2.DocumentProto()
        NdArray(doc.embedding).value = np.array([1, 1])
        NdArray(gt.embedding).value = np.array([2, 2])
        pairs.append(DocGroundtruthPair(doc=doc, groundtruth=gt))
    return pairs
Example No. 12
def test_array2pb():
    # I don't understand why this is set;
    # the os env should be available to that process-context only
    if 'JINA_ARRAY_QUANT' in os.environ:
        print(f'quant is on: {os.environ["JINA_ARRAY_QUANT"]}')
        del os.environ['JINA_ARRAY_QUANT']

    d = NdArray()
    d.value = e4
    np.testing.assert_almost_equal(d.value, e4)
Example No. 13
def test_vectorsearch_driver_mock_indexer_with_fill():
    doc = create_document_to_search()
    driver = SimpleVectorSearchDriver(top_k=2, fill_embedding=True)
    executor = MockIndexer()
    driver.attach(executor=executor, pea=None)
    driver._apply_all(doc.chunks)

    for chunk in doc.chunks:
        assert NdArray(chunk.matches[0].embedding).value.shape == (7,)
        assert NdArray(chunk.matches[-1].embedding).value.shape == (7,)
        assert NdArray(chunk.matches[-1].embedding).value is not None
Example No. 14
def _extract_doc_content(doc: 'jina_pb2.DocumentProto'):
    """Returns the content of the document with the following priority:
    If the document has an embedding, return it, otherwise return its content.
    """
    r = NdArray(doc.embedding).value
    if r is not None:
        return r
    elif doc.text or doc.buffer:
        return doc.text or doc.buffer
    else:
        return NdArray(doc.blob).value
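A short sketch of the priority order described above (embedding first, then text/buffer, then blob), assuming the same `np`, `jina_pb2` and `NdArray` imports:

d = jina_pb2.DocumentProto()
NdArray(d.blob).value = np.array([7, 7])
d.text = 'hello'
print(_extract_doc_content(d))   # 'hello': text/buffer wins over blob

NdArray(d.embedding).value = np.array([1, 2, 3])
print(_extract_doc_content(d))   # [1 2 3]: the embedding wins over everything else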
Example No. 15
def test_multimodal_driver(simple_multimodal_driver, mock_multimodal_encoder,
                           doc_with_multimodal_chunks):
    simple_multimodal_driver.attach(executor=mock_multimodal_encoder, pea=None)
    simple_multimodal_driver._apply_all([doc_with_multimodal_chunks])
    doc = doc_with_multimodal_chunks
    assert len(doc.chunks) == 3
    visual1 = doc.chunks[0]
    visual2 = doc.chunks[1]
    textual = doc.chunks[2]
    assert NdArray(doc.embedding).value.shape[0] == NdArray(visual1.embedding).value.shape[0] + \
           NdArray(visual2.embedding).value.shape[0] + NdArray(textual.embedding).value.shape[0]
Example No. 16
def input_fn():
    doc1 = DocumentProto()
    NdArray(doc1.embedding).value = e1
    c = doc1.chunks.add()
    NdArray(c.embedding).value = e2
    c.id = UniqueId(1)
    doc2 = DocumentProto()
    NdArray(doc2.embedding).value = e3
    d = doc2.chunks.add()
    d.id = UniqueId(2)
    NdArray(d.embedding).value = e4
    return [doc1, doc2]
Example No. 17
def test_index_driver():
    docs = create_documents_to_encode(10)
    driver = SimpleFillDriver()
    executor = MockIndexer()
    driver.attach(executor=executor, pea=None)
    assert len(docs) == 10
    for doc in docs:
        assert NdArray(doc.embedding).value is None
    driver._apply_all(docs)
    assert len(docs) == 10
    for doc in docs:
        assert NdArray(doc.embedding).value.shape == (5, )
Example No. 18
def input_fn():
    doc1 = DocumentProto()
    NdArray(doc1.embedding).value = e1
    c = doc1.chunks.add()
    NdArray(c.embedding).value = e2
    c.id = uid.new_doc_id(c)
    doc2 = DocumentProto()
    NdArray(doc2.embedding).value = e3
    d = doc2.chunks.add()
    d.id = uid.new_doc_id(d)
    NdArray(d.embedding).value = e4
    return [doc1, doc2]
Example No. 19
def eval_request():
    num_docs = 10
    req = jina_pb2.RequestProto.IndexRequestProto()
    for idx in range(num_docs):
        doc = req.docs.add()
        gt = req.groundtruths.add()
        chunk_doc = doc.chunks.add()
        chunk_gt = gt.chunks.add()
        chunk_doc.granularity = 1
        chunk_gt.granularity = 1
        NdArray(chunk_doc.embedding).value = np.array([1, 1])
        NdArray(chunk_gt.embedding).value = np.array([2, 2])
    return req
Example No. 20
def input_doc_with_matches():
    doc = jina_pb2.DocumentProto()
    doc.tags['id'] = 1
    match0 = doc.matches.add()
    match0.tags['id'] = 10
    match0.text = text
    NdArray(match0.embedding).value = random_np_array
    match1 = doc.matches.add()
    match1.tags['id'] = 20
    NdArray(match1.blob).value = random_np_array
    match2 = doc.matches.add()
    match2.tags['id'] = 30
    match2.buffer = buffer
    return doc
Example No. 21
def input_doc_with_chunks():
    doc = jina_pb2.DocumentProto()
    doc.tags['id'] = 1
    chunk0 = doc.chunks.add()
    chunk0.tags['id'] = 10
    chunk0.text = text
    NdArray(chunk0.embedding).value = random_np_array
    chunk1 = doc.chunks.add()
    chunk1.tags['id'] = 20
    NdArray(chunk1.blob).value = random_np_array
    chunk2 = doc.chunks.add()
    chunk2.tags['id'] = 30
    chunk2.buffer = buffer
    return doc
Example No. 22
def get_output(req):
    np.random.seed(rseed)

    err = 0
    for d in req.docs:
        recv = NdArray(d.embedding).value
        send = np.random.random([embed_dim])
        err += np.sum(np.abs(recv - send)) / embed_dim
        for c in d.chunks:
            recv = NdArray(c.embedding).value
            send = np.random.random([embed_dim])
            err += np.sum(np.abs(recv - send)) / embed_dim

    print(f'reconstruction error: {err / num_docs:.6f}')
Example No. 23
def test_request_generate_numpy_arrays():
    input_array = np.random.random([10, 10])

    req = request_generator(data=input_array, request_size=5)

    request = next(req)
    assert len(request.index.docs) == 5
    for index, doc in enumerate(request.index.docs, 1):
        assert NdArray(doc.blob).value.shape == (10,)

    request = next(req)
    assert len(request.index.docs) == 5
    for index, doc in enumerate(request.index.docs, 1):
        assert NdArray(doc.blob).value.shape == (10,)
Example No. 24
def doc_with_multimodal_chunks(embeddings):
    doc = jina_pb2.DocumentProto()
    chunk1 = doc.chunks.add()
    chunk2 = doc.chunks.add()
    chunk3 = doc.chunks.add()
    chunk1.modality = 'visual1'
    chunk2.modality = 'visual2'
    chunk3.modality = 'textual'
    chunk1.id = uid.new_doc_id(chunk1)
    chunk2.id = uid.new_doc_id(chunk2)
    chunk3.id = uid.new_doc_id(chunk3)
    NdArray(chunk1.embedding).value = embeddings[0]
    NdArray(chunk2.embedding).value = embeddings[1]
    NdArray(chunk3.embedding).value = embeddings[2]
    return doc
Example No. 25
def test_vectorsearch_driver_mock_indexer_with_matches_on_chunks():
    driver = SimpleKVSearchDriver(traversal_paths=('cm',))
    executor = MockIndexer()
    driver.attach(executor=executor, pea=None)
    doc = create_document_to_search_with_matches_on_chunks()

    driver._traverse_apply([doc])

    assert len(doc.chunks) == 1
    chunk = doc.chunks[0]
    assert len(chunk.matches) == 3
    for match in chunk.matches:
        assert NdArray(match.embedding).value is not None
        embedding_array = NdArray(match.embedding).value
        np.testing.assert_equal(embedding_array, np.array([int(match.id)]))
Example No. 26
def _apply_all(self,
               docs: Sequence['jina_pb2.DocumentProto'],
               context_doc: 'jina_pb2.DocumentProto',
               field: str,
               concatenate: bool = False,
               *args,
               **kwargs):
    doc = context_doc
    if concatenate:
        NdArray(doc.embedding).value = np.concatenate(
            self.doc_pointers[doc.id], axis=0)
    else:
        if doc.id not in self.doc_pointers:
            self.doc_pointers[doc.id] = [NdArray(doc.embedding).value]
        else:
            self.doc_pointers[doc.id].append(NdArray(doc.embedding).value)
Example No. 27
def validate_response(resp):
    is_callback_called._callback_called = True
    assert len(resp.index.docs) == NUM_DOCS
    for idx, doc in enumerate(resp.index.docs):
        np.testing.assert_almost_equal(
            NdArray(doc.embedding).value,
            np.array([idx, idx, idx, idx, idx, 3, 3, 4, 4]))
Example No. 28
def test_vectorsearch_driver_mock_indexer_with_matches_on_chunks(document_with_matches_on_chunks):
    driver = SimpleKVSearchDriver(traversal_paths=('cm',))
    executor = MockIndexer()
    driver.attach(executor=executor, runtime=None)

    driver._traverse_apply(DocumentSet([document_with_matches_on_chunks]))

    dcs = list(document_with_matches_on_chunks.chunks)
    assert len(dcs) == 1
    chunk = dcs[0]
    matches = list(chunk.matches)
    assert len(matches) == 3
    for match in matches:
        assert NdArray(match.embedding).value is not None
        embedding_array = NdArray(match.embedding).value
        np.testing.assert_equal(embedding_array, np.array([match.id]))
Example No. 29
def validate_response(resp):
    mock()
    assert len(resp.index.docs) == NUM_DOCS
    for idx, doc in enumerate(resp.index.docs):
        np.testing.assert_almost_equal(
            NdArray(doc.embedding).value,
            np.array([idx, idx, idx, idx, idx]))
Example No. 30
def test_as_blob_driver():
    docs = DocumentSet(random_docs(2))
    driver = MockPrediction2DocBlobDriver()
    driver._apply_all(docs)

    for d in docs:
        assert NdArray(d.blob).value.shape == (3, )