Esempio n. 1
0
 def index_documents():
     """Build three documents with sequential ids, descending dummy scores
     and one-element embeddings.

     Returns a list of DocumentProto:
         tag__id = '0', tag__dummy_score = 0,  embedding = [0]
         tag__id = '1', tag__dummy_score = -1, embedding = [1]
         tag__id = '2', tag__dummy_score = -2, embedding = [2]
     """
     documents = []
     for i in range(3):
         document = jina_pb2.DocumentProto()
         document.tags['id'] = str(i)
         document.tags['dummy_score'] = -i
         NdArray(document.embedding).value = np.array([i])
         documents.append(document)
     return documents
Esempio n. 2
0
def test_docgroundtruth_pair():
    """DocGroundtruthPair must pair up the chunks and matches of a doc with
    those of its groundtruth, preserving granularity/adjacency depth."""

    def _expand(proto: jina_pb2.DocumentProto, n):
        # add n matches one adjacency level deeper and n chunks one
        # granularity level deeper
        for _ in range(n):
            proto.matches.add().adjacency = proto.adjacency + 1
        for _ in range(n):
            proto.chunks.add().granularity = proto.granularity + 1

    doc = jina_pb2.DocumentProto()
    gt = jina_pb2.DocumentProto()
    _expand(doc, 3)
    _expand(gt, 3)

    pair = DocGroundtruthPair(doc, gt)

    chunk_count = 0
    for chunk_pair in pair.chunks:
        assert chunk_pair.doc.granularity == 1
        assert chunk_pair.groundtruth.granularity == 1
        chunk_count += 1

    match_count = 0
    for match_pair in pair.matches:
        assert match_pair.doc.adjacency == 1
        assert match_pair.groundtruth.adjacency == 1
        match_count += 1

    assert chunk_count == 3
    assert match_count == 3
Esempio n. 3
0
def test_flow_with_modalities(tmpdir):
    """End-to-end multimodal flow: segment each doc into two modality chunks,
    encode and index every modality in its own branch, then verify both the
    vector dumps and the key-value payloads written to disk.

    NOTE(review): the expected vectors (all-zero for mode1, all-one for
    mode2) and the comma split of the text mirror MockSegmenter / the
    mockencoder YAMLs — confirm against those definitions.
    """
    # workspace env var is consumed by the indexer YAMLs; removed at the end
    os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE'] = str(tmpdir)

    def input_fn():
        # three docs whose text carries both modalities; ids derive from content
        doc1 = jina_pb2.DocumentProto()
        doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
        doc1.id = uid.new_doc_id(doc1)

        doc2 = jina_pb2.DocumentProto()
        doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
        doc2.id = uid.new_doc_id(doc2)

        doc3 = jina_pb2.DocumentProto()
        doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
        doc3.id = uid.new_doc_id(doc3)

        return [doc1, doc2, doc3]

    # two parallel branches after the crafter, joined at the end:
    # crafter -> encoder1 -> indexer1 --\
    #        \-> encoder2 -> indexer2 --- join
    flow = Flow().add(name='crafter', uses='!MockSegmenter'). \
        add(name='encoder1', uses=str(cur_dir / 'yaml' / 'mockencoder-mode1.yml')). \
        add(name='indexer1', uses=str(cur_dir / 'yaml' / 'numpy-indexer-1.yml'), needs=['encoder1']). \
        add(name='encoder2', uses=str(cur_dir / 'yaml' / 'mockencoder-mode2.yml'), needs=['crafter']). \
        add(name='indexer2', uses=str(cur_dir / 'yaml' / 'numpy-indexer-2.yml')). \
        join(['indexer1', 'indexer2'])

    with flow:
        flow.index(input_fn=input_fn, override_doc_id=False)

    # branch 1 must have stored a 3x3 all-zero embedding matrix
    with open(tmpdir.join('vec1.gz'), 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(result, np.array([[0.0, 0.0, 0.0],
                                                  [0.0, 0.0, 0.0],
                                                  [0.0, 0.0, 0.0]]))

    # branch 2 must have stored a 3x3 all-one embedding matrix
    with open(tmpdir.join('vec2.gz'), 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(result, np.array([[1.0, 1.0, 1.0],
                                                  [1.0, 1.0, 1.0],
                                                  [1.0, 1.0, 1.0]]))

    # KV index of branch 1: chunk text is the part before the comma, modality 'mode1'
    chunkIndexer1 = BinaryPbIndexer.load(tmpdir.join('kvidx1.bin'))
    assert chunkIndexer1.size == 3
    d_id = list(chunkIndexer1.query_handler.header.keys())[0]

    query_doc = jina_pb2.DocumentProto()
    query_doc.ParseFromString(chunkIndexer1.query(d_id))
    assert query_doc.text == 'title: this is mode1 from doc1'
    assert query_doc.modality == 'mode1'

    # KV index of branch 2: chunk text is the part after the comma
    # (note the leading space), modality 'mode2'
    chunkIndexer2 = BinaryPbIndexer.load(tmpdir.join('kvidx2.bin'))
    assert chunkIndexer2.size == 3
    d_id = list(chunkIndexer2.query_handler.header.keys())[0]

    query_doc = jina_pb2.DocumentProto()
    query_doc.ParseFromString(chunkIndexer2.query(d_id))
    assert query_doc.text == ' body: this is mode2 from doc1'
    assert query_doc.modality == 'mode2'

    del os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE']
Esempio n. 4
0
def create_documents_to_craft():
    """Return two documents: one with text 'valid' and one with text 'invalid'."""
    docs = []
    for text in ('valid', 'invalid'):
        d = jina_pb2.DocumentProto()
        d.text = text
        docs.append(d)
    return docs
Esempio n. 5
0
def random_docs_to_chunk():
    """Yield two documents whose text can be split into chunks.

    NOTE(review): both documents carry tags['id'] = 1 (the second is not 2);
    kept as-is — confirm whether that duplication is intended.
    """
    for text in ('chunk1 chunk2', 'chunk3'):
        d = jina_pb2.DocumentProto()
        d.tags['id'] = 1
        d.text = text
        yield d
Esempio n. 6
0
def ground_truth_pairs():
    """Return 10 DocGroundtruthPair objects.

    Every doc embeds [1, 1] and every groundtruth embeds [2, 2].
    """
    result = []
    for _ in range(10):
        d = jina_pb2.DocumentProto()
        g = jina_pb2.DocumentProto()
        NdArray(d.embedding).value = np.array([1, 1])
        NdArray(g.embedding).value = np.array([2, 2])
        result.append(DocGroundtruthPair(doc=d, groundtruth=g))
    return result
Esempio n. 7
0
def random_docs_with_tags():
    """Yield two docs: text 'a' with tags id 1, and text 'b' with tags id 2.

    The redundant ``tags.update`` mirrors the direct item assignment so both
    ways of setting tags are exercised.
    """
    for doc_id, text in ((1, 'a'), (2, 'b')):
        d = jina_pb2.DocumentProto()
        d.tags['id'] = doc_id
        d.tags.update({'id': doc_id})
        d.text = text
        yield d
Esempio n. 8
0
    def input_fn():
        """Return three documents with a two-modality text body and
        content-derived ids."""
        docs = []
        for i in (1, 2, 3):
            d = jina_pb2.DocumentProto()
            d.text = f'title: this is mode1 from doc{i}, body: this is mode2 from doc{i}'
            d.id = uid.new_doc_id(d)
            docs.append(d)
        return docs
Esempio n. 9
0
    def input_fn():
        """Return three documents with a two-modality text body and explicit
        UniqueId ids 1..3."""
        docs = []
        for i in (1, 2, 3):
            d = jina_pb2.DocumentProto()
            d.text = f'title: this is mode1 from doc{i}, body: this is mode2 from doc{i}'
            d.id = UniqueId(i)
            docs.append(d)
        return docs
Esempio n. 10
0
    def input_function():
        """Return three documents with a two-modality text body and string
        ids '1'..'3'."""
        docs = []
        for i in (1, 2, 3):
            d = jina_pb2.DocumentProto()
            d.text = f'title: this is mode1 from doc{i}, body: this is mode2 from doc{i}'
            d.id = str(i)
            docs.append(d)
        return docs
Esempio n. 11
0
def index_groundtruth():
    """Build 97 groundtruth documents (ids 0..99 except 5, 10, 50).

    Every document carries tags['groundtruth'] = True and text 'aa'.
    Odd-length ids get a '0' prefix because odd-length ids are invalid,
    see https://github.com/jina-ai/jina/issues/1125.
    """
    without_groundtruth = {5, 10, 50}
    docs = []
    for idx in range(100):
        if idx in without_groundtruth:
            continue
        doc = jina_pb2.DocumentProto()
        doc_id = str(idx)
        if len(doc_id) % 2:
            doc_id = '0' + doc_id
        doc.id = doc_id
        doc.tags['groundtruth'] = True
        doc.text = 'aa'
        docs.append(doc)
    return docs
Esempio n. 12
0
def test_segment_driver():
    """SimpleSegmentDriver + MockSegmenter: one valid doc is segmented into
    three chunks whose metadata is verified field by field.

    NOTE(review): the expected chunk values (tags id 3/4/5, blobs of
    0s/1s/2s, weights 0/1/2, and the mime_type falling back to the parent's
    'image/png' for chunks 1 and 2) mirror MockSegmenter's output — confirm
    against its definition.
    """
    valid_doc = jina_pb2.DocumentProto()
    valid_doc.id = uid.new_doc_id(valid_doc)
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    driver = SimpleSegmentDriver()
    executor = MockSegmenter()
    driver.attach(executor=executor, pea=None)
    driver._apply_all([valid_doc])

    # segmenting must not alter the parent doc's own length
    assert valid_doc.length == 2

    # chunk 0: declares its own mime_type
    assert valid_doc.chunks[0].tags['id'] == 3
    assert valid_doc.chunks[0].parent_id == valid_doc.id
    np.testing.assert_equal(NdArray(valid_doc.chunks[0].blob).value, np.array([0.0, 0.0, 0.0]))
    assert valid_doc.chunks[0].weight == 0
    assert valid_doc.chunks[0].length == 3
    assert valid_doc.chunks[0].mime_type == 'text/plain'

    # chunk 1: inherits the parent's mime_type
    assert valid_doc.chunks[1].tags['id'] == 4
    assert valid_doc.chunks[1].parent_id == valid_doc.id
    np.testing.assert_equal(NdArray(valid_doc.chunks[1].blob).value, np.array([1.0, 1.0, 1.0]))
    assert valid_doc.chunks[1].weight == 1
    assert valid_doc.chunks[1].length == 3
    assert valid_doc.chunks[1].mime_type == 'image/png'

    # chunk 2: inherits the parent's mime_type
    assert valid_doc.chunks[2].tags['id'] == 5
    assert valid_doc.chunks[2].parent_id == valid_doc.id
    np.testing.assert_equal(NdArray(valid_doc.chunks[2].blob).value, np.array([2.0, 2.0, 2.0]))
    assert valid_doc.chunks[2].weight == 2
    assert valid_doc.chunks[2].length == 3
    assert valid_doc.chunks[2].mime_type == 'image/png'
Esempio n. 13
0
def random_docs(num_docs,
                chunks_per_doc=5,
                embed_dim=10,
                jitter=1) -> Iterator['DocumentProto']:
    """Yield ``num_docs`` fake documents, each with ``chunks_per_doc`` chunks.

    Embedding lengths vary in ``[embed_dim, embed_dim + jitter)``. Chunk tag
    ids start at ``3 * num_docs`` to avoid colliding with doc tag ids.

    :param num_docs: number of documents to yield
    :param chunks_per_doc: chunks added to every document
    :param embed_dim: base embedding dimensionality
    :param jitter: exclusive upper bound on random extra dimensions

    .. deprecated:: 0.7.11 use ``random_docs_new_api`` instead.
    """
    warnings.warn(
        'since 0.7.11 the introduce of Document primitive type, this '
        'fake-doc generator has been depreciated. Use "random_docs_new_api" instead',
        DeprecationWarning)
    c_id = 3 * num_docs  # avoid collision with docs
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.tags['id'] = j
        # fix: `text` is a protobuf string field; the original assigned the
        # bytes literal b'hello world' and relied on protobuf's implicit
        # UTF-8 decoding
        d.text = 'hello world'
        NdArray(d.embedding).value = np.random.random(
            [embed_dim + np.random.randint(0, jitter)])
        d.id = uid.new_doc_id(d)
        for k in range(chunks_per_doc):
            c = d.chunks.add()
            c.text = 'i\'m chunk %d from doc %d' % (c_id, j)
            NdArray(c.embedding).value = np.random.random(
                [embed_dim + np.random.randint(0, jitter)])
            c.tags['id'] = c_id
            c.tags['parent_id'] = j
            c_id += 1
            c.parent_id = d.id
            c.id = uid.new_doc_id(c)
        yield d
Esempio n. 14
0
def test_queryset_with_struct(random_workspace, mocker):
    """A flow-level FilterQL admits both labels; a per-request QueryLang with
    higher priority narrows the filter to label2 only.

    Docs alternate tags['label'] between 'label1' and 'label2', so the
    narrowed request must see exactly half of them.
    """
    total_docs = 4
    docs = []
    for doc_id in range(total_docs):
        doc = jina_pb2.DocumentProto()
        doc.text = f'I am doc{doc_id}'
        NdArray(doc.embedding).value = np.array([doc_id])
        # labels alternate: label1, label2, label1, label2
        doc.tags['label'] = f'label{doc_id % 2 + 1}'
        docs.append(doc)

    # flow-level filter keeps docs with either label (i.e. all of them)
    f = (Flow()
         .add(uses='- !FilterQL | {lookups: {tags__label__in: [label1, label2]}, traversal_paths: [r]}'))

    def validate_all_docs(resp):
        assert len(resp.docs) == total_docs

    def validate_label2_docs(resp):
        assert len(resp.docs) == total_docs / 2

    mock1 = mocker.Mock()
    mock2 = mocker.Mock()
    with f:
        # keep all the docs
        f.index(docs, on_done=mock1)
        # priority 1 overrides the flow-level QL: keep only the docs with label2
        qs = QueryLang({'name': 'FilterQL', 'priority': 1, 'parameters': {'lookups': {'tags__label': 'label2'}, 'traversal_paths': ['r']}})
        f.index(docs, queryset=qs, on_done=mock2)

    mock1.assert_called_once()
    validate_callback(mock1, validate_all_docs)
    mock2.assert_called_once()
    validate_callback(mock2, validate_label2_docs)
Esempio n. 15
0
def test_queryset_with_struct(random_workspace):
    """Older-API variant: a flow-level FilterQL admits both labels; a
    per-request QueryLangProto with higher priority narrows to label2 only.

    Docs alternate tags['label'] between 'label1' and 'label2', so the
    narrowed request must see exactly half of them.
    """
    total_docs = 4
    docs = []
    for doc_id in range(total_docs):
        doc = jina_pb2.DocumentProto()
        doc.text = f'I am doc{doc_id}'
        NdArray(doc.embedding).value = np.array([doc_id])
        # labels alternate: label1, label2, label1, label2
        doc.tags['label'] = f'label{doc_id % 2 + 1}'
        docs.append(doc)

    # flow-level filter keeps docs with either label (i.e. all of them)
    f = (Flow()
         .add(uses='- !FilterQL | {lookups: {tags__label__in: [label1, label2]}, traversal_paths: [r]}'))

    def validate_all_docs(resp):
        assert len(resp.docs) == total_docs

    def validate_label2_docs(resp):
        assert len(resp.docs) == total_docs / 2

    with f:
        # keep all the docs
        f.index(docs, output_fn=validate_all_docs, callback_on='body')

        # priority 1 overrides the flow-level QL: keep only the docs with label2
        # (dict/list values are assigned into the proto's Struct parameters)
        qs = jina_pb2.QueryLangProto(name='FilterQL', priority=1)
        qs.parameters['lookups'] = {'tags__label': 'label2'}
        qs.parameters['traversal_paths'] = ['r']
        f.index(docs, queryset=qs, output_fn=validate_label2_docs, callback_on='body')
Esempio n. 16
0
 def __init__(self, *args, **kwargs):
     """Pre-populate the in-memory db with groundtruth docs '01', '02', '04',
     keyed by the hash of their id (no entry for '03')."""
     super().__init__(*args, **kwargs)
     self.db = {}
     for doc_id in ('01', '02', '04'):
         doc = jina_pb2.DocumentProto()
         doc.id = doc_id
         doc.tags['groundtruth'] = True
         self.db[uid.id2hash(doc.id)] = doc.SerializeToString()
Esempio n. 17
0
def create_chunk_chunk_matches_to_score():
    """Build a doc -> chunk -> chunk-chunks -> matches hierarchy for ranker
    tests.

    Actual structure produced (NOTE(review): the original sketch listed match
    ids 11/12 and 21/22, but ``match.id = str(10 * parent_id + score_value)``
    yields 12/13 and 24/25):

    doc (id: 100, granularity: 0)
    |- chunk (id: 101, granularity: 1)
       |- chunk (id: 10, parent_id: 1, granularity: 2)
       |   |- match (id: 12, parent_id: 1, score.value: 2)
       |   |- match (id: 13, parent_id: 1, score.value: 3)
       |- chunk (id: 20, parent_id: 2, granularity: 2)
           |- match (id: 24, parent_id: 2, score.value: 4)
           |- match (id: 25, parent_id: 2, score.value: 5)
    """
    doc = jina_pb2.DocumentProto()
    doc.id = '100'
    doc.granularity = 0
    chunk = doc.chunks.add()
    chunk.id = '101'
    chunk.parent_id = doc.id
    chunk.granularity = doc.granularity + 1
    num_matches = 2
    for parent_id in range(1, 3):
        chunk_chunk = chunk.chunks.add()
        chunk_chunk.id = str(parent_id * 10)
        chunk_chunk.parent_id = str(parent_id)
        chunk_chunk.granularity = chunk.granularity + 1
        # score values: 2,3 for parent 1 and 4,5 for parent 2
        for score_value in range(parent_id * 2, parent_id * 2 + num_matches):
            match = chunk_chunk.matches.add()
            match.parent_id = str(parent_id)
            match.score.value = score_value
            # ref_id ties the score back to the chunk it was computed for
            match.score.ref_id = chunk_chunk.id
            match.id = str(10 * parent_id + score_value)
            match.length = 4
    return Document(doc)
Esempio n. 18
0
def create_document_to_score():
    """Build doc '1'*16 with two chunks ('2'*16, '3'*16), each with two matches.

    For chunk digit c and match offset m, the match digit is 2*c + m (4..7);
    match.id repeats that digit 16 times, parent_id repeats 10*digit 8 times,
    and length == score.value == digit. score.ref_id points back at the chunk
    (used by MaxRanker and MinRanker).
    """
    doc = jina_pb2.DocumentProto()
    doc.id = '1' * 16
    for chunk_digit in (2, 3):
        chunk = doc.chunks.add()
        chunk.id = str(chunk_digit) * 16
        for m in range(2):
            match_digit = 2 * chunk_digit + m
            match = chunk.matches.add()
            match.id = str(match_digit) * 16
            match.parent_id = str(10 * match_digit) * 8
            match.length = match_digit
            match.score.ref_id = chunk.id
            match.score.value = match_digit
    return Document(doc)
Esempio n. 19
0
def create_documents_to_encode(num_docs):
    """Return ``num_docs`` documents whose blob is the one-element array [idx]."""
    result = []
    for i in range(num_docs):
        d = jina_pb2.DocumentProto()
        NdArray(d.blob).value = np.array([i])
        result.append(d)
    return result
Esempio n. 20
0
def test_shelf_in_flow(uses):
    """Index 10k docs with 1000-dim embeddings, then run one search while
    printing memory usage at each stage.

    NOTE(review): `validate` only prints — nothing is asserted; the test's
    value is that indexing/searching completes and memory usage is logged.
    """
    m1 = used_memory()
    # shelve does not support embed > 1000??
    # _dbm.error: cannot add item to database
    # HASH: Out of overflow pages.  Increase page size
    docs = random_docs(10000, embed_dim=1000)
    f = Flow(callback_on='body').add(uses=os.path.join(cur_dir, uses))

    with f:
        f.index(docs)

    m2 = used_memory()
    d = jina_pb2.DocumentProto()  # empty query doc

    def validate(req):
        # m3 is a closure over the name bound in the `with` block below,
        # before search() invokes this callback
        m4 = used_memory()
        print(
            f'before: {m1}, after index: {m2}, after loading: {m3} after searching {m4}'
        )

    with f:
        m3 = used_memory()
        f.search([d], output_fn=validate)

    # clean up the workspace the indexer created in the current directory
    shutil.rmtree('test-workspace', ignore_errors=False, onerror=None)
Esempio n. 21
0
def multimodal_all_types_documents():
    """Return NUM_DOCS documents, each with four chunks covering every
    content type.

    Per document idx:
        chunk modality1 -> embedding [idx, idx]
        chunk modality2 -> blob [idx, idx, idx]
        chunk modality3 -> text 'modality3'    (encoder maps it to [3, 3])
        chunk modality4 -> buffer b'modality4' (encoder maps it to [4, 4])
    so the fused doc embedding becomes [idx, idx, idx, idx, idx, 3, 3, 4, 4].
    """
    docs = []
    for idx in range(NUM_DOCS):
        doc = jina_pb2.DocumentProto()
        doc.text = str(idx)

        chunk1 = doc.chunks.add()
        chunk1.modality = 'modality1'
        NdArray(chunk1.embedding).value = np.array([idx, idx])

        chunk2 = doc.chunks.add()
        chunk2.modality = 'modality2'
        NdArray(chunk2.blob).value = np.array([idx, idx, idx])

        chunk3 = doc.chunks.add()
        chunk3.modality = 'modality3'
        chunk3.text = 'modality3'

        chunk4 = doc.chunks.add()
        chunk4.modality = 'modality4'
        chunk4.buffer = b'modality4'

        docs.append(doc)
    return docs
Esempio n. 22
0
 def random_docs(num_docs):
     """Yield ``num_docs`` dummy docs numbered 1..num_docs.

     offset and tags['id'] are fixed at 1000 (the tag is expected to be
     ignored downstream); mime_type is the literal string 'mime_type'.
     """
     for n in range(1, num_docs + 1):
         doc = jina_pb2.DocumentProto()
         doc.text = f"i'm dummy doc {n}"
         doc.offset = 1000
         doc.tags['id'] = 1000  # this will be ignored
         doc.mime_type = 'mime_type'
         yield doc
Esempio n. 23
0
def random_docs(num_docs, embed_dim=10, jitter=1):
    """Yield ``num_docs`` docs with text 'hello' and a random embedding.

    Embedding length varies in ``[embed_dim, embed_dim + jitter)``.

    :param num_docs: number of documents to yield
    :param embed_dim: base embedding dimensionality
    :param jitter: exclusive upper bound on random extra dimensions
    """
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.tags['id'] = j
        # fix: `text` is a protobuf string field; the original assigned the
        # bytes literal b'hello' and relied on protobuf's implicit UTF-8 decode
        d.text = 'hello'
        NdArray(d.embedding).value = np.random.random(
            [embed_dim + np.random.randint(0, jitter)])
        yield d
Esempio n. 24
0
def test_lazy_append_access():
    """Appending to a request's lazy ``.docs`` counts as write access and must
    flip ``is_used``.

    NOTE(review): relies on ``_generate`` and ``random_docs`` defined
    elsewhere in this test module.
    """
    reqs = (Request(r.SerializeToString(), EnvelopeProto()) for r in _generate(random_docs(10)))
    for r in reqs:
        # freshly wrapped request has not been touched yet
        assert not r.is_used
        # write access via the lazy .docs view
        r.docs.append(jina_pb2.DocumentProto())
        # now it is marked as used
        assert r.is_used
Esempio n. 25
0
def random_queries(num_docs, chunks_per_doc=5):
    """Yield docs with UniqueId ids 0..num_docs-1, each with
    ``chunks_per_doc`` chunks.

    Chunk ids continue after the doc id range so they never collide with
    doc ids.
    """
    for doc_idx in range(num_docs):
        doc = jina_pb2.DocumentProto()
        doc.id = UniqueId(doc_idx)
        for chunk_idx in range(chunks_per_doc):
            chunk = doc.chunks.add()
            chunk.id = UniqueId(num_docs + doc_idx * chunks_per_doc + chunk_idx)
        yield doc
def ground_truth_pairs():
    """Return 10 doc/groundtruth pairs, each side carrying 10 scored matches.

    Match number idx gets tags['id'] = idx and score.value = idx.
    """
    num_docs = 10

    def _doc_with_matches():
        proto = jina_pb2.DocumentProto()
        for idx in range(num_docs):
            m = proto.matches.add()
            m.tags['id'] = idx
            m.score.value = idx
        return proto

    return [
        DocGroundtruthPair(doc=_doc_with_matches(), groundtruth=_doc_with_matches())
        for _ in range(num_docs)
    ]
Esempio n. 27
0
def random_queries(num_docs, chunks_per_doc=5):
    """Yield ``num_docs`` docs with content-derived ids, each carrying
    ``chunks_per_doc`` chunks whose ids are likewise content-derived."""
    for _ in range(num_docs):
        doc = jina_pb2.DocumentProto()
        doc.id = uid.new_doc_id(doc)
        for _ in range(chunks_per_doc):
            chunk = doc.chunks.add()
            chunk.id = uid.new_doc_id(chunk)
        yield doc
Esempio n. 28
0
 def create(self):
     """Build a groundtruth document whose populated field depends on
     ``field_type``, a free variable captured from the enclosing scope:
     'text' -> text 'aaaa'; 'buffer' -> 4 raw bytes; 'blob' -> ndarray of
     ones. Any other value returns an empty document.
     """
     gt = jina_pb2.DocumentProto()
     if field_type == 'text':
         gt.text = 'aaaa'
     elif field_type == 'buffer':
         # NOTE(review): the last escape is octal \04 (== \x04) while the
         # others are hex — confirm the inconsistency is intentional
         gt.buffer = b'\x01\x02\x03\04'
     elif field_type == 'blob':
         NdArray(gt.blob).value = np.array([1, 1, 1, 1])
     return gt
Esempio n. 29
0
def random_docs_with_chunks_and_matches(num_docs):
    """Build ``num_docs`` documents with a two-level chunk/match hierarchy.

    Per document: 10 chunks and 10 matches; every chunk carries 10 nested
    chunks and 10 matches; every chunk-match and every doc-match carries 10
    chunks of its own.

    doc |- chunk |- chunk
        |        |- chunk
        |        |- match | - chunk
        |                 | - chunk
        |        |- match
        |- chunk
        |- chunk
        |- match | - chunk
        |        | - chunk
    """
    docs = []
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.granularity = 0
        d.tags['id'] = j
        d.text = 'hello world'
        d.uri = 'doc://'
        for c in range(10):
            dc = d.chunks.add()
            dc.text = 'chunk to hello world'
            dc.granularity = d.granularity + 1
            dc.uri = 'doc://chunk'
            dc.tags['id'] = c
            for cc in range(10):
                dcc = dc.chunks.add()
                dcc.text = 'nested chunk to chunk'
                dcc.uri = 'doc://chunk/chunk'
                dcc.tags['id'] = cc
                dcc.granularity = dc.granularity + 1
            for m in range(10):
                cm = dc.matches.add()
                cm.text = 'match to chunk to hello-world'
                cm.uri = 'doc://chunk/match'
                cm.tags['id'] = m
                # matches live at the same granularity as what they match
                cm.granularity = dc.granularity
                for mc in range(10):
                    cmc = cm.chunks.add()
                    cmc.text = 'chunk to match to chunk to hello-world'
                    cmc.uri = 'doc://chunk/match/chunk'
                    cmc.tags['id'] = mc
                    cmc.granularity = cm.granularity + 1
        for m in range(10):
            dm = d.matches.add()
            dm.text = 'match to hello-world'
            dm.uri = 'doc://match'
            dm.tags['id'] = m
            dm.granularity = d.granularity
            for c in range(10):
                dmc = dm.chunks.add()
                dmc.text = 'chunk to match to hello-world'
                dmc.uri = 'doc://match/chunk'
                # fix: tag the chunk with its own index `c`; the original
                # reused the enclosing match index `m` (copy-paste slip —
                # every other nesting level tags with its own loop variable)
                dmc.tags['id'] = c
                dmc.granularity = dm.granularity + 1

        docs.append(d)
    return DocumentArray(docs)
Esempio n. 30
0
 def __init__(self, *args, **kwargs):
     """Pre-populate the in-memory db with docs '1'..'4', keyed by the
     integer id; each doc embeds the one-element array [int(id)]."""
     super().__init__(*args, **kwargs)
     self.db = {}
     for key in (1, 2, 3, 4):
         doc = jina_pb2.DocumentProto()
         doc.id = str(key)
         NdArray(doc.embedding).value = np.array([key])
         self.db[key] = doc.SerializeToString()