コード例 #1
0
def random_docs(num_docs,
                chunks_per_doc=5,
                embed_dim=10,
                jitter=1) -> Iterator['DocumentProto']:
    """Yield ``num_docs`` fake documents, each with randomly embedded chunks.

    Deprecated: a ``DeprecationWarning`` recommending ``random_docs_new_api``
    is emitted when the generator is first advanced.

    :param num_docs: number of documents to generate
    :param chunks_per_doc: number of chunks added to every document
    :param embed_dim: base length of every random embedding vector
    :param jitter: exclusive upper bound of the random integer added to
        ``embed_dim`` for each embedding (the default ``1`` always adds 0)
    :return: a generator of populated ``DocumentProto`` messages
    """
    warnings.warn(
        'since 0.7.11 the introduction of Document primitive type, this '
        'fake-doc generator has been deprecated. Use "random_docs_new_api" instead',
        DeprecationWarning,
        # Attribute the warning to the code consuming this generator instead
        # of to this helper, so users can locate the call they must migrate.
        stacklevel=2)
    c_id = 3 * num_docs  # avoid collision with docs
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.tags['id'] = j
        d.text = b'hello world'
        NdArray(d.embedding).value = np.random.random(
            [embed_dim + np.random.randint(0, jitter)])
        # NOTE(review): the id is generated after tags/text/embedding are set
        # and before any chunk is added; presumably it depends on the current
        # content, so this ordering is kept as-is -- confirm against uid.
        d.id = uid.new_doc_id(d)
        for _ in range(chunks_per_doc):
            c = d.chunks.add()
            c.text = 'i\'m chunk %d from doc %d' % (c_id, j)
            NdArray(c.embedding).value = np.random.random(
                [embed_dim + np.random.randint(0, jitter)])
            c.tags['id'] = c_id
            c.tags['parent_id'] = j
            c_id += 1
            c.parent_id = d.id
            # the chunk id is generated last, once every other field is set
            c.id = uid.new_doc_id(c)
        yield d
コード例 #2
0
def random_queries(num_docs, chunks_per_doc=5):
    """Yield ``num_docs`` query documents that carry only ids.

    Each yielded ``DocumentProto`` gets an id from ``uid.new_doc_id`` and
    ``chunks_per_doc`` chunks, each of which is likewise given only an id.

    :param num_docs: number of query documents to generate
    :param chunks_per_doc: number of chunks added to every query document
    """
    for _ in range(num_docs):
        query = jina_pb2.DocumentProto()
        # the document id is generated before any chunk is attached
        query.id = uid.new_doc_id(query)
        for _ in range(chunks_per_doc):
            chunk = query.chunks.add()
            chunk.id = uid.new_doc_id(chunk)
        yield query
コード例 #3
0
def input_fn():
    """Build two documents, each with an embedding and one embedded chunk.

    The embeddings ``e1`` .. ``e4`` are read from the enclosing scope:
    ``e1``/``e3`` go to the documents, ``e2``/``e4`` to their chunks.

    :return: list ``[doc1, doc2]`` of ``DocumentProto`` messages
    """
    first_doc = DocumentProto()
    NdArray(first_doc.embedding).value = e1
    first_chunk = first_doc.chunks.add()
    NdArray(first_chunk.embedding).value = e2
    # id generated after the chunk embedding has been written
    first_chunk.id = uid.new_doc_id(first_chunk)

    second_doc = DocumentProto()
    NdArray(second_doc.embedding).value = e3
    second_chunk = second_doc.chunks.add()
    # NOTE(review): unlike the first chunk, this id is generated *before* the
    # embedding is written; the order is kept exactly as found -- confirm
    # whether the asymmetry is intentional.
    second_chunk.id = uid.new_doc_id(second_chunk)
    NdArray(second_chunk.embedding).value = e4

    return [first_doc, second_doc]
コード例 #4
0
ファイル: app.py プロジェクト: saman-moeinsadat/examples
def query_generator(image_paths, text_queries):
    """Yield one two-chunk query document per (image path, text) pair.

    The first chunk has modality ``'image'`` and holds the raw bytes read
    from the image file; the second has modality ``'text'`` and holds the
    query text. Pairs are formed with ``zip``, so iteration stops at the
    shorter of the two inputs.

    :param image_paths: iterable of paths to image files, opened in binary mode
    :param text_queries: iterable of text queries paired with the images
    """
    for image_path, text in zip(image_paths, text_queries):
        query = jina_pb2.DocumentProto()
        image_chunk, text_chunk = query.chunks.add(), query.chunks.add()
        image_chunk.modality, text_chunk.modality = 'image', 'text'
        # ids are generated once the modality is set but before the payload
        for chunk in (image_chunk, text_chunk):
            chunk.id = uid.new_doc_id(chunk)
        with open(image_path, 'rb') as image_file:
            image_chunk.buffer = image_file.read()
        text_chunk.text = text
        yield query
コード例 #5
0
    def input_fn():
        """Return three text documents with 'title: ..., body: ...' content.

        Each document's id is generated from the document after its text
        has been set.

        :return: list of three ``DocumentProto`` messages (doc1, doc2, doc3)
        """
        texts = (
            'title: this is mode1 from doc1, body: this is mode2 from doc1',
            'title: this is mode1 from doc2, body: this is mode2 from doc2',
            'title: this is mode1 from doc3, body: this is mode2 from doc3',
        )
        docs = []
        for text in texts:
            doc = jina_pb2.DocumentProto()
            doc.text = text
            doc.id = uid.new_doc_id(doc)
            docs.append(doc)
        return docs
コード例 #6
0
def doc_with_multimodal_chunks(embeddings):
    """Build a document with three chunks of different modalities.

    The chunks are tagged ``'visual1'``, ``'visual2'`` and ``'textual'`` (in
    that order) and receive ``embeddings[0]``, ``embeddings[1]`` and
    ``embeddings[2]`` respectively.

    :param embeddings: indexable holding at least three embedding values
    :return: the populated ``DocumentProto``
    """
    doc = jina_pb2.DocumentProto()
    modalities = ('visual1', 'visual2', 'textual')
    chunks = [doc.chunks.add() for _ in modalities]

    for chunk, modality in zip(chunks, modalities):
        chunk.modality = modality

    # ids are generated after the modalities but before the embeddings
    for chunk in chunks:
        chunk.id = uid.new_doc_id(chunk)

    for position, chunk in enumerate(chunks):
        NdArray(chunk.embedding).value = embeddings[position]

    return doc
コード例 #7
0
ファイル: test_segmenter_driver.py プロジェクト: tyunist/jina
def test_segment_driver():
    """Check the chunks that the segment driver attaches to a valid document.

    After ``_apply_all`` the document's own ``length`` must still be 2, and
    its first three chunks must carry the expected tag id, parent id, blob,
    weight, length and mime type.
    """
    valid_doc = jina_pb2.DocumentProto()
    valid_doc.id = uid.new_doc_id(valid_doc)
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    segment_driver = SimpleSegmentDriver()
    segment_driver.attach(executor=MockSegmenter(), pea=None)
    segment_driver._apply_all([valid_doc])

    assert valid_doc.length == 2

    # one row per chunk: (tag id, blob values, weight, mime type)
    expected_chunks = [
        (3, [0.0, 0.0, 0.0], 0, 'text/plain'),
        (4, [1.0, 1.0, 1.0], 1, 'image/png'),
        (5, [2.0, 2.0, 2.0], 2, 'image/png'),
    ]
    for position, (tag_id, blob, weight, mime_type) in enumerate(expected_chunks):
        chunk = valid_doc.chunks[position]
        assert chunk.tags['id'] == tag_id
        assert chunk.parent_id == valid_doc.id
        np.testing.assert_equal(NdArray(chunk.blob).value, np.array(blob))
        assert chunk.weight == weight
        assert chunk.length == 3
        assert chunk.mime_type == mime_type
コード例 #8
0
ファイル: __init__.py プロジェクト: tyunist/jina
def random_docs(num_docs, chunks_per_doc=5, embed_dim=10, jitter=1):
    """Yield ``num_docs`` fake documents, each with randomly embedded chunks.

    :param num_docs: number of documents to generate
    :param chunks_per_doc: number of chunks added to every document
    :param embed_dim: base length of every random embedding vector
    :param jitter: exclusive upper bound of the random integer added to
        ``embed_dim`` for each embedding
    """
    # chunk tag ids start past the document tag ids to avoid collisions
    next_chunk_tag = 3 * num_docs
    for doc_idx in range(num_docs):
        doc = jina_pb2.DocumentProto()
        doc.tags['id'] = doc_idx
        doc.text = b'hello world'
        doc_dim = embed_dim + np.random.randint(0, jitter)
        NdArray(doc.embedding).value = np.random.random([doc_dim])
        # the document id is generated before any chunk is attached
        doc.id = uid.new_doc_id(doc)
        for _ in range(chunks_per_doc):
            chunk = doc.chunks.add()
            chunk.text = 'i\'m chunk %d from doc %d' % (next_chunk_tag, doc_idx)
            chunk_dim = embed_dim + np.random.randint(0, jitter)
            NdArray(chunk.embedding).value = np.random.random([chunk_dim])
            chunk.tags['id'] = next_chunk_tag
            chunk.tags['parent_id'] = doc_idx
            next_chunk_tag += 1
            chunk.parent_id = doc.id
            # the chunk id is generated last, once every other field is set
            chunk.id = uid.new_doc_id(chunk)
        yield doc
コード例 #9
0
ファイル: test_segmenter_driver.py プロジェクト: tyunist/jina
def test_broken_document():
    """Expect ``AttributeError`` when the driver is applied to this document.

    The document has an id, text and length but, unlike the document in the
    valid case, no mime type is set on it.
    """
    segment_driver = SimpleSegmentDriver()
    segment_driver.attach(executor=MockSegmenter(), pea=None)

    invalid_doc = jina_pb2.DocumentProto()
    # the id is generated while the document is still empty
    invalid_doc.id = uid.new_doc_id(invalid_doc)
    invalid_doc.text = 'invalid'
    invalid_doc.length = 2

    assert invalid_doc.length == 2

    with pytest.raises(AttributeError):
        segment_driver._apply_all([invalid_doc])
コード例 #10
0
ファイル: test_kv_index_driver.py プロジェクト: tyunist/jina
def documents():
    """Return five documents whose texts are ``'1'`` through ``'5'``.

    Each document's id is generated from the document after its text has
    been set.

    :return: list of five ``DocumentProto`` messages
    """
    result = []
    for number in range(1, 6):
        entry = jina_pb2.DocumentProto()
        entry.text = str(number)
        entry.id = uid.new_doc_id(entry)
        result.append(entry)
    return result