Exemple #1
0
def random_queries(num_docs, chunks_per_doc=5):
    """Lazily generate query documents that only carry ids.

    :param num_docs: number of documents to yield
    :param chunks_per_doc: number of chunks added to every document
    :return: generator of ``jina_pb2.DocumentProto`` objects
    """
    for _doc_idx in range(num_docs):
        query = jina_pb2.DocumentProto()
        # The document id is generated while the document has no chunks yet.
        query.id = uid.new_doc_id(query)
        for _chunk_idx in range(chunks_per_doc):
            chunk = query.chunks.add()
            chunk.id = uid.new_doc_id(chunk)
        yield query
Exemple #2
0
def input_fn():
    """Build two documents, each with an embedding and one embedded chunk.

    The embeddings ``e1`` .. ``e4`` are not defined here; they are looked up
    in an outer scope.

    :return: list holding the two documents
    """
    first = Document()
    GenericNdArray(first.embedding).value = e1
    first_chunk = first.chunks.add()
    GenericNdArray(first_chunk.embedding).value = e2
    # Here the chunk id is generated after the chunk embedding is set.
    first_chunk.id = uid.new_doc_id(first_chunk)

    second = Document()
    GenericNdArray(second.embedding).value = e3
    second_chunk = second.chunks.add()
    # Here the chunk id is generated before the chunk embedding is set.
    second_chunk.id = uid.new_doc_id(second_chunk)
    GenericNdArray(second_chunk.embedding).value = e4

    return [first, second]
Exemple #3
0
def input_fn():
    """Build two documents, each with an embedding and one embedded chunk.

    Embeddings are converted with ``array2pb`` and copied into the protobuf
    fields. The arrays ``e1`` .. ``e4`` are not defined here; they are looked
    up in an outer scope.

    :return: list holding the two documents
    """
    first = Document()
    first.embedding.CopyFrom(array2pb(e1))
    first_chunk = first.chunks.add()
    first_chunk.embedding.CopyFrom(array2pb(e2))
    # Here the chunk id is generated after the chunk embedding is set.
    first_chunk.id = uid.new_doc_id(first_chunk)

    second = Document()
    second.embedding.CopyFrom(array2pb(e3))
    second_chunk = second.chunks.add()
    # Here the chunk id is generated before the chunk embedding is set.
    second_chunk.id = uid.new_doc_id(second_chunk)
    second_chunk.embedding.CopyFrom(array2pb(e4))

    return [first, second]
    def input_fn():
        """Build three text-only documents, each with a generated id.

        :return: list of the three ``jina_pb2.Document`` objects, in order
        """
        texts = (
            'title: this is mode1 from doc1, body: this is mode2 from doc1',
            'title: this is mode1 from doc2, body: this is mode2 from doc2',
            'title: this is mode1 from doc3, body: this is mode2 from doc3',
        )

        docs = []
        for text in texts:
            doc = jina_pb2.Document()
            doc.text = text
            # The id is generated once the text is in place.
            doc.id = uid.new_doc_id(doc)
            docs.append(doc)

        return docs
def doc_with_multimodal_chunks(embeddings):
    """Create a document with three chunks, each of a different modality.

    :param embeddings: indexable object; items 0, 1 and 2 become the
        embeddings of the first, second and third chunk
    :return: the ``jina_pb2.DocumentProto`` holding the three chunks
    """
    modalities = ('visual1', 'visual2', 'textual')

    doc = jina_pb2.DocumentProto()
    chunks = [doc.chunks.add() for _ in modalities]

    for chunk, modality in zip(chunks, modalities):
        chunk.modality = modality

    # Ids are generated after the modalities are set and before any
    # embedding is attached.
    for chunk in chunks:
        chunk.id = uid.new_doc_id(chunk)

    for position, chunk in enumerate(chunks):
        NdArray(chunk.embedding).value = embeddings[position]

    return doc
Exemple #6
0
def doc_with_multimodal_chunks_wrong(embeddings):
    """Create a document with three chunks where two share a modality.

    The first two chunks are both tagged ``'visual'``; the third is
    ``'textual'``.

    :param embeddings: indexable object; items 0, 1 and 2 become the
        embeddings of the first, second and third chunk
    :return: the ``jina_pb2.Document`` holding the three chunks
    """
    modalities = ('visual', 'visual', 'textual')

    doc = jina_pb2.Document()
    chunks = [doc.chunks.add() for _ in modalities]

    for chunk, modality in zip(chunks, modalities):
        chunk.modality = modality

    # Ids are generated after the modalities are set and before any
    # embedding is attached.
    for chunk in chunks:
        chunk.id = uid.new_doc_id(chunk)

    for position, chunk in enumerate(chunks):
        GenericNdArray(chunk.embedding).value = embeddings[position]

    return doc
def doc_with_multimodal_chunks_wrong(embeddings):
    """Create a document with three chunks where two share a modality.

    The first two chunks are both tagged ``'visual'``; the third is
    ``'textual'``. Embeddings are converted with ``array2pb`` and copied
    into the chunks.

    :param embeddings: indexable object; items 0, 1 and 2 become the
        embeddings of the first, second and third chunk
    :return: the ``jina_pb2.Document`` holding the three chunks
    """
    modalities = ('visual', 'visual', 'textual')

    doc = jina_pb2.Document()
    chunks = [doc.chunks.add() for _ in modalities]

    for chunk, modality in zip(chunks, modalities):
        chunk.modality = modality

    # Ids are generated after the modalities are set and before any
    # embedding is attached.
    for chunk in chunks:
        chunk.id = uid.new_doc_id(chunk)

    for position, chunk in enumerate(chunks):
        chunk.embedding.CopyFrom(array2pb(embeddings[position]))

    return doc
Exemple #8
0
def test_segment_driver():
    """Apply the segment driver to a document and check the resulting chunks."""
    doc = jina_pb2.Document()
    doc.id = uid.new_doc_id(doc)
    doc.text = 'valid'
    doc.length = 2
    doc.mime_type = 'image/png'

    segment_driver = SimpleSegmentDriver()
    segment_driver.attach(executor=MockSegmenter(), pea=None)
    segment_driver._apply(doc)

    # The document's own length must still be 2 after the driver ran.
    assert doc.length == 2

    # Expected (tags['id'], blob values, weight, mime_type) of chunks 0, 1, 2.
    expected_chunks = [
        (3, [0.0, 0.0, 0.0], 0, 'text/plain'),
        (4, [1.0, 1.0, 1.0], 1, 'image/png'),
        (5, [2.0, 2.0, 2.0], 2, 'image/png'),
    ]
    for position, expected in enumerate(expected_chunks):
        tag_id, blob_values, weight, mime_type = expected
        chunk = doc.chunks[position]

        assert chunk.tags['id'] == tag_id
        assert chunk.parent_id == doc.id
        assert chunk.blob == array2pb(np.array(blob_values))
        assert chunk.weight == weight
        assert chunk.length == 3
        assert chunk.mime_type == mime_type
Exemple #9
0
def random_docs(num_docs, chunks_per_doc=5, embed_dim=10, jitter=1):
    """Lazily generate documents with random embeddings and text chunks.

    :param num_docs: number of documents to yield
    :param chunks_per_doc: number of chunks attached to every document
    :param embed_dim: base length of every random embedding
    :param jitter: exclusive upper bound of the random integer added to
        ``embed_dim`` for each embedding
    :return: generator of ``jina_pb2.DocumentProto`` objects
    """
    next_chunk_tag = 3 * num_docs  # avoid collision with docs
    for doc_idx in range(num_docs):
        doc = jina_pb2.DocumentProto()
        doc.tags['id'] = doc_idx
        doc.text = b'hello world'
        doc_dim = embed_dim + np.random.randint(0, jitter)
        NdArray(doc.embedding).value = np.random.random([doc_dim])
        # The document id is generated before any chunk is attached.
        doc.id = uid.new_doc_id(doc)

        for _chunk_idx in range(chunks_per_doc):
            chunk = doc.chunks.add()
            chunk.text = 'i\'m chunk %d from doc %d' % (next_chunk_tag, doc_idx)
            chunk_dim = embed_dim + np.random.randint(0, jitter)
            NdArray(chunk.embedding).value = np.random.random([chunk_dim])
            chunk.tags['id'] = next_chunk_tag
            chunk.tags['parent_id'] = doc_idx
            next_chunk_tag += 1
            chunk.parent_id = doc.id
            # The chunk id is generated last, once all other fields are set.
            chunk.id = uid.new_doc_id(chunk)

        yield doc
Exemple #10
0
def random_docs(num_docs, chunks_per_doc=5, embed_dim=10, jitter=1):
    """Lazily generate chunk-less documents with random embeddings.

    :param num_docs: number of documents to yield
    :param chunks_per_doc: unused here, since no chunks are created; kept so
        the signature stays compatible with existing callers
    :param embed_dim: base length of every random embedding
    :param jitter: exclusive upper bound of the random integer added to
        ``embed_dim`` for each embedding
    :return: generator of ``jina_pb2.Document`` objects
    """
    for doc_idx in range(num_docs):
        doc = jina_pb2.Document()
        doc.tags['id'] = doc_idx
        # NOTE(review): the text is assigned as bytes, not str - confirm the
        # protobuf field is meant to receive a bytes value here.
        doc.text = b'hello world doc id %d' % doc_idx
        embedding_dim = embed_dim + np.random.randint(0, jitter)
        doc.embedding.CopyFrom(array2pb(np.random.random([embedding_dim])))
        # The id is generated last, once tags, text and embedding are set.
        doc.id = uid.new_doc_id(doc)
        yield doc
Exemple #11
0
def test_broken_document():
    """Check that applying the segment driver to this document raises ``AttributeError``."""
    segment_driver = SimpleSegmentDriver()
    segment_driver.attach(executor=MockSegmenter(), pea=None)

    doc = jina_pb2.Document()
    doc.id = uid.new_doc_id(doc)
    doc.text = 'invalid'
    doc.length = 2

    # Sanity check on the fixture before handing it to the driver.
    assert doc.length == 2

    with pytest.raises(AttributeError):
        segment_driver._apply(doc)
def documents():
    """Build five documents whose texts are ``'1'`` through ``'5'``.

    :return: list of the five ``jina_pb2.Document`` objects, in order
    """
    result = []
    for number in range(1, 6):
        document = jina_pb2.Document()
        document.text = str(number)
        # The id is generated once the text is in place.
        document.id = uid.new_doc_id(document)
        result.append(document)

    return result