def random_queries(num_docs, chunks_per_doc=5):
    """Lazily generate ``num_docs`` query documents.

    Each yielded ``jina_pb2.DocumentProto`` receives an id from
    ``uid.new_doc_id`` and then ``chunks_per_doc`` chunks, every one of
    which is given its own id the same way. Nothing else is set on the
    documents or the chunks.

    :param num_docs: number of documents to yield
    :param chunks_per_doc: number of chunks added to every document
    """
    for _ in range(num_docs):
        query = jina_pb2.DocumentProto()
        # The document id is assigned while the document still has no chunks.
        query.id = uid.new_doc_id(query)
        for _ in range(chunks_per_doc):
            chunk = query.chunks.add()
            chunk.id = uid.new_doc_id(chunk)
        yield query
def input_fn():
    """Build two documents, each carrying an embedding and one chunk.

    The embeddings ``e1``..``e4`` are taken from the surrounding scope.
    Only the chunks receive an id; the two parent documents do not.

    :return: list ``[first, second]`` of the two documents
    """
    first = Document()
    GenericNdArray(first.embedding).value = e1
    first_chunk = first.chunks.add()
    # NOTE(review): this chunk gets its id AFTER its embedding is set,
    # whereas the second chunk below gets it BEFORE. The order is kept
    # as found in case uid.new_doc_id depends on the content - confirm.
    GenericNdArray(first_chunk.embedding).value = e2
    first_chunk.id = uid.new_doc_id(first_chunk)

    second = Document()
    GenericNdArray(second.embedding).value = e3
    second_chunk = second.chunks.add()
    second_chunk.id = uid.new_doc_id(second_chunk)
    GenericNdArray(second_chunk.embedding).value = e4

    return [first, second]
def input_fn():
    """Build two documents, each carrying an embedding and one chunk.

    The embeddings ``e1``..``e4`` are taken from the surrounding scope and
    converted with ``array2pb`` before being copied into the protobuf
    fields. Only the chunks receive an id; the parent documents do not.

    :return: list ``[first, second]`` of the two documents
    """
    first = Document()
    first.embedding.CopyFrom(array2pb(e1))
    first_chunk = first.chunks.add()
    # NOTE(review): this chunk gets its id AFTER its embedding is set,
    # whereas the second chunk below gets it BEFORE. The order is kept
    # as found in case uid.new_doc_id depends on the content - confirm.
    first_chunk.embedding.CopyFrom(array2pb(e2))
    first_chunk.id = uid.new_doc_id(first_chunk)

    second = Document()
    second.embedding.CopyFrom(array2pb(e3))
    second_chunk = second.chunks.add()
    second_chunk.id = uid.new_doc_id(second_chunk)
    second_chunk.embedding.CopyFrom(array2pb(e4))

    return [first, second]
def input_fn():
    """Build three text documents whose text names two modes.

    Every document is a ``jina_pb2.Document`` whose ``text`` holds a
    ``title: ..., body: ...`` string and whose id is assigned by
    ``uid.new_doc_id`` once the text has been set.

    :return: list of the three documents, in doc1, doc2, doc3 order
    """
    texts = (
        'title: this is mode1 from doc1, body: this is mode2 from doc1',
        'title: this is mode1 from doc2, body: this is mode2 from doc2',
        'title: this is mode1 from doc3, body: this is mode2 from doc3',
    )
    docs = []
    for text in texts:
        doc = jina_pb2.Document()
        doc.text = text
        # The id is assigned only after the text is in place.
        doc.id = uid.new_doc_id(doc)
        docs.append(doc)
    return docs
def doc_with_multimodal_chunks(embeddings):
    """Create a document with three chunks of three distinct modalities.

    The chunks are tagged ``'visual1'``, ``'visual2'`` and ``'textual'``
    and are filled with ``embeddings[0]``, ``embeddings[1]`` and
    ``embeddings[2]`` respectively.

    :param embeddings: indexable holding at least three embeddings
    :return: the populated ``jina_pb2.DocumentProto``
    """
    modalities = ('visual1', 'visual2', 'textual')
    doc = jina_pb2.DocumentProto()
    chunks = [doc.chunks.add() for _ in modalities]

    for chunk, modality in zip(chunks, modalities):
        chunk.modality = modality

    # Ids are assigned after the modalities and before the embeddings.
    for chunk in chunks:
        chunk.id = uid.new_doc_id(chunk)

    for position, chunk in enumerate(chunks):
        NdArray(chunk.embedding).value = embeddings[position]

    return doc
def doc_with_multimodal_chunks_wrong(embeddings):
    """Create a document whose first two chunks share one modality.

    The three chunks are tagged ``'visual'``, ``'visual'`` and
    ``'textual'`` and are filled with ``embeddings[0]``, ``embeddings[1]``
    and ``embeddings[2]`` respectively.

    :param embeddings: indexable holding at least three embeddings
    :return: the populated ``jina_pb2.Document``
    """
    modalities = ('visual', 'visual', 'textual')
    doc = jina_pb2.Document()
    chunks = [doc.chunks.add() for _ in modalities]

    for chunk, modality in zip(chunks, modalities):
        chunk.modality = modality

    # Ids are assigned after the modalities and before the embeddings.
    for chunk in chunks:
        chunk.id = uid.new_doc_id(chunk)

    for position, chunk in enumerate(chunks):
        GenericNdArray(chunk.embedding).value = embeddings[position]

    return doc
def doc_with_multimodal_chunks_wrong(embeddings):
    """Create a document whose first two chunks share one modality.

    The three chunks are tagged ``'visual'``, ``'visual'`` and
    ``'textual'``. ``embeddings[0]``..``embeddings[2]`` are converted with
    ``array2pb`` and copied into the matching chunk's embedding field.

    :param embeddings: indexable holding at least three embeddings
    :return: the populated ``jina_pb2.Document``
    """
    modalities = ('visual', 'visual', 'textual')
    doc = jina_pb2.Document()
    chunks = [doc.chunks.add() for _ in modalities]

    for chunk, modality in zip(chunks, modalities):
        chunk.modality = modality

    # Ids are assigned after the modalities and before the embeddings.
    for chunk in chunks:
        chunk.id = uid.new_doc_id(chunk)

    for position, chunk in enumerate(chunks):
        chunk.embedding.CopyFrom(array2pb(embeddings[position]))

    return doc
def test_segment_driver():
    """Check the chunks ``SimpleSegmentDriver`` attaches to a document.

    A document with text, length and mime type is run through the driver
    backed by ``MockSegmenter``. The document's own ``length`` must be
    unchanged afterwards, and its first three chunks must carry the
    expected tag id, parent id, blob, weight, length and mime type.
    """
    valid_doc = jina_pb2.Document()
    valid_doc.id = uid.new_doc_id(valid_doc)
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    segment_driver = SimpleSegmentDriver()
    segment_driver.attach(executor=MockSegmenter(), pea=None)
    segment_driver._apply(valid_doc)

    assert valid_doc.length == 2

    # One row per expected chunk: (tag id, blob values, weight, mime type).
    expected_chunks = (
        (3, [0.0, 0.0, 0.0], 0, 'text/plain'),
        (4, [1.0, 1.0, 1.0], 1, 'image/png'),
        (5, [2.0, 2.0, 2.0], 2, 'image/png'),
    )
    for position, expected in enumerate(expected_chunks):
        tag_id, blob_values, weight, mime_type = expected
        chunk = valid_doc.chunks[position]
        assert chunk.tags['id'] == tag_id
        assert chunk.parent_id == valid_doc.id
        assert chunk.blob == array2pb(np.array(blob_values))
        assert chunk.weight == weight
        assert chunk.length == 3
        assert chunk.mime_type == mime_type
def random_docs(num_docs, chunks_per_doc=5, embed_dim=10, jitter=1):
    """Lazily generate documents with random embeddings and chunks.

    Each document is tagged with its index, given a fixed text, a random
    embedding and an id. Every chunk gets a text naming its tag and its
    parent's index, a random embedding, ``id``/``parent_id`` tags, the
    parent's id as ``parent_id`` and finally its own id.

    :param num_docs: number of documents to yield
    :param chunks_per_doc: number of chunks added to every document
    :param embed_dim: base length of each random embedding
    :param jitter: exclusive upper bound of the random integer added to
        ``embed_dim`` for each embedding
    """
    # Chunk tags start past the document tags (0..num_docs-1) so the two
    # ranges never collide.
    next_chunk_tag = 3 * num_docs
    for doc_idx in range(num_docs):
        doc = jina_pb2.DocumentProto()
        doc.tags['id'] = doc_idx
        doc.text = b'hello world'
        doc_dim = embed_dim + np.random.randint(0, jitter)
        NdArray(doc.embedding).value = np.random.random([doc_dim])
        # The document id is assigned before any chunk is added.
        doc.id = uid.new_doc_id(doc)

        for _ in range(chunks_per_doc):
            chunk = doc.chunks.add()
            chunk.text = 'i\'m chunk %d from doc %d' % (next_chunk_tag, doc_idx)
            chunk_dim = embed_dim + np.random.randint(0, jitter)
            NdArray(chunk.embedding).value = np.random.random([chunk_dim])
            chunk.tags['id'] = next_chunk_tag
            chunk.tags['parent_id'] = doc_idx
            next_chunk_tag += 1
            chunk.parent_id = doc.id
            # The chunk id comes last, once every other field is set.
            chunk.id = uid.new_doc_id(chunk)

        yield doc
def random_docs(num_docs, chunks_per_doc=5, embed_dim=10, jitter=1):
    """Lazily generate chunk-less documents with random embeddings.

    Each yielded ``jina_pb2.Document`` is tagged with its index, given a
    text containing that index, a random embedding converted through
    ``array2pb``, and finally an id from ``uid.new_doc_id``.

    :param num_docs: number of documents to yield
    :param chunks_per_doc: not used by this variant, which creates no
        chunks; kept so existing callers keep working
    :param embed_dim: base length of each random embedding
    :param jitter: exclusive upper bound of the random integer added to
        ``embed_dim`` for each embedding
    """
    for j in range(num_docs):
        d = jina_pb2.Document()
        d.tags['id'] = j
        d.text = b'hello world doc id %d' % j
        # randint draws from [0, jitter), so the default jitter of 1
        # always adds 0 and the embedding has exactly embed_dim entries.
        dim = embed_dim + np.random.randint(0, jitter)
        d.embedding.CopyFrom(array2pb(np.random.random([dim])))
        # The id is assigned last, once tags, text and embedding are set.
        d.id = uid.new_doc_id(d)
        yield d
def test_broken_document():
    """Check that the segment driver rejects a document it cannot handle.

    A document with an id, text and length is passed to
    ``SimpleSegmentDriver._apply`` backed by ``MockSegmenter``; the call
    is expected to raise ``AttributeError``.
    """
    invalid_doc = jina_pb2.Document()
    invalid_doc.id = uid.new_doc_id(invalid_doc)
    invalid_doc.text = 'invalid'
    invalid_doc.length = 2
    assert invalid_doc.length == 2

    segment_driver = SimpleSegmentDriver()
    segment_driver.attach(executor=MockSegmenter(), pea=None)

    with pytest.raises(AttributeError):
        segment_driver._apply(invalid_doc)
def documents():
    """Return five documents whose texts are ``'1'`` through ``'5'``.

    Each ``jina_pb2.Document`` gets its text first and then an id from
    ``uid.new_doc_id``.

    :return: list of the five documents in ascending text order
    """
    result = []
    for number in range(1, 6):
        item = jina_pb2.Document()
        item.text = str(number)
        item.id = uid.new_doc_id(item)
        result.append(item)
    return result