def random_docs(num_docs, chunks_per_doc=5, embed_dim=10, jitter=1) -> Iterator['DocumentProto']:
    """Yield ``num_docs`` fake documents, each carrying ``chunks_per_doc`` chunks.

    Deprecated: a ``DeprecationWarning`` is emitted when iteration starts
    (this is a generator, so nothing runs at call time).

    :param num_docs: number of documents to yield
    :param chunks_per_doc: number of chunks added to every document
    :param embed_dim: base length of every random embedding vector
    :param jitter: exclusive upper bound of the random number of extra
        dimensions added to ``embed_dim``; a value ``<= 0`` means no jitter
    :return: generator of ``DocumentProto`` messages
    """
    warnings.warn(
        'since 0.7.11 and the introduction of the Document primitive type, this '
        'fake-doc generator has been deprecated. Use "random_docs_new_api" instead',
        DeprecationWarning,
        stacklevel=2,  # attribute the warning to the code consuming the generator
    )

    def _random_embedding():
        # np.random.randint(0, jitter) raises ValueError when jitter <= 0,
        # so a non-positive jitter is treated as "no extra dimensions".
        extra = np.random.randint(0, jitter) if jitter > 0 else 0
        return np.random.random([embed_dim + extra])

    c_id = 3 * num_docs  # avoid collision with docs
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.tags['id'] = j
        d.text = b'hello world'
        NdArray(d.embedding).value = _random_embedding()
        # The document id is requested before any chunk is attached.
        d.id = uid.new_doc_id(d)
        for _ in range(chunks_per_doc):
            c = d.chunks.add()
            c.text = 'i\'m chunk %d from doc %d' % (c_id, j)
            NdArray(c.embedding).value = _random_embedding()
            c.tags['id'] = c_id
            c.tags['parent_id'] = j
            c_id += 1
            c.parent_id = d.id
            # The chunk id is requested last, once all other fields are set.
            c.id = uid.new_doc_id(c)
        yield d
def random_queries(num_docs, chunks_per_doc=5):
    """Yield ``num_docs`` query documents, each with ``chunks_per_doc`` chunks.

    Only ids are populated: every document and every chunk receives the
    value returned by ``uid.new_doc_id`` for that message.

    :param num_docs: number of query documents to yield
    :param chunks_per_doc: number of chunks added to every query document
    :return: generator of ``DocumentProto`` messages
    """
    for _ in range(num_docs):
        query = jina_pb2.DocumentProto()
        # The document id is requested before any chunk is attached.
        query.id = uid.new_doc_id(query)
        for _ in range(chunks_per_doc):
            chunk = query.chunks.add()
            chunk.id = uid.new_doc_id(chunk)
        yield query
def input_fn():
    """Return two documents, each with an embedding and one embedded chunk.

    The embedding values ``e1`` .. ``e4`` are not parameters; they are read
    from the surrounding scope.

    :return: list ``[doc1, doc2]`` of ``DocumentProto`` messages
    """
    first_doc = DocumentProto()
    NdArray(first_doc.embedding).value = e1
    first_chunk = first_doc.chunks.add()
    NdArray(first_chunk.embedding).value = e2
    # First chunk: id requested after its embedding is set.
    first_chunk.id = uid.new_doc_id(first_chunk)

    second_doc = DocumentProto()
    NdArray(second_doc.embedding).value = e3
    second_chunk = second_doc.chunks.add()
    # Second chunk: id requested before its embedding is set. The order is
    # kept exactly as written in case the id depends on the message content
    # (NOTE(review): confirm against uid.new_doc_id).
    second_chunk.id = uid.new_doc_id(second_chunk)
    NdArray(second_chunk.embedding).value = e4

    return [first_doc, second_doc]
def query_generator(image_paths, text_queries):
    """Yield one two-chunk query document per ``(image_path, text)`` pair.

    The first chunk has modality ``'image'`` and its ``buffer`` holds the raw
    bytes read from ``image_path``; the second chunk has modality ``'text'``
    and its ``text`` holds the query string. Pairs are formed with ``zip``,
    so surplus entries of the longer input are ignored.

    :param image_paths: iterable of paths to image files
    :param text_queries: iterable of query strings
    :return: generator of ``DocumentProto`` messages
    """
    for path, query_text in zip(image_paths, text_queries):
        document = jina_pb2.DocumentProto()
        image_chunk = document.chunks.add()
        text_chunk = document.chunks.add()

        for chunk, modality in ((image_chunk, 'image'), (text_chunk, 'text')):
            chunk.modality = modality
        # Ids are requested once modalities are set but before payloads are.
        for chunk in (image_chunk, text_chunk):
            chunk.id = uid.new_doc_id(chunk)

        with open(path, 'rb') as image_file:
            image_chunk.buffer = image_file.read()
        text_chunk.text = query_text
        yield document
def input_fn():
    """Return three text documents whose text has a ``title:`` and a ``body:`` part.

    Each document gets its text first and then the id returned by
    ``uid.new_doc_id`` for it.

    :return: list ``[doc1, doc2, doc3]`` of ``DocumentProto`` messages
    """
    texts = (
        'title: this is mode1 from doc1, body: this is mode2 from doc1',
        'title: this is mode1 from doc2, body: this is mode2 from doc2',
        'title: this is mode1 from doc3, body: this is mode2 from doc3',
    )
    docs = []
    for text in texts:
        doc = jina_pb2.DocumentProto()
        doc.text = text
        doc.id = uid.new_doc_id(doc)
        docs.append(doc)
    return docs
def doc_with_multimodal_chunks(embeddings):
    """Build a document with three chunks of modalities visual1, visual2, textual.

    :param embeddings: indexable whose entries ``0``, ``1`` and ``2`` become
        the embeddings of the first, second and third chunk respectively
    :return: the assembled ``DocumentProto``
    """
    modalities = ('visual1', 'visual2', 'textual')

    doc = jina_pb2.DocumentProto()
    chunks = [doc.chunks.add() for _ in modalities]

    for chunk, modality in zip(chunks, modalities):
        chunk.modality = modality
    # Ids are requested after modalities are set and before embeddings are.
    for chunk in chunks:
        chunk.id = uid.new_doc_id(chunk)
    for index, chunk in enumerate(chunks):
        NdArray(chunk.embedding).value = embeddings[index]

    return doc
def test_segment_driver():
    """Check the chunks ``SimpleSegmentDriver`` attaches to a valid document.

    After ``driver._apply_all`` the document's own ``length`` must still be 2
    and its first three chunks must carry the expected tags, parent id, blob,
    weight, length and mime type.
    """
    valid_doc = jina_pb2.DocumentProto()
    valid_doc.id = uid.new_doc_id(valid_doc)
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    driver = SimpleSegmentDriver()
    driver.attach(executor=MockSegmenter(), pea=None)
    driver._apply_all([valid_doc])

    assert valid_doc.length == 2

    # Only the first chunk is expected to be 'text/plain'; the other two are
    # expected to match the parent's 'image/png'.
    # NOTE(review): presumably set by MockSegmenter / inherited by the driver.
    expected_mime_types = ('text/plain', 'image/png', 'image/png')
    for position, expected_mime_type in enumerate(expected_mime_types):
        chunk = valid_doc.chunks[position]
        assert chunk.tags['id'] == position + 3
        assert chunk.parent_id == valid_doc.id
        np.testing.assert_equal(NdArray(chunk.blob).value,
                                np.array([float(position)] * 3))
        assert chunk.weight == position
        assert chunk.length == 3
        assert chunk.mime_type == expected_mime_type
def random_docs(num_docs, chunks_per_doc=5, embed_dim=10, jitter=1):
    """Yield ``num_docs`` fake documents, each carrying ``chunks_per_doc`` chunks.

    Every document and chunk gets a random embedding, an ``'id'`` tag and an
    id from ``uid.new_doc_id``; chunks additionally get a ``'parent_id'`` tag
    and the parent's id in ``parent_id``.

    :param num_docs: number of documents to yield
    :param chunks_per_doc: number of chunks added to every document
    :param embed_dim: base length of every random embedding vector
    :param jitter: exclusive upper bound of the random number of extra
        dimensions added to ``embed_dim``; a value ``<= 0`` means no jitter
    :return: generator of ``DocumentProto`` messages
    """

    def _random_embedding():
        # np.random.randint(0, jitter) raises ValueError when jitter <= 0,
        # so a non-positive jitter is treated as "no extra dimensions".
        extra = np.random.randint(0, jitter) if jitter > 0 else 0
        return np.random.random([embed_dim + extra])

    c_id = 3 * num_docs  # avoid collision with docs
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.tags['id'] = j
        d.text = b'hello world'
        NdArray(d.embedding).value = _random_embedding()
        # The document id is requested before any chunk is attached.
        d.id = uid.new_doc_id(d)
        for _ in range(chunks_per_doc):
            c = d.chunks.add()
            c.text = 'i\'m chunk %d from doc %d' % (c_id, j)
            NdArray(c.embedding).value = _random_embedding()
            c.tags['id'] = c_id
            c.tags['parent_id'] = j
            c_id += 1
            c.parent_id = d.id
            # The chunk id is requested last, once all other fields are set.
            c.id = uid.new_doc_id(c)
        yield d
def test_broken_document():
    """Expect ``SimpleSegmentDriver`` to raise ``AttributeError`` on a broken doc.

    The document here only has ``id``, ``text`` and ``length`` set.
    """
    invalid_doc = jina_pb2.DocumentProto()
    invalid_doc.id = uid.new_doc_id(invalid_doc)
    invalid_doc.text = 'invalid'
    invalid_doc.length = 2
    assert invalid_doc.length == 2

    driver = SimpleSegmentDriver()
    driver.attach(executor=MockSegmenter(), pea=None)

    with pytest.raises(AttributeError):
        driver._apply_all([invalid_doc])
def documents():
    """Return five documents whose texts are ``'1'`` through ``'5'``.

    Each document gets its text first and then the id returned by
    ``uid.new_doc_id`` for it.

    :return: list of five ``DocumentProto`` messages, in ascending text order
    """

    def _make_doc(text):
        doc = jina_pb2.DocumentProto()
        doc.text = text
        doc.id = uid.new_doc_id(doc)
        return doc

    return [_make_doc(str(number)) for number in range(1, 6)]