def index_documents():
    """Index Documents:
        doc: tag__id = 0
             tag__dummy_score = 0
             embedding = 0
        doc: tag__id = 1
             tag__dummy_score = -1
             embedding = 1
        doc: tag__id = 2
             tag__dummy_score = -2
             embedding = 2
    """
    doc0 = jina_pb2.DocumentProto()
    doc0.tags['id'] = '0'
    doc0.tags['dummy_score'] = 0
    NdArray(doc0.embedding).value = np.array([0])

    doc1 = jina_pb2.DocumentProto()
    doc1.tags['id'] = '1'
    doc1.tags['dummy_score'] = -1
    NdArray(doc1.embedding).value = np.array([1])

    doc2 = jina_pb2.DocumentProto()
    doc2.tags['id'] = '2'
    doc2.tags['dummy_score'] = -2
    NdArray(doc2.embedding).value = np.array([2])

    return [doc0, doc1, doc2]

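# Illustrative sanity check (an assumption, not part of the original suite): the fixture
# above yields three documents whose tags follow the layout described in its docstring.
def test_index_documents_fixture_shape():
    docs = index_documents()
    assert len(docs) == 3
    assert docs[0].tags['id'] == '0'
    assert docs[2].tags['dummy_score'] == -2
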
def test_docgroundtruth_pair():
    def add_matches(doc: jina_pb2.DocumentProto, num_matches):
        for idx in range(num_matches):
            match = doc.matches.add()
            match.adjacency = doc.adjacency + 1

    def add_chunks(doc: jina_pb2.DocumentProto, num_chunks):
        for idx in range(num_chunks):
            chunk = doc.chunks.add()
            chunk.granularity = doc.granularity + 1

    doc = jina_pb2.DocumentProto()
    gt = jina_pb2.DocumentProto()
    add_matches(doc, 3)
    add_matches(gt, 3)
    add_chunks(doc, 3)
    add_chunks(gt, 3)

    pair = DocGroundtruthPair(doc, gt)

    j = 0
    for chunk_pair in pair.chunks:
        assert chunk_pair.doc.granularity == 1
        assert chunk_pair.groundtruth.granularity == 1
        j += 1

    k = 0
    for match_pair in pair.matches:
        assert match_pair.doc.adjacency == 1
        assert match_pair.groundtruth.adjacency == 1
        k += 1

    assert j == 3
    assert k == 3

def test_flow_with_modalities(tmpdir):
    os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE'] = str(tmpdir)

    def input_fn():
        doc1 = jina_pb2.DocumentProto()
        doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
        doc1.id = uid.new_doc_id(doc1)
        doc2 = jina_pb2.DocumentProto()
        doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
        doc2.id = uid.new_doc_id(doc2)
        doc3 = jina_pb2.DocumentProto()
        doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
        doc3.id = uid.new_doc_id(doc3)
        return [doc1, doc2, doc3]

    flow = Flow().add(name='crafter', uses='!MockSegmenter'). \
        add(name='encoder1', uses=str(cur_dir / 'yaml' / 'mockencoder-mode1.yml')). \
        add(name='indexer1', uses=str(cur_dir / 'yaml' / 'numpy-indexer-1.yml'), needs=['encoder1']). \
        add(name='encoder2', uses=str(cur_dir / 'yaml' / 'mockencoder-mode2.yml'), needs=['crafter']). \
        add(name='indexer2', uses=str(cur_dir / 'yaml' / 'numpy-indexer-2.yml')). \
        join(['indexer1', 'indexer2'])

    with flow:
        flow.index(input_fn=input_fn, override_doc_id=False)

    with open(tmpdir.join('vec1.gz'), 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(result, np.array([[0.0, 0.0, 0.0],
                                                  [0.0, 0.0, 0.0],
                                                  [0.0, 0.0, 0.0]]))

    with open(tmpdir.join('vec2.gz'), 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(result, np.array([[1.0, 1.0, 1.0],
                                                  [1.0, 1.0, 1.0],
                                                  [1.0, 1.0, 1.0]]))

    chunkIndexer1 = BinaryPbIndexer.load(tmpdir.join('kvidx1.bin'))
    assert chunkIndexer1.size == 3
    d_id = list(chunkIndexer1.query_handler.header.keys())[0]

    query_doc = jina_pb2.DocumentProto()
    query_doc.ParseFromString(chunkIndexer1.query(d_id))
    assert query_doc.text == 'title: this is mode1 from doc1'
    assert query_doc.modality == 'mode1'

    chunkIndexer2 = BinaryPbIndexer.load(tmpdir.join('kvidx2.bin'))
    assert chunkIndexer2.size == 3
    d_id = list(chunkIndexer2.query_handler.header.keys())[0]

    query_doc = jina_pb2.DocumentProto()
    query_doc.ParseFromString(chunkIndexer2.query(d_id))
    assert query_doc.text == ' body: this is mode2 from doc1'
    assert query_doc.modality == 'mode2'

    del os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE']

def create_documents_to_craft():
    doc1 = jina_pb2.DocumentProto()
    # doc1.id = 1
    doc1.text = 'valid'
    doc2 = jina_pb2.DocumentProto()
    # doc2.id = 2
    doc2.text = 'invalid'
    return [doc1, doc2]

def random_docs_to_chunk():
    d1 = jina_pb2.DocumentProto()
    d1.tags['id'] = 1
    d1.text = 'chunk1 chunk2'
    yield d1
    d2 = jina_pb2.DocumentProto()
    d2.tags['id'] = 1
    d2.text = 'chunk3'
    yield d2

def ground_truth_pairs():
    num_docs = 10
    pairs = []
    for idx in range(num_docs):
        doc = jina_pb2.DocumentProto()
        gt = jina_pb2.DocumentProto()
        NdArray(doc.embedding).value = np.array([1, 1])
        NdArray(gt.embedding).value = np.array([2, 2])
        pairs.append(DocGroundtruthPair(doc=doc, groundtruth=gt))
    return pairs

def random_docs_with_tags():
    d1 = jina_pb2.DocumentProto()
    d1.tags['id'] = 1
    d1.text = 'a'
    d1.tags.update({'id': 1})
    yield d1
    d2 = jina_pb2.DocumentProto()
    d2.tags['id'] = 2
    d2.tags.update({'id': 2})
    d2.text = 'b'
    yield d2

def input_fn():
    doc1 = jina_pb2.DocumentProto()
    doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
    doc1.id = uid.new_doc_id(doc1)
    doc2 = jina_pb2.DocumentProto()
    doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
    doc2.id = uid.new_doc_id(doc2)
    doc3 = jina_pb2.DocumentProto()
    doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
    doc3.id = uid.new_doc_id(doc3)
    return [doc1, doc2, doc3]

def input_fn():
    doc1 = jina_pb2.DocumentProto()
    doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
    doc1.id = UniqueId(1)
    doc2 = jina_pb2.DocumentProto()
    doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
    doc2.id = UniqueId(2)
    doc3 = jina_pb2.DocumentProto()
    doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
    doc3.id = UniqueId(3)
    return [doc1, doc2, doc3]

def input_function():
    doc1 = jina_pb2.DocumentProto()
    doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
    doc1.id = '1'
    doc2 = jina_pb2.DocumentProto()
    doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
    doc2.id = '2'
    doc3 = jina_pb2.DocumentProto()
    doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
    doc3.id = '3'
    return [doc1, doc2, doc3]

def index_groundtruth():
    """Index Groundtruth:
        doc: id = 00
             tag__groundtruth = True
             text = aa
        doc: id = 01
             tag__groundtruth = True
             text = aa
        doc: id = 02
             tag__groundtruth = True
             text = aa
        ... we will not have groundtruth for id 5, 10, 50
    """
    docs = []
    for idx in range(0, 100):
        doc = jina_pb2.DocumentProto()
        doc.id = f'{idx}'
        # Invalid ids if odd length https://github.com/jina-ai/jina/issues/1125
        if len(doc.id) % 2 != 0:
            doc.id = f'0{doc.id}'
        doc.tags['groundtruth'] = True
        doc.text = 'aa'
        if idx not in (5, 10, 50):
            docs.append(doc)
    return docs

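# Illustrative check (an assumption, not part of the original suite): range(0, 100)
# minus the three skipped ids 5, 10 and 50 leaves 97 documents, and every id is
# zero-padded to an even length as described in the docstring above.
def test_index_groundtruth_fixture_shape():
    docs = index_groundtruth()
    assert len(docs) == 97
    assert all(len(d.id) % 2 == 0 for d in docs)
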
def test_segment_driver():
    valid_doc = jina_pb2.DocumentProto()
    valid_doc.id = uid.new_doc_id(valid_doc)
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    driver = SimpleSegmentDriver()
    executor = MockSegmenter()
    driver.attach(executor=executor, pea=None)
    driver._apply_all([valid_doc])

    assert valid_doc.length == 2

    assert valid_doc.chunks[0].tags['id'] == 3
    assert valid_doc.chunks[0].parent_id == valid_doc.id
    np.testing.assert_equal(NdArray(valid_doc.chunks[0].blob).value, np.array([0.0, 0.0, 0.0]))
    assert valid_doc.chunks[0].weight == 0
    assert valid_doc.chunks[0].length == 3
    assert valid_doc.chunks[0].mime_type == 'text/plain'

    assert valid_doc.chunks[1].tags['id'] == 4
    assert valid_doc.chunks[1].parent_id == valid_doc.id
    np.testing.assert_equal(NdArray(valid_doc.chunks[1].blob).value, np.array([1.0, 1.0, 1.0]))
    assert valid_doc.chunks[1].weight == 1
    assert valid_doc.chunks[1].length == 3
    assert valid_doc.chunks[1].mime_type == 'image/png'

    assert valid_doc.chunks[2].tags['id'] == 5
    assert valid_doc.chunks[2].parent_id == valid_doc.id
    np.testing.assert_equal(NdArray(valid_doc.chunks[2].blob).value, np.array([2.0, 2.0, 2.0]))
    assert valid_doc.chunks[2].weight == 2
    assert valid_doc.chunks[2].length == 3
    assert valid_doc.chunks[2].mime_type == 'image/png'

def random_docs(num_docs, chunks_per_doc=5, embed_dim=10, jitter=1) -> Iterator['DocumentProto']:
    warnings.warn(
        'since 0.7.11 and the introduction of the Document primitive type, this '
        'fake-doc generator has been deprecated. Use "random_docs_new_api" instead',
        DeprecationWarning)
    c_id = 3 * num_docs  # avoid collision with docs
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.tags['id'] = j
        d.text = 'hello world'
        NdArray(d.embedding).value = np.random.random(
            [embed_dim + np.random.randint(0, jitter)])
        d.id = uid.new_doc_id(d)
        for k in range(chunks_per_doc):
            c = d.chunks.add()
            c.text = 'i\'m chunk %d from doc %d' % (c_id, j)
            NdArray(c.embedding).value = np.random.random(
                [embed_dim + np.random.randint(0, jitter)])
            c.tags['id'] = c_id
            c.tags['parent_id'] = j
            c_id += 1
            c.parent_id = d.id
            c.id = uid.new_doc_id(c)
        yield d

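# Hedged usage sketch (illustrative only, not part of the original suite): the generator
# above is lazy, so materialize it to inspect the fake documents and their chunks.
def test_random_docs_generator_shape():
    docs = list(random_docs(num_docs=2, chunks_per_doc=3, embed_dim=4, jitter=1))
    assert len(docs) == 2
    assert all(len(d.chunks) == 3 for d in docs)
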
def test_queryset_with_struct(random_workspace, mocker):
    total_docs = 4
    docs = []
    for doc_id in range(total_docs):
        doc = jina_pb2.DocumentProto()
        doc.text = f'I am doc{doc_id}'
        NdArray(doc.embedding).value = np.array([doc_id])
        doc.tags['label'] = f'label{doc_id % 2 + 1}'
        docs.append(doc)

    f = (Flow()
         .add(uses='- !FilterQL | {lookups: {tags__label__in: [label1, label2]}, traversal_paths: [r]}'))

    def validate_all_docs(resp):
        assert len(resp.docs) == total_docs

    def validate_label2_docs(resp):
        assert len(resp.docs) == total_docs / 2

    mock1 = mocker.Mock()
    mock2 = mocker.Mock()

    with f:
        # keep all the docs
        f.index(docs, on_done=mock1)
        # keep only the docs with label2
        qs = QueryLang({'name': 'FilterQL',
                        'priority': 1,
                        'parameters': {'lookups': {'tags__label': 'label2'},
                                       'traversal_paths': ['r']}})
        f.index(docs, queryset=qs, on_done=mock2)

    mock1.assert_called_once()
    validate_callback(mock1, validate_all_docs)
    mock2.assert_called_once()
    validate_callback(mock2, validate_label2_docs)

def test_queryset_with_struct(random_workspace):
    total_docs = 4
    docs = []
    for doc_id in range(total_docs):
        doc = jina_pb2.DocumentProto()
        doc.text = f'I am doc{doc_id}'
        NdArray(doc.embedding).value = np.array([doc_id])
        doc.tags['label'] = f'label{doc_id % 2 + 1}'
        docs.append(doc)

    f = (Flow()
         .add(uses='- !FilterQL | {lookups: {tags__label__in: [label1, label2]}, traversal_paths: [r]}'))

    def validate_all_docs(resp):
        assert len(resp.docs) == total_docs

    def validate_label2_docs(resp):
        assert len(resp.docs) == total_docs / 2

    with f:
        # keep all the docs
        f.index(docs, output_fn=validate_all_docs, callback_on='body')

        # keep only the docs with label2
        qs = jina_pb2.QueryLangProto(name='FilterQL', priority=1)
        qs.parameters['lookups'] = {'tags__label': 'label2'}
        qs.parameters['traversal_paths'] = ['r']
        f.index(docs, queryset=qs, output_fn=validate_label2_docs, callback_on='body')

def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    doc1 = jina_pb2.DocumentProto()
    doc1.id = '01'
    doc1.tags['groundtruth'] = True
    doc2 = jina_pb2.DocumentProto()
    doc2.id = '02'
    doc2.tags['groundtruth'] = True
    doc4 = jina_pb2.DocumentProto()
    doc4.id = '04'
    doc4.tags['groundtruth'] = True
    self.db = {
        uid.id2hash(doc1.id): doc1.SerializeToString(),
        uid.id2hash(doc2.id): doc2.SerializeToString(),
        uid.id2hash(doc4.id): doc4.SerializeToString()
    }

def create_chunk_chunk_matches_to_score():
    # doc: (id: 100, granularity=0)
    # |- chunk: (id: 101, granularity=1)
    #    |- chunks: (id: 10)
    #    |  |- matches: (id: 11, parent_id: 1, score.value: 2),
    #    |  |- matches: (id: 12, parent_id: 1, score.value: 3),
    #    |- chunks: (id: 20)
    #       |- matches: (id: 21, parent_id: 2, score.value: 4),
    #       |- matches: (id: 22, parent_id: 2, score.value: 5)
    doc = jina_pb2.DocumentProto()
    doc.id = '100'
    doc.granularity = 0
    chunk = doc.chunks.add()
    chunk.id = '101'
    chunk.parent_id = doc.id
    chunk.granularity = doc.granularity + 1
    num_matches = 2
    for parent_id in range(1, 3):
        chunk_chunk = chunk.chunks.add()
        chunk_chunk.id = str(parent_id * 10)
        chunk_chunk.parent_id = str(parent_id)
        chunk_chunk.granularity = chunk.granularity + 1
        for score_value in range(parent_id * 2, parent_id * 2 + num_matches):
            match = chunk_chunk.matches.add()
            match.parent_id = str(parent_id)
            match.score.value = score_value
            match.score.ref_id = chunk_chunk.id
            match.id = str(10 * parent_id + score_value)
            match.length = 4
    return Document(doc)

def create_document_to_score():
    # doc: 1
    # |- chunk: 2
    # |  |- matches: (id: 4, parent_id: 40, score.value: 4),
    # |  |- matches: (id: 5, parent_id: 50, score.value: 5),
    # |
    # |- chunk: 3
    #    |- matches: (id: 6, parent_id: 60, score.value: 6),
    #    |- matches: (id: 7, parent_id: 70, score.value: 7)
    doc = jina_pb2.DocumentProto()
    doc.id = '1' * 16
    for c in range(2):
        chunk = doc.chunks.add()
        chunk_id = str(c + 2)
        chunk.id = chunk_id * 16
        for m in range(2):
            match = chunk.matches.add()
            match_id = 2 * int(chunk_id) + m
            match.id = str(match_id) * 16
            parent_id = 10 * int(match_id)
            match.parent_id = str(parent_id) * 8
            match.length = int(match_id)
            # to be used by MaxRanker and MinRanker
            match.score.ref_id = chunk.id
            match.score.value = int(match_id)
    return Document(doc)

def create_documents_to_encode(num_docs):
    docs = []
    for idx in range(num_docs):
        doc = jina_pb2.DocumentProto()
        NdArray(doc.blob).value = np.array([idx])
        docs.append(doc)
    return docs

def test_shelf_in_flow(uses):
    m1 = used_memory()
    # shelve does not support embed > 1000??
    # _dbm.error: cannot add item to database
    # HASH: Out of overflow pages. Increase page size
    docs = random_docs(10000, embed_dim=1000)
    f = Flow(callback_on='body').add(uses=os.path.join(cur_dir, uses))

    with f:
        f.index(docs)

    m2 = used_memory()
    d = jina_pb2.DocumentProto()

    def validate(req):
        m4 = used_memory()
        print(
            f'before: {m1}, after index: {m2}, after loading: {m3} after searching {m4}'
        )

    with f:
        m3 = used_memory()
        f.search([d], output_fn=validate)

    shutil.rmtree('test-workspace', ignore_errors=False, onerror=None)

def multimodal_all_types_documents():
    docs = []
    for idx in range(0, NUM_DOCS):
        """
        doc - idx
            |
            | - chunk - embedding [idx, idx] - modality1
            | - chunk - blob [idx, idx, idx] - modality2
            | - chunk - text 'modality3' - modality3 -> Inside multimodal encoder will be encoded into [3, 3]
            | - chunk - buffer b'modality4' - modality4 -> Inside multimodal encoder will be encoded into [4, 4]
        Result:
            doc - idx - embedding [idx, idx, idx, idx, idx, 3, 3, 4, 4]
        """
        doc = jina_pb2.DocumentProto()
        doc.text = f'{idx}'
        for modality in ['modality1', 'modality2', 'modality3', 'modality4']:
            chunk = doc.chunks.add()
            chunk.modality = modality
            if modality == 'modality1':
                NdArray(chunk.embedding).value = np.array([idx, idx])
            elif modality == 'modality2':
                NdArray(chunk.blob).value = np.array([idx, idx, idx])
            elif modality == 'modality3':
                chunk.text = 'modality3'
            elif modality == 'modality4':
                chunk.buffer = 'modality4'.encode()
        docs.append(doc)
    return docs

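# Illustrative check (an assumption, not part of the original suite): each fixture
# document carries exactly one chunk per modality described in the comment block above.
def test_multimodal_fixture_has_four_modalities():
    docs = multimodal_all_types_documents()
    assert len(docs) == NUM_DOCS
    assert all(len(d.chunks) == 4 for d in docs)
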
def random_docs(num_docs):
    for j in range(1, num_docs + 1):
        doc = jina_pb2.DocumentProto()
        doc.text = f'i\'m dummy doc {j}'
        doc.offset = 1000
        doc.tags['id'] = 1000  # this will be ignored
        doc.mime_type = 'mime_type'
        yield doc

def random_docs(num_docs, embed_dim=10, jitter=1):
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.tags['id'] = j
        d.text = 'hello'
        NdArray(d.embedding).value = np.random.random(
            [embed_dim + np.random.randint(0, jitter)])
        yield d

def test_lazy_append_access():
    reqs = (Request(r.SerializeToString(), EnvelopeProto()) for r in _generate(random_docs(10)))
    for r in reqs:
        assert not r.is_used
        # write access r.train
        r.docs.append(jina_pb2.DocumentProto())
        # now it is read
        assert r.is_used

def random_queries(num_docs, chunks_per_doc=5):
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.id = UniqueId(j)
        for k in range(chunks_per_doc):
            dd = d.chunks.add()
            dd.id = UniqueId(num_docs + j * chunks_per_doc + k)
        yield d

def ground_truth_pairs():
    num_docs = 10

    def add_matches(doc: jina_pb2.DocumentProto, num_matches):
        for idx in range(num_matches):
            match = doc.matches.add()
            match.tags['id'] = idx
            match.score.value = idx

    pairs = []
    for idx in range(num_docs):
        doc = jina_pb2.DocumentProto()
        gt = jina_pb2.DocumentProto()
        add_matches(doc, num_docs)
        add_matches(gt, num_docs)
        pairs.append(DocGroundtruthPair(doc=doc, groundtruth=gt))
    return pairs

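# Hedged sanity check (assumption: DocGroundtruthPair exposes .doc and .groundtruth as
# used elsewhere in these tests): every pair carries ten matches on both sides.
def test_ground_truth_pairs_fixture_shape():
    pairs = ground_truth_pairs()
    assert len(pairs) == 10
    assert all(len(p.doc.matches) == 10 and len(p.groundtruth.matches) == 10 for p in pairs)
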
def random_queries(num_docs, chunks_per_doc=5):
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.id = uid.new_doc_id(d)
        for k in range(chunks_per_doc):
            dd = d.chunks.add()
            dd.id = uid.new_doc_id(dd)
            # dd.id = k + 1  # 1-indexed
        yield d

def create(self):
    gt = jina_pb2.DocumentProto()
    if field_type == 'text':
        gt.text = 'aaaa'
    elif field_type == 'buffer':
        gt.buffer = b'\x01\x02\x03\x04'
    elif field_type == 'blob':
        NdArray(gt.blob).value = np.array([1, 1, 1, 1])
    return gt

def random_docs_with_chunks_and_matches(num_docs):
    # doc |- chunk |- chunk
    #     |        |- chunk
    #     |        |- match | - chunk
    #     |                 | - chunk
    #     |        |- match
    #     |- chunk
    #     |- chunk
    #     |- match | - chunk
    #              | - chunk
    docs = []
    for j in range(num_docs):
        d = jina_pb2.DocumentProto()
        d.granularity = 0
        d.tags['id'] = j
        d.text = 'hello world'
        d.uri = 'doc://'
        for c in range(10):
            dc = d.chunks.add()
            dc.text = 'chunk to hello world'
            dc.granularity = d.granularity + 1
            dc.uri = 'doc://chunk'
            dc.tags['id'] = c
            for cc in range(10):
                dcc = dc.chunks.add()
                dcc.text = 'nested chunk to chunk'
                dcc.uri = 'doc://chunk/chunk'
                dcc.tags['id'] = cc
                dcc.granularity = dc.granularity + 1
            for m in range(10):
                cm = dc.matches.add()
                cm.text = 'match to chunk to hello-world'
                cm.uri = 'doc://chunk/match'
                cm.tags['id'] = m
                cm.granularity = dc.granularity
                for mc in range(10):
                    cmc = cm.chunks.add()
                    cmc.text = 'chunk to match to chunk to hello-world'
                    cmc.uri = 'doc://chunk/match/chunk'
                    cmc.tags['id'] = mc
                    cmc.granularity = cm.granularity + 1
        for m in range(10):
            dm = d.matches.add()
            dm.text = 'match to hello-world'
            dm.uri = 'doc://match'
            dm.tags['id'] = m
            dm.granularity = d.granularity
            for c in range(10):
                dmc = dm.chunks.add()
                dmc.text = 'chunk to match to hello-world'
                dmc.uri = 'doc://match/chunk'
                dmc.tags['id'] = m
                dmc.granularity = dm.granularity + 1
        docs.append(d)
    return DocumentArray(docs)

def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    doc1 = jina_pb2.DocumentProto()
    doc1.id = '1'
    NdArray(doc1.embedding).value = np.array([int(doc1.id)])
    doc2 = jina_pb2.DocumentProto()
    doc2.id = '2'
    NdArray(doc2.embedding).value = np.array([int(doc2.id)])
    doc3 = jina_pb2.DocumentProto()
    doc3.id = '3'
    NdArray(doc3.embedding).value = np.array([int(doc3.id)])
    doc4 = jina_pb2.DocumentProto()
    doc4.id = '4'
    NdArray(doc4.embedding).value = np.array([int(doc4.id)])
    self.db = {
        1: doc1.SerializeToString(),
        2: doc2.SerializeToString(),
        3: doc3.SerializeToString(),
        4: doc4.SerializeToString()
    }