def index_documents(): """Index Documents: doc: tag__id = 0 tag__dummy_score = 0 embedding = 0 doc: tag__id = 1 tag__dummy_score = -1 embedding = 1 doc: tag__id = 2 tag__dummy_score = -2 embedding = 2 """ doc0 = jina_pb2.Document() doc0.tags['id'] = '0' doc0.tags['dummy_score'] = 0 doc0.embedding.CopyFrom(array2pb(np.array([0]))) doc1 = jina_pb2.Document() doc1.tags['id'] = '1' doc1.tags['dummy_score'] = -1 doc1.embedding.CopyFrom(array2pb(np.array([1]))) doc2 = jina_pb2.Document() doc2.tags['id'] = '2' doc2.tags['dummy_score'] = -2 doc2.embedding.CopyFrom(array2pb(np.array([2]))) return [doc0, doc1, doc2]
def index_documents(): """Index Documents: doc: tag__id = 0 tag__dummy_score = 0 embedding = 0 doc: tag__id = 1 tag__dummy_score = -1 embedding = 1 doc: tag__id = 2 tag__dummy_score = -2 embedding = 2 """ doc0 = jina_pb2.Document() doc0.tags['id'] = '0' doc0.tags['dummy_score'] = 0 GenericNdArray(doc0.embedding).value = np.array([0]) doc1 = jina_pb2.Document() doc1.tags['id'] = '1' doc1.tags['dummy_score'] = -1 GenericNdArray(doc1.embedding).value = np.array([1]) doc2 = jina_pb2.Document() doc2.tags['id'] = '2' doc2.tags['dummy_score'] = -2 GenericNdArray(doc2.embedding).value = np.array([2]) return [doc0, doc1, doc2]
def input_index_data(num_docs=None, batch_size=8, dataset='f30k'):
    from dataset import get_data_loader
    # renamed from `captions` to avoid shadowing by the loop variable below
    captions_file = 'dataset_flickr30k.json' if dataset == 'f30k' else 'captions.txt'
    data_loader = get_data_loader(root=os.path.join(cur_dir, f'data/{dataset}/images'),
                                  captions=os.path.join(cur_dir, f'data/{dataset}/{captions_file}'),
                                  split='test',
                                  batch_size=batch_size,
                                  dataset_type=dataset)
    for i, (images, captions) in enumerate(data_loader):
        for image in images:
            document = jina_pb2.Document()
            document.buffer = image
            document.modality = 'image'
            document.mime_type = 'image/jpeg'
            yield document
        for caption in captions:
            document = jina_pb2.Document()
            document.text = caption
            document.modality = 'text'
            document.mime_type = 'text/plain'  # fixed: was the invalid 'plain/text'
            yield document
        if num_docs and (i + 1) * batch_size >= num_docs:
            break
def test_docgroundtruth_pair():
    def add_matches(doc: jina_pb2.Document, num_matches):
        for idx in range(num_matches):
            match = doc.matches.add()
            match.adjacency = doc.adjacency + 1

    def add_chunks(doc: jina_pb2.Document, num_chunks):
        for idx in range(num_chunks):
            chunk = doc.chunks.add()
            chunk.granularity = doc.granularity + 1

    doc = jina_pb2.Document()
    gt = jina_pb2.Document()
    add_matches(doc, 3)
    add_matches(gt, 3)
    add_chunks(doc, 3)
    add_chunks(gt, 3)

    pair = DocGroundtruthPair(doc, gt)

    j = 0
    for chunk_pair in pair.chunks:
        assert chunk_pair.doc.granularity == 1
        assert chunk_pair.groundtruth.granularity == 1
        j += 1

    k = 0
    for match_pair in pair.matches:
        assert match_pair.doc.adjacency == 1
        assert match_pair.groundtruth.adjacency == 1
        k += 1

    assert j == 3
    assert k == 3
def test_flow_with_modalities(tmpdir):
    os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE'] = str(tmpdir)

    def input_fn():
        doc1 = jina_pb2.Document()
        doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
        doc1.id = uid.new_doc_id(doc1)
        doc2 = jina_pb2.Document()
        doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
        doc2.id = uid.new_doc_id(doc2)
        doc3 = jina_pb2.Document()
        doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
        doc3.id = uid.new_doc_id(doc3)
        return [doc1, doc2, doc3]

    flow = Flow().add(name='crafter', uses='!MockSegmenter'). \
        add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml')). \
        add(name='indexer1', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'), needs=['encoder1']). \
        add(name='encoder2', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'), needs=['crafter']). \
        add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml')). \
        join(['indexer1', 'indexer2'])

    with flow:
        flow.index(input_fn=input_fn, override_doc_id=False)

    with open(tmpdir.join('vec1.gz'), 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(result, np.array([[0.0, 0.0, 0.0],
                                                  [0.0, 0.0, 0.0],
                                                  [0.0, 0.0, 0.0]]))

    with open(tmpdir.join('vec2.gz'), 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(result, np.array([[1.0, 1.0, 1.0],
                                                  [1.0, 1.0, 1.0],
                                                  [1.0, 1.0, 1.0]]))

    chunk_indexer1 = BinaryPbIndexer.load(tmpdir.join('kvidx1.bin'))
    assert chunk_indexer1.size == 3
    d_id = list(chunk_indexer1.query_handler.header.keys())[0]
    query_doc = jina_pb2.Document()
    query_doc.ParseFromString(chunk_indexer1.query(d_id))
    assert query_doc.text == 'title: this is mode1 from doc1'
    assert query_doc.modality == 'mode1'

    chunk_indexer2 = BinaryPbIndexer.load(tmpdir.join('kvidx2.bin'))
    assert chunk_indexer2.size == 3
    d_id = list(chunk_indexer2.query_handler.header.keys())[0]
    query_doc = jina_pb2.Document()
    query_doc.ParseFromString(chunk_indexer2.query(d_id))
    assert query_doc.text == ' body: this is mode2 from doc1'
    assert query_doc.modality == 'mode2'

    del os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE']
def create_documents_to_craft():
    doc1 = jina_pb2.Document()
    doc1.id = 1
    doc1.text = 'valid'
    doc2 = jina_pb2.Document()
    doc2.id = 2
    doc2.text = 'invalid'
    return [doc1, doc2]
def random_docs_to_chunk():
    d1 = jina_pb2.Document()
    d1.id = 1
    d1.text = 'chunk1 chunk2'
    yield d1
    d2 = jina_pb2.Document()
    d2.id = 1
    d2.text = 'chunk3'
    yield d2
def random_docs_with_chunks(num_docs):
    d1 = jina_pb2.Document()
    d1.id = 1
    d1.text = 'chunk1 chunk2'
    yield d1
    d2 = jina_pb2.Document()
    d2.id = 1
    d2.text = 'chunk3'
    yield d2
def ground_truth_pairs():
    num_docs = 10
    pairs = []
    for idx in range(num_docs):
        doc = jina_pb2.Document()
        gt = jina_pb2.Document()
        doc.embedding.CopyFrom(array2pb(np.array([1, 1])))
        gt.embedding.CopyFrom(array2pb(np.array([2, 2])))
        pairs.append(DocGroundtruthPair(doc=doc, groundtruth=gt))
    return pairs
def create_documents_to_segment():
    doc1 = jina_pb2.Document()
    doc1.id = 1
    doc1.text = 'valid'
    doc1.length = 2
    doc2 = jina_pb2.Document()
    doc2.id = 2
    doc2.text = 'invalid'
    doc2.length = 2
    return [doc1, doc2]
def ground_truth_pairs():
    num_docs = 10
    pairs = []
    for idx in range(num_docs):
        doc = jina_pb2.Document()
        gt = jina_pb2.Document()
        GenericNdArray(doc.embedding).value = np.array([1, 1])
        GenericNdArray(gt.embedding).value = np.array([2, 2])
        pairs.append(DocGroundtruthPair(doc=doc, groundtruth=gt))
    return pairs
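# Sketch: reading an embedding back through the same wrapper used to write it.
# This assumes GenericNdArray exposes a symmetric `.value` getter, which the
# setter usage above suggests but does not guarantee.
pairs = ground_truth_pairs()
assert len(pairs) == 10
np.testing.assert_array_equal(GenericNdArray(pairs[0].doc.embedding).value, np.array([1, 1]))
np.testing.assert_array_equal(GenericNdArray(pairs[0].groundtruth.embedding).value, np.array([2, 2]))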
def random_docs_with_tags():
    d1 = jina_pb2.Document()
    d1.id = 1
    d1.text = 'a'
    d1.tags.update({'id': 1})
    yield d1
    d2 = jina_pb2.Document()
    d2.id = 2
    d2.tags.update({'id': 2})
    d2.text = 'b'
    yield d2
def create_document_ground_truth_pairs(num_docs):
    def add_matches(doc: jina_pb2.Document, num_matches):
        for idx in range(num_matches):
            match = doc.matches.add()
            match.tags['id'] = idx

    pairs = []
    for idx in range(num_docs):
        doc = jina_pb2.Document()
        gt = jina_pb2.Document()
        add_matches(doc, num_docs)
        add_matches(gt, num_docs)
        pairs.append(DocGroundtruthPair(doc=doc, groundtruth=gt))
    return pairs
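# Minimal consumption sketch (illustrative only, not from the original tests):
# walk each pair and compare the match ids stored in the tags on both sides.
for pair in create_document_ground_truth_pairs(2):
    for m_doc, m_gt in zip(pair.doc.matches, pair.groundtruth.matches):
        assert m_doc.tags['id'] == m_gt.tags['id']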
def input_fn():
    doc1 = jina_pb2.Document()
    doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
    doc1.id = uid.new_doc_id(doc1)
    doc2 = jina_pb2.Document()
    doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
    doc2.id = uid.new_doc_id(doc2)
    doc3 = jina_pb2.Document()
    doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
    doc3.id = uid.new_doc_id(doc3)
    return [doc1, doc2, doc3]
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    doc1 = jina_pb2.Document()
    doc1.id = '1'
    doc1.embedding.CopyFrom(array2pb(np.array([int(doc1.id)])))
    doc2 = jina_pb2.Document()
    doc2.id = '2'
    doc2.embedding.CopyFrom(array2pb(np.array([int(doc2.id)])))
    doc3 = jina_pb2.Document()
    doc3.id = '3'
    doc3.embedding.CopyFrom(array2pb(np.array([int(doc3.id)])))
    doc4 = jina_pb2.Document()
    doc4.id = '4'
    doc4.embedding.CopyFrom(array2pb(np.array([int(doc4.id)])))
    self.db = {1: doc1, 2: doc2, 3: doc3, 4: doc4}
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    doc1 = jina_pb2.Document()
    doc1.id = 1
    doc1.embedding.CopyFrom(array2pb(np.array([doc1.id])))
    doc2 = jina_pb2.Document()
    doc2.id = 2
    doc2.embedding.CopyFrom(array2pb(np.array([doc2.id])))
    doc3 = jina_pb2.Document()
    doc3.id = 3
    doc3.embedding.CopyFrom(array2pb(np.array([doc3.id])))
    doc4 = jina_pb2.Document()
    doc4.id = 4
    doc4.embedding.CopyFrom(array2pb(np.array([doc4.id])))
    self.db = {1: doc1, 2: doc2, 3: doc3, 4: doc4}
def test_segment_driver():
    valid_doc = jina_pb2.Document()
    valid_doc.id = 1
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    driver = SimpleSegmentDriver(first_chunk_id=3)
    executor = MockSegmenter()
    driver.attach(executor=executor, pea=None)
    driver._apply(valid_doc)

    assert valid_doc.length == 2

    assert valid_doc.chunks[0].id == 3
    assert valid_doc.chunks[0].parent_id == valid_doc.id
    assert valid_doc.chunks[0].blob == array2pb(np.array([0.0, 0.0, 0.0]))
    assert valid_doc.chunks[0].weight == 0
    assert valid_doc.chunks[0].length == 3
    assert valid_doc.chunks[0].mime_type == 'text/plain'

    assert valid_doc.chunks[1].id == 4
    assert valid_doc.chunks[1].parent_id == valid_doc.id
    assert valid_doc.chunks[1].blob == array2pb(np.array([1.0, 1.0, 1.0]))
    assert valid_doc.chunks[1].weight == 1
    assert valid_doc.chunks[1].length == 3
    assert valid_doc.chunks[1].mime_type == 'image/png'

    assert valid_doc.chunks[2].id == 5
    assert valid_doc.chunks[2].parent_id == valid_doc.id
    assert valid_doc.chunks[2].blob == array2pb(np.array([2.0, 2.0, 2.0]))
    assert valid_doc.chunks[2].weight == 2
    assert valid_doc.chunks[2].length == 3
    assert valid_doc.chunks[2].mime_type == 'image/png'
def random_docs_with_matches(num_docs):
    # matches always live at the same level_depth as the document they match
    docs = []
    for j in range(num_docs):
        d = jina_pb2.Document()
        d.level_depth = 0
        d.id = j
        d.text = 'hello world'
        d.uri = 'doc://'
        for m in range(10):
            dm = d.matches.add()
            dm.text = 'match to hello world'
            dm.level_depth = 0
            dm.uri = 'doc://match'
            dm.id = m
            dm.score.ref_id = d.id
            for mm in range(10):
                dmm = dm.matches.add()
                dmm.text = 'nested match to match'
                dmm.uri = 'doc://match/match'
                dmm.id = mm
                dmm.score.ref_id = m
                dmm.level_depth = 0
        docs.append(d)
    return docs
def multimodal_all_types_documents():
    docs = []
    for idx in range(0, NUM_DOCS):
        """
        doc - idx
            |
            | - chunk - embedding [idx, idx]       - modality1
            | - chunk - blob [idx, idx, idx]       - modality2
            | - chunk - text 'modality3'           - modality3 -> encoded into [3, 3] inside the multimodal encoder
            | - chunk - buffer b'modality4'        - modality4 -> encoded into [4, 4] inside the multimodal encoder
        Result:
            doc - idx - embedding [idx, idx, idx, idx, idx, 3, 3, 4, 4]
        """
        doc = jina_pb2.Document()
        doc.text = f'{idx}'
        for modality in ['modality1', 'modality2', 'modality3', 'modality4']:
            chunk = doc.chunks.add()
            chunk.modality = modality
            if modality == 'modality1':
                GenericNdArray(chunk.embedding).value = np.array([idx, idx])
            elif modality == 'modality2':
                GenericNdArray(chunk.blob).value = np.array([idx, idx, idx])
            elif modality == 'modality3':
                chunk.text = 'modality3'
            elif modality == 'modality4':
                chunk.buffer = 'modality4'.encode()
        docs.append(doc)
    return docs
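# Self-check sketch for the fixture above: every document carries exactly one
# chunk per modality, in declaration order.
docs = multimodal_all_types_documents()
assert len(docs) == NUM_DOCS
assert [c.modality for c in docs[0].chunks] == ['modality1', 'modality2', 'modality3', 'modality4']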
def random_queries(num_docs, chunks_per_doc=5, embed_dim=10):
    for j in range(num_docs):
        d = jina_pb2.Document()
        for k in range(chunks_per_doc):
            dd = d.topk_results.add()
            dd.match_doc.doc_id = k
        yield d
def index_generator(num_doc, target):
    for j in range(num_doc):
        label_int = target['index-labels']['data'][j][0]
        d = jina_pb2.Document()
        d.blob.CopyFrom(array2pb(target['index']['data'][j]))
        d.tags.update({'label': get_mapped_label(label_int)})
        yield d
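# Expected layout of `target` (an assumption inferred from the lookups above,
# in the style of an MNIST-like loader; the exact key names are the only
# thing the function relies on):
# target = {
#     'index':        {'data': <array of shape [num_doc, ...]>},
#     'index-labels': {'data': <array of shape [num_doc, 1]>},
# }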
def test_queryset_with_struct(random_workspace):
    total_docs = 4
    docs = []
    for doc_id in range(total_docs):
        doc = jina_pb2.Document()
        doc.text = f'I am doc{doc_id}'
        doc.embedding.CopyFrom(array2pb(np.array([doc_id])))
        doc.tags['label'] = f'label{doc_id % 2 + 1}'
        docs.append(doc)

    f = (Flow()
         .add(uses='- !FilterQL | {lookups: {tags__label__in: [label1, label2]}, traversal_paths: [r]}'))

    def validate_all_docs(resp):
        assert len(resp.docs) == total_docs

    def validate_label2_docs(resp):
        assert len(resp.docs) == total_docs / 2

    with f:
        # keep all the docs
        f.index(docs, output_fn=validate_all_docs, callback_on_body=True)

        # keep only the docs with label2
        qs = jina_pb2.QueryLang(name='FilterQL', priority=1)
        qs.parameters['lookups'] = {'tags__label': 'label2'}
        qs.parameters['traversal_paths'] = ['r']
        f.index(docs, queryset=qs, output_fn=validate_label2_docs, callback_on_body=True)
def create_chunk_matches_to_score():
    # match ids below are corrected to what the code actually produces
    # (10 * parent_id + score_value):
    # doc: (id: 100, level_depth=0)
    # |- chunks: (id: 10)
    # |  |- matches: (id: 12, parent_id: 1, score.value: 2, level_depth=1),
    # |  |- matches: (id: 13, parent_id: 1, score.value: 3, level_depth=1),
    # |- chunks: (id: 20)
    #    |- matches: (id: 24, parent_id: 2, score.value: 4, level_depth=1),
    #    |- matches: (id: 25, parent_id: 2, score.value: 5, level_depth=1)
    doc = jina_pb2.Document()
    doc.id = 100
    doc.level_depth = 0
    num_matches = 2
    for parent_id in range(1, 3):
        chunk = doc.chunks.add()
        chunk.id = parent_id * 10
        chunk.level_depth = doc.level_depth + 1
        for score_value in range(parent_id * 2, parent_id * 2 + num_matches):
            match = chunk.matches.add()
            match.level_depth = chunk.level_depth
            match.parent_id = parent_id
            match.score.value = score_value
            match.score.ref_id = chunk.id
            match.id = 10 * parent_id + score_value
            match.length = 4
    return doc
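# Sketch: flattening the chunk-level matches of the fixture yields scores
# 2..5, which is the kind of input a chunk-to-doc ranking driver would
# aggregate (illustrative self-check, not from the original tests).
doc = create_chunk_matches_to_score()
flat_matches = [m for c in doc.chunks for m in c.matches]
assert sorted(m.score.value for m in flat_matches) == [2, 3, 4, 5]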
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    doc1 = jina_pb2.Document()
    doc1.id = '01'
    doc1.tags['groundtruth'] = True
    doc2 = jina_pb2.Document()
    doc2.id = '02'
    doc2.tags['groundtruth'] = True
    doc4 = jina_pb2.Document()
    doc4.id = '04'
    doc4.tags['groundtruth'] = True
    # note: there is no '03' entry in this mock db
    self.db = {
        uid.id2hash(doc1.id): doc1.SerializeToString(),
        uid.id2hash(doc2.id): doc2.SerializeToString(),
        uid.id2hash(doc4.id): doc4.SerializeToString()
    }
def create_chunk_chunk_matches_to_score():
    # match ids below are corrected to what the code actually produces
    # (10 * parent_id + score_value):
    # doc: (id: 100, granularity=0)
    # |- chunk: (id: 101, granularity=1)
    #    |- chunks: (id: 10)
    #    |  |- matches: (id: 12, parent_id: 1, score.value: 2),
    #    |  |- matches: (id: 13, parent_id: 1, score.value: 3),
    #    |- chunks: (id: 20)
    #       |- matches: (id: 24, parent_id: 2, score.value: 4),
    #       |- matches: (id: 25, parent_id: 2, score.value: 5)
    doc = jina_pb2.Document()
    doc.id = 100
    doc.granularity = 0
    chunk = doc.chunks.add()
    chunk.id = 101
    chunk.granularity = doc.granularity + 1
    num_matches = 2
    for parent_id in range(1, 3):
        chunk_chunk = chunk.chunks.add()
        chunk_chunk.id = parent_id * 10
        chunk_chunk.granularity = chunk.granularity + 1
        for score_value in range(parent_id * 2, parent_id * 2 + num_matches):
            match = chunk_chunk.matches.add()
            match.parent_id = parent_id
            match.score.value = score_value
            match.score.ref_id = chunk_chunk.id
            match.id = 10 * parent_id + score_value
            match.length = 4
    return doc
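# Sketch: same shape one level deeper; every match points back to the
# chunk-of-chunk that owns it via score.ref_id (illustrative self-check).
doc = create_chunk_chunk_matches_to_score()
for chunk_chunk in doc.chunks[0].chunks:
    for match in chunk_chunk.matches:
        assert match.score.ref_id == chunk_chunk.id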
def create_documents_to_encode(num_docs):
    docs = []
    for idx in range(num_docs):
        doc = jina_pb2.Document()
        doc.blob.CopyFrom(array2pb(np.array([idx])))
        docs.append(doc)
    return docs
def random_docs(num_docs):
    vecs = np.random.random([num_docs, 2])
    for j in range(num_docs):
        d = jina_pb2.Document()
        d.id = j
        d.embedding.CopyFrom(array2pb(vecs[j]))
        yield d
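# Usage sketch: generators like this are what `input_fn` consumes elsewhere in
# these tests; a plain self-check on the ids (numeric in this snippet):
docs = list(random_docs(3))
assert [d.id for d in docs] == [0, 1, 2]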
def random_queries(num_docs, chunks_per_doc=5):
    for j in range(num_docs):
        d = jina_pb2.Document()
        for k in range(chunks_per_doc):
            # the original `d.add()` is not a valid Document method; adding to
            # the `chunks` repeated field is the most plausible intent here
            dd = d.chunks.add()
            dd.id = k + 1  # 1-indexed
        yield d
def search_generator(path: str, buffer: bytes):
    d = jina_pb2.Document()
    if buffer:
        d.buffer = buffer
    if path:
        d.uri = path
    yield d
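# Call sketch ('toy.png' is a made-up path): exactly one document per call,
# with `uri` taken from `path` and `buffer` only set when non-empty.
doc = next(search_generator(path='toy.png', buffer=b''))
assert doc.uri == 'toy.png'
assert not doc.buffer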
def _create_Document(self, doc_id, text, weight, length):
    d = jina_pb2.Document()
    d.id = doc_id
    d.buffer = text.encode('utf8')
    d.weight = weight
    d.length = length
    return d
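# Hypothetical call site (values invented for illustration; needs an instance
# of the owning class, hence kept as a comment): the helper packs the UTF-8
# bytes of `text` into `buffer`, not into the `text` field.
# d = self._create_Document(doc_id=1, text='hello', weight=0.5, length=5)
# assert d.buffer == b'hello'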