def multimodal_all_types_documents():
    """Build ``NUM_DOCS`` documents, each carrying one chunk per modality.

    Layout of the document created for index ``idx``::

        doc - idx
            |
            | - chunk - embedding [idx, idx]   - modality1
            | - chunk - blob [idx, idx, idx]   - modality2
            | - chunk - text 'modality3'       - modality3
            | - chunk - buffer b'modality4'    - modality4

    Per the original author's note, the multimodal encoder is expected to
    encode the ``modality3`` chunk into ``[3, 3]`` and the ``modality4`` chunk
    into ``[4, 4]``, so the resulting document embedding is
    ``[idx, idx, idx, idx, idx, 3, 3, 4, 4]``.

    :return: list of the created documents, ordered by ``idx``
    """
    modality_names = ('modality1', 'modality2', 'modality3', 'modality4')
    documents = []
    for idx in range(NUM_DOCS):
        document = jina_pb2.Document()
        document.text = f'{idx}'
        for name in modality_names:
            part = document.chunks.add()
            part.modality = name
            # each modality stores its payload in a different field
            if name == 'modality1':
                GenericNdArray(part.embedding).value = np.array([idx, idx])
            elif name == 'modality2':
                GenericNdArray(part.blob).value = np.array([idx, idx, idx])
            elif name == 'modality3':
                part.text = 'modality3'
            elif name == 'modality4':
                part.buffer = 'modality4'.encode()
        documents.append(document)
    return documents
def index_documents():
    """Create the three documents used as index content.

    Document ``i`` (for ``i`` in 0, 1, 2) is filled as follows::

        tags['id']          = str(i)
        tags['dummy_score'] = -i
        embedding           = [i]

    :return: list ``[doc0, doc1, doc2]``
    """
    indexed = []
    for position in range(3):
        entry = jina_pb2.Document()
        entry.tags['id'] = str(position)
        entry.tags['dummy_score'] = -position
        GenericNdArray(entry.embedding).value = np.array([position])
        indexed.append(entry)
    return indexed
def test_array2pb():
    """Round-trip ``e4`` through a ``GenericNdArray`` after clearing ``JINA_ARRAY_QUANT``."""
    # Remove any inherited JINA_ARRAY_QUANT setting before the round trip.
    # It is unclear why the variable would be set here at all: an environment
    # variable should only be visible in the process context that set it.
    inherited_quant = os.environ.pop('JINA_ARRAY_QUANT', None)
    if inherited_quant is not None:
        print(f'quant is on: {inherited_quant}')

    holder = GenericNdArray()
    holder.value = e4
    np.testing.assert_almost_equal(holder.value, e4)
def ground_truth_pairs():
    """Return ten ``DocGroundtruthPair`` objects with fixed embeddings.

    In every pair the document embedding is ``[1, 1]`` and the groundtruth
    embedding is ``[2, 2]``.
    """
    def _make_pair():
        # build one (document, groundtruth) pair
        document = jina_pb2.Document()
        truth = jina_pb2.Document()
        GenericNdArray(document.embedding).value = np.array([1, 1])
        GenericNdArray(truth.embedding).value = np.array([2, 2])
        return DocGroundtruthPair(doc=document, groundtruth=truth)

    pair_count = 10
    return [_make_pair() for _ in range(pair_count)]
def test_multimodal_driver(simple_multimodal_driver, mock_multimodal_encoder, doc_with_multimodal_chunks):
    """Check that the document embedding length equals the summed chunk embedding lengths."""
    simple_multimodal_driver.attach(executor=mock_multimodal_encoder, pea=None)
    simple_multimodal_driver._apply_all([doc_with_multimodal_chunks])

    chunks = doc_with_multimodal_chunks.chunks
    assert len(chunks) == 3

    # the three chunks are, in order: visual1, visual2, textual
    combined_length = sum(
        GenericNdArray(chunk.embedding).value.shape[0] for chunk in chunks)
    document_length = GenericNdArray(
        doc_with_multimodal_chunks.embedding).value.shape[0]
    assert document_length == combined_length
def test_vectorsearch_driver_mock_indexer_with_fill():
    """Check that ``fill_embedding=True`` populates the embeddings of returned matches.

    For every chunk of the search document, both the first and the last match
    must carry an embedding of shape ``(7, )``.
    """
    doc = create_document_to_search()
    driver = SimpleVectorSearchDriver(top_k=2, fill_embedding=True)
    executor = MockIndexer()
    driver.attach(executor=executor, pea=None)
    driver._apply_all(doc.chunks)

    for chunk in doc.chunks:
        for position in (0, -1):
            embedding = GenericNdArray(chunk.matches[position].embedding).value
            # Check for a missing embedding *before* touching ``.shape``:
            # the original ordering read ``.shape`` first, so the None check
            # could never be the assertion that reported the failure.
            assert embedding is not None
            assert embedding.shape == (7, )
def validate(req):
    """Validate the two documents of ``req``.

    Each document embedding must be twice as long as its source array
    (``e1`` for the first document, ``e3`` for the second), and the first
    embedding must equal ``e1`` concatenated with itself.
    """
    assert len(req.docs) == 2

    first_embedding = GenericNdArray(req.docs[0].embedding).value
    assert first_embedding.shape == (e1.shape[0] * 2, )

    second_embedding = GenericNdArray(req.docs[1].embedding).value
    assert second_embedding.shape == (e3.shape[0] * 2, )

    # Chunk-level checks are currently disabled:
    # assert GenericNdArray(req.docs[0].chunks[0].embedding).value.shape == (e2.shape[0] * 2,)
    # assert GenericNdArray(req.docs[1].chunks[0].embedding).value.shape == (e4.shape[0] * 2,)

    doubled_e1 = np.concatenate([e1, e1], axis=0)
    np.testing.assert_almost_equal(first_embedding, doubled_e1, decimal=4)
def input_fn():
    """Build two documents, each with an embedding and one embedded chunk.

    The first document uses ``e1`` (chunk: ``e2``), the second uses ``e3``
    (chunk: ``e4``).

    :return: list ``[doc1, doc2]``
    """
    first = Document()
    GenericNdArray(first.embedding).value = e1
    first_chunk = first.chunks.add()
    # for this chunk the embedding is assigned before the id is generated
    GenericNdArray(first_chunk.embedding).value = e2
    first_chunk.id = uid.new_doc_id(first_chunk)

    second = Document()
    GenericNdArray(second.embedding).value = e3
    second_chunk = second.chunks.add()
    # here the id is generated first and the embedding assigned afterwards;
    # the order is kept as-is in case ``new_doc_id`` depends on the content
    second_chunk.id = uid.new_doc_id(second_chunk)
    GenericNdArray(second_chunk.embedding).value = e4

    return [first, second]
def test_index_driver():
    """Check that ``SimpleFillDriver`` fills a ``(5, )`` embedding into every document."""
    expected_count = 10
    docs = create_documents_to_encode(expected_count)
    driver = SimpleFillDriver()
    executor = MockIndexer()
    driver.attach(executor=executor, pea=None)

    # before the driver runs no document has an embedding
    assert len(docs) == expected_count
    assert all(GenericNdArray(doc.embedding).value is None for doc in docs)

    driver._apply_all(docs)

    # afterwards the count is unchanged and every embedding is filled
    assert len(docs) == expected_count
    assert all(
        GenericNdArray(doc.embedding).value.shape == (5, ) for doc in docs)
def eval_request():
    """Build an ``IndexRequest`` with ten document/groundtruth entries.

    Every document and every groundtruth gets a single chunk at granularity 1;
    document chunks are embedded as ``[1, 1]`` and groundtruth chunks as
    ``[2, 2]``.
    """
    entry_count = 10
    request = jina_pb2.Request.IndexRequest()
    for _ in range(entry_count):
        document = request.docs.add()
        truth = request.groundtruths.add()
        document_chunk = document.chunks.add()
        truth_chunk = truth.chunks.add()
        for chunk, fill in ((document_chunk, 1), (truth_chunk, 2)):
            chunk.granularity = 1
            GenericNdArray(chunk.embedding).value = np.array([fill, fill])
    return request
def test_multimodal_driver_with_shuffled_order(simple_multimodal_driver, mock_multimodal_encoder_shuffled, doc_with_multimodal_chunks):
    """Check the document embedding against the chunk embeddings joined in chunk order."""
    simple_multimodal_driver.attach(executor=mock_multimodal_encoder_shuffled, pea=None)
    simple_multimodal_driver._apply_all([doc_with_multimodal_chunks])

    chunks = doc_with_multimodal_chunks.chunks
    assert len(chunks) == 3

    # The original test labels chunks[0] "visual2", chunks[1] "textual" and
    # chunks[2] "visual1"; the expected embedding joins them in that order.
    expected = np.concatenate([
        GenericNdArray(chunks[0].embedding).value,
        GenericNdArray(chunks[1].embedding).value,
        GenericNdArray(chunks[2].embedding).value,
    ])
    actual = GenericNdArray(doc_with_multimodal_chunks.embedding).value
    np.testing.assert_array_equal(expected, actual)
def get_output(req):
    """Print the reconstruction error of the embeddings received in ``req``.

    The NumPy RNG is re-seeded with ``rseed`` and one random vector of length
    ``embed_dim`` is drawn per document and per chunk (document first, then
    its chunks). Each received embedding is compared with the vector drawn for
    it; presumably this mirrors how the sender generated the data — confirm
    against the input function.
    """
    np.random.seed(rseed)

    def _mean_abs_error(message):
        # mean absolute difference between the received and regenerated vector
        received = GenericNdArray(message.embedding).value
        regenerated = np.random.random([embed_dim])
        return np.sum(np.abs(received - regenerated)) / embed_dim

    total_error = 0
    for document in req.docs:
        total_error += _mean_abs_error(document)
        for chunk in document.chunks:
            total_error += _mean_abs_error(chunk)

    print(f'reconstruction error: {total_error / num_docs:.6f}')
def input_doc():
    """Build a document (tag id 1) with three matches of different content types.

    * match tagged 10: ``text`` plus an embedding of ``random_np_array``
    * match tagged 20: a blob of ``random_np_array``
    * match tagged 30: ``buffer``
    """
    root = jina_pb2.Document()
    root.tags['id'] = 1

    text_match = root.matches.add()
    text_match.tags['id'] = 10
    text_match.text = text
    GenericNdArray(text_match.embedding).value = random_np_array

    blob_match = root.matches.add()
    blob_match.tags['id'] = 20
    GenericNdArray(blob_match.blob).value = random_np_array

    buffer_match = root.matches.add()
    buffer_match.tags['id'] = 30
    buffer_match.buffer = buffer

    return root
def input_doc():
    """Build a document (tag id 1) with three chunks of different content types.

    * chunk tagged 10: ``text`` plus an embedding of ``random_np_array``
    * chunk tagged 20: a blob of ``random_np_array``
    * chunk tagged 30: ``buffer``
    """
    root = jina_pb2.Document()
    root.tags['id'] = 1

    text_chunk = root.chunks.add()
    text_chunk.tags['id'] = 10
    text_chunk.text = text
    GenericNdArray(text_chunk.embedding).value = random_np_array

    blob_chunk = root.chunks.add()
    blob_chunk.tags['id'] = 20
    GenericNdArray(blob_chunk.blob).value = random_np_array

    buffer_chunk = root.chunks.add()
    buffer_chunk.tags['id'] = 30
    buffer_chunk.buffer = buffer

    return root
def test_vectorsearch_driver_mock_indexer_with_matches_on_chunks():
    """Check that the KV search driver fills embeddings on the matches of chunks.

    With ``traversal_paths=('cm',)`` every match of the single chunk must end
    up with the one-element embedding ``[int(match.id)]``.
    """
    driver = SimpleKVSearchDriver(traversal_paths=('cm',))
    executor = MockIndexer()
    driver.attach(executor=executor, pea=None)

    doc = create_document_to_search_with_matches_on_chunks()
    driver._traverse_apply([doc])

    assert len(doc.chunks) == 1
    only_chunk = doc.chunks[0]
    assert len(only_chunk.matches) == 3
    for match in only_chunk.matches:
        assert GenericNdArray(match.embedding).value is not None
        filled = GenericNdArray(match.embedding).value
        expected = np.array([int(match.id)])
        np.testing.assert_equal(filled, expected)
def doc_with_multimodal_chunks_wrong(embeddings):
    """Build a document whose chunks repeat a modality.

    Three chunks are created with modalities ``visual``, ``visual`` and
    ``textual``; chunk ``i`` receives ``embeddings[i]``.

    :param embeddings: indexable holding the three chunk embeddings
    :return: the created document
    """
    doc = jina_pb2.Document()
    chunks = [doc.chunks.add() for _ in range(3)]

    for chunk, modality in zip(chunks, ('visual', 'visual', 'textual')):
        chunk.modality = modality

    # ids are generated after the modalities are set and before the embeddings
    for chunk in chunks:
        chunk.id = uid.new_doc_id(chunk)

    for position, chunk in enumerate(chunks):
        GenericNdArray(chunk.embedding).value = embeddings[position]

    return doc
def test_queryset_with_struct(random_workspace):
    """Check that a request-level ``FilterQL`` queryset overrides the flow's filter.

    Four documents are labelled alternately ``label1`` / ``label2``. The flow's
    own filter accepts both labels; a queryset sent with the request narrows
    the lookup to ``label2``.
    """
    total_docs = 4

    def _labelled_doc(doc_id):
        # even ids get label1, odd ids get label2
        labelled = jina_pb2.Document()
        labelled.text = f'I am doc{doc_id}'
        GenericNdArray(labelled.embedding).value = np.array([doc_id])
        labelled.tags['label'] = f'label{doc_id % 2 + 1}'
        return labelled

    docs = [_labelled_doc(doc_id) for doc_id in range(total_docs)]

    flow = Flow().add(
        uses='- !FilterQL | {lookups: {tags__label__in: [label1, label2]}, traversal_paths: [r]}')

    def validate_all_docs(resp):
        assert len(resp.docs) == total_docs

    def validate_label2_docs(resp):
        assert len(resp.docs) == total_docs / 2

    with flow:
        # keep all the docs
        flow.index(docs, output_fn=validate_all_docs, callback_on_body=True)

        # keep only the docs with label2
        label2_only = jina_pb2.QueryLang(name='FilterQL', priority=1)
        label2_only.parameters['lookups'] = {'tags__label': 'label2'}
        label2_only.parameters['traversal_paths'] = ['r']
        flow.index(docs,
                   queryset=label2_only,
                   output_fn=validate_label2_docs,
                   callback_on_body=True)
def test_as_blob_driver():
    """Check that ``MockPrediction2DocBlobDriver`` writes a ``(3, )`` blob on every document."""
    documents = list(random_docs(2))
    blob_driver = MockPrediction2DocBlobDriver()
    blob_driver._traverse_apply(documents)

    for document in documents:
        blob = GenericNdArray(document.blob).value
        assert blob.shape == (3, )
def test_request_generate_numpy_arrays():
    """Check that a ``10 x 10`` array is split into two requests of five documents.

    Every document in both requests must report ``length == 5`` and hold a
    blob of shape ``(10,)``.
    """
    input_array = np.random.random([10, 10])
    requests = _generate(data=input_array, batch_size=5)

    # two batches are expected and both are validated the same way
    for _ in range(2):
        request = next(requests)
        assert len(request.index.docs) == 5
        for doc in request.index.docs:
            assert doc.length == 5
            assert GenericNdArray(doc.blob).value.shape == (10,)
def create_documents_to_encode(num_docs):
    """Create ``num_docs`` documents whose blob is the one-element array ``[idx]``.

    :param num_docs: number of documents to create
    :return: list of documents ordered by ``idx``
    """
    def _with_blob(value):
        # a single document carrying ``[value]`` as its blob
        document = jina_pb2.Document()
        GenericNdArray(document.blob).value = np.array([value])
        return document

    return [_with_blob(idx) for idx in range(num_docs)]
def test_multimodal_driver_assert_one_chunk_per_modality(simple_multimodal_driver, mock_multimodal_encoder, doc_with_multimodal_chunks_wrong):
    """Check that a document with a repeated modality is left without an embedding."""
    simple_multimodal_driver.attach(executor=mock_multimodal_encoder, pea=None)
    simple_multimodal_driver._apply_all([doc_with_multimodal_chunks_wrong])

    invalid_doc = doc_with_multimodal_chunks_wrong
    assert len(invalid_doc.chunks) == 3
    # the driver considers this document invalid, so it must not be encoded
    document_embedding = GenericNdArray(invalid_doc.embedding).value
    assert document_embedding is None
def random_docs(num_docs, embed_dim=10, jitter=1):
    """Yield ``num_docs`` documents with random embeddings.

    :param num_docs: number of documents to generate
    :param embed_dim: base length of each embedding
    :param jitter: exclusive upper bound of the random extra length added to
        ``embed_dim`` (drawn with ``np.random.randint(0, jitter)``)
    """
    for doc_tag in range(num_docs):
        document = jina_pb2.Document()
        document.tags['id'] = doc_tag
        document.text = b'hello'
        # the extra length is drawn first, then the random vector itself
        embedding_length = embed_dim + np.random.randint(0, jitter)
        GenericNdArray(document.embedding).value = np.random.random(
            [embedding_length])
        yield document
def create(self):
    """Create a groundtruth document populated according to ``field_type``.

    ``field_type`` is read from the enclosing scope. ``'text'`` sets the text,
    ``'buffer'`` sets a four-byte buffer and ``'blob'`` sets a four-element
    blob; any other value leaves the document empty.
    """
    groundtruth = jina_pb2.Document()
    if field_type == 'blob':
        GenericNdArray(groundtruth.blob).value = np.array([1, 1, 1, 1])
    elif field_type == 'buffer':
        groundtruth.buffer = b'\x01\x02\x03\04'
    elif field_type == 'text':
        groundtruth.text = 'aaaa'
    return groundtruth
def doc_groundtruth_evaluation_pairs():
    """Return two ``(document, groundtruth)`` tuples for evaluation tests.

    Each document carries a one-element embedding and each groundtruth lists
    the ids of the expected matches as ``tags['id']`` on its matches.
    """
    def _query(value):
        # a document whose embedding is the one-element array ``[value]``
        query = jina_pb2.Document()
        GenericNdArray(query.embedding).value = np.array([value])
        return query

    def _groundtruth(expected_ids):
        # a groundtruth document with one match per expected id
        truth = jina_pb2.Document()
        for expected_id in expected_ids:
            truth.matches.add().tags['id'] = expected_id
        return truth

    # embedding [0]: it will match 0 and 1
    doc0 = _query(0)
    groundtruth0 = _groundtruth(['0', '2'])
    # top_k is set to 2 for VectorSearchDriver
    # expects as matches [0, 2] but given [0, 1]
    # Precision@1 = 100%
    # Precision@2 = 50%
    # Recall@1 = 100%
    # Recall@2 = 50%

    # expects as ranked [0, 2] but given [0, 1]
    # Precision@1 = 100%
    # Precision@2 = 50%
    # Recall@1 = 100%
    # Recall@2 = 50%

    # embedding [2]: it will match 2 and 1
    doc1 = _query(2)
    groundtruth1 = _groundtruth(['1', '2'])
    # expects as matches [1, 2] but given [2, 1]
    # Precision@1 = 100%
    # Precision@2 = 100%
    # Recall@1 = 100%
    # Recall@2 = 100%

    # expects as ranked [1, 2] but given [2, 1]
    # Precision@1 = 100%
    # Precision@2 = 100%
    # Recall@1 = 100%
    # Recall@2 = 100%

    return [(doc0, groundtruth0), (doc1, groundtruth1)]
def test_vectorsearch_driver_mock_indexer_traverse_apply():
    """Check that the KV search driver fills chunk embeddings and drops unmatched chunks.

    The search document starts with five chunks without embeddings; after the
    driver runs, four remain and each carries the embedding ``[int(chunk.id)]``.
    """
    doc = create_document_to_search()
    driver = SimpleKVSearchDriver()
    executor = MockIndexer()
    driver.attach(executor=executor, pea=None)

    assert len(doc.chunks) == 5
    assert all(
        GenericNdArray(chunk.embedding).value is None for chunk in doc.chunks)

    driver._traverse_apply(doc.chunks)

    # chunk idx: 5 had no match and is removed as a missing idx
    assert len(doc.chunks) == 4
    for chunk in doc.chunks:
        assert GenericNdArray(chunk.embedding).value is not None
        filled = GenericNdArray(chunk.embedding).value
        expected = np.array([int(chunk.id)])
        np.testing.assert_equal(filled, expected)
def random_docs(num_docs, chunks_per_doc=5, embed_dim=10, jitter=1):
    """Yield ``num_docs`` random documents, each with ``chunks_per_doc`` chunks.

    :param num_docs: number of documents to generate
    :param chunks_per_doc: number of chunks added to every document
    :param embed_dim: base length of each random embedding
    :param jitter: exclusive upper bound of the random extra length added to
        ``embed_dim`` (drawn with ``np.random.randint(0, jitter)``)
    """
    def _random_embedding():
        # the extra length is drawn first, then the random vector itself
        return np.random.random([embed_dim + np.random.randint(0, jitter)])

    next_chunk_tag = 3 * num_docs  # avoid collision with docs
    for doc_tag in range(num_docs):
        parent = jina_pb2.Document()
        parent.tags['id'] = doc_tag
        parent.text = b'hello world'
        GenericNdArray(parent.embedding).value = _random_embedding()
        # the document id is generated before any chunk is attached
        parent.id = uid.new_doc_id(parent)

        for _ in range(chunks_per_doc):
            child = parent.chunks.add()
            child.text = 'i\'m chunk %d from doc %d' % (next_chunk_tag, doc_tag)
            GenericNdArray(child.embedding).value = _random_embedding()
            child.tags['id'] = next_chunk_tag
            child.tags['parent_id'] = doc_tag
            child.parent_id = parent.id
            # the chunk id is generated last, once all other fields are set
            child.id = uid.new_doc_id(child)
            next_chunk_tag += 1

        yield parent
def get_duplicate_docs(num_docs=10):
    """Create documents in which consecutive pairs share the same content.

    Document ``idx`` gets the content ``int(idx / 2)`` both as its one-element
    embedding and inside its text.

    :param num_docs: number of documents to create
    :return: tuple ``(documents, number_of_distinct_contents)``
    """
    documents = []
    distinct_contents = set()
    for idx in range(num_docs):
        shared_value = int(idx / 2)
        distinct_contents.add(shared_value)

        duplicate = jina_pb2.Document()
        GenericNdArray(duplicate.embedding).value = np.array([shared_value])
        duplicate.text = f'I am doc{shared_value}'
        documents.append(duplicate)
    return documents, len(distinct_contents)
def request(field_type):
    """Build an ``IndexRequest`` with ten document/groundtruth entries.

    Every document and groundtruth gets one chunk at granularity 1. Its
    content is chosen by ``field_type``; the groundtruth chunk always holds one
    more element than the document chunk.

    :param field_type: ``'text'``, ``'buffer'`` or ``'blob'``; any other value
        leaves the chunks without content
    :return: the populated request
    """
    entry_count = 10
    req = jina_pb2.Request.IndexRequest()
    for _ in range(entry_count):
        document = req.docs.add()
        truth = req.groundtruths.add()
        document_chunk = document.chunks.add()
        truth_chunk = truth.chunks.add()
        document_chunk.granularity = 1
        truth_chunk.granularity = 1

        if field_type == 'blob':
            GenericNdArray(document_chunk.blob).value = np.array([1, 1, 1])
            GenericNdArray(truth_chunk.blob).value = np.array([1, 1, 1, 1])
        elif field_type == 'buffer':
            document_chunk.buffer = b'\x01\x02\x03'
            truth_chunk.buffer = b'\x01\x02\x03\x04'
        elif field_type == 'text':
            document_chunk.text = 'aaa'
            truth_chunk.text = 'aaaa'
    return req
def validate_fn(resp):
    """Validate that the single search document keeps its three chunks intact.

    The document must be tagged 1 and hold chunks tagged 10, 20 and 30 with
    their text/embedding, blob and buffer content respectively.
    """
    assert len(resp.search.docs) == 1
    root = resp.search.docs[0]
    assert int(root.tags['id']) == 1
    assert len(root.chunks) == 3

    text_chunk = root.chunks[0]
    assert int(text_chunk.tags['id']) == 10
    assert text_chunk.text == text
    received_embedding = GenericNdArray(text_chunk.embedding).value
    np.testing.assert_almost_equal(random_np_array, received_embedding)

    blob_chunk = root.chunks[1]
    assert int(blob_chunk.tags['id']) == 20
    received_blob = GenericNdArray(blob_chunk.blob).value
    np.testing.assert_almost_equal(random_np_array, received_blob)

    buffer_chunk = root.chunks[2]
    assert int(buffer_chunk.tags['id']) == 30
    assert buffer_chunk.buffer == buffer
def __init__(self, *args, **kwargs):
    """Initialise the parent class and fill ``self.db`` with four serialized documents.

    ``self.db`` maps the integer keys 1 to 4 to the serialized ``Document``
    whose ``id`` is ``str(key)`` and whose embedding is the one-element array
    ``[key]``.
    """
    super().__init__(*args, **kwargs)
    storage = {}
    for key in (1, 2, 3, 4):
        stored = jina_pb2.Document()
        stored.id = str(key)
        GenericNdArray(stored.embedding).value = np.array([int(stored.id)])
        storage[key] = stored.SerializeToString()
    self.db = storage