def test_collect_matches2doc_ranker_driver_max_ranker(
        keep_source_matches_as_chunks):
    """Collecting with a max ranker keeps the two highest-scoring matches,
    ordered by descending score, on the query document."""
    doc = create_document_to_score_same_depth_level()
    driver = SimpleCollectMatchesRankDriver(
        docs=DocumentSet([doc]),
        keep_source_matches_as_chunks=keep_source_matches_as_chunks,
    )
    driver.attach(executor=MockMaxRanker(), runtime=None)
    driver()

    collected = list(doc.matches)
    assert len(collected) == 2

    # (id, score) pairs expected after max aggregation
    expected = (('20', 40), ('30', 20))
    for match, (expected_id, expected_score) in zip(collected, expected):
        assert match.id == expected_id
        assert match.score.value == expected_score

    chunks_expected = 2 if keep_source_matches_as_chunks else 0
    for match in collected:
        # match score is computed w.r.t. the query doc's id
        assert match.score.ref_id == doc.id
        assert len(match.chunks) == chunks_expected
def test_chunk2doc_ranker_driver_min_ranker(keep_source_matches_as_chunks):
    """The min ranker scores each match as 1 / (1 + distance), so matches
    come back in increasing-distance (decreasing-score) order."""
    doc = create_document_to_score()
    driver = SimpleChunk2DocRankDriver(
        keep_source_matches_as_chunks=keep_source_matches_as_chunks)
    driver.attach(executor=MockMinRanker(), runtime=None)
    driver._traverse_apply(DocumentSet([doc]))

    assert len(doc.matches) == 4
    # match at position idx has id str(distance * 10) * 8 and score 1/(1+distance)
    for idx, distance in enumerate(range(4, 8)):
        match = doc.matches[idx]
        assert match.id == str(distance * 10) * 8
        assert match.score.value == pytest.approx(1 / (1 + distance), 0.0001)

    chunks_expected = 1 if keep_source_matches_as_chunks else 0
    for match in doc.matches:
        # match score is computed w.r.t. the query doc's id
        assert match.score.ref_id == doc.id
        assert len(match.chunks) == chunks_expected
def documentset():
    """ Builds up a complete chunk-match structure, with a depth of 2 in both directions recursively. """
    max_granularity = 2
    max_adjacency = 2

    def _grow(node, granularity, adjacency):
        # descend into chunks until the granularity ceiling is hit
        if granularity < max_granularity:
            for _ in range(DOCUMENTS_PER_LEVEL):
                chunk = add_chunk(node)
                _grow(chunk, chunk.granularity, chunk.adjacency)
        # fan out into matches until the adjacency ceiling is hit
        if adjacency < max_adjacency:
            for _ in range(DOCUMENTS_PER_LEVEL):
                match = add_match(node)
                _grow(match, match.granularity, match.adjacency)

    roots = []
    for _ in range(DOCUMENTS_PER_LEVEL):
        with Document() as d:
            d.granularity = 0
            d.adjacency = 0
        roots.append(d)
        _grow(d, 0, 0)
    return DocumentSet(roots)
def test_chunk2doc_ranker_driver_mock_ranker(keep_source_matches_as_chunks):
    """The length ranker scores each match by its length, producing matches
    ordered 70, 60, 50, 40 with scores 7, 6, 5, 4."""
    doc = create_document_to_score()
    driver = SimpleChunk2DocRankDriver(
        docs=DocumentSet([doc]),
        keep_source_matches_as_chunks=keep_source_matches_as_chunks,
    )
    driver.attach(executor=MockLengthRanker(), runtime=None)
    driver()

    assert len(doc.matches) == 4
    expected = (('70', 7), ('60', 6), ('50', 5), ('40', 4))
    for match, (expected_id, expected_score) in zip(doc.matches, expected):
        assert match.id == expected_id
        assert match.score.value == expected_score

    chunks_expected = 1 if keep_source_matches_as_chunks else 0
    for match in doc.matches:
        # match score is computed w.r.t. the query doc's id
        assert match.score.ref_id == doc.id
        assert len(match.chunks) == chunks_expected
def test_batching_mix_multi_flow(crafter, mocker):
    """Indexing through the crafter leaves every doc crafted with its text
    suffixed and its embedding untouched."""
    NUM_DOCS = 15

    def validate_response(resp):
        assert len(resp.index.docs) == NUM_DOCS
        for i, doc in enumerate(resp.index.docs):
            assert doc.text == f'text-{i}-crafted'
            np.testing.assert_equal(
                NdArray(doc.embedding).value, np.array([i] * 5))

    inputs = DocumentSet([
        Document(text=f'text-{i}', embedding=np.array([i] * 5))
        for i in range(NUM_DOCS)
    ])

    mock = mocker.Mock()
    with Flow().add(name='crafter', uses=crafter) as f:
        f.index(inputs=inputs, on_done=mock)

    mock.assert_called_once()
    validate_callback(mock, validate_response)
def test_segment_driver():
    """Segmenting a valid doc yields three chunks whose tags, blobs, weights,
    lengths and mime types follow the mock segmenter's pattern."""
    valid_doc = Document()
    valid_doc.update_id()
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    driver = SimpleSegmentDriver()
    driver.attach(executor=MockSegmenter(), pea=None)
    driver._apply_all(DocumentSet([valid_doc]))

    # the parent doc's length is untouched by segmentation
    assert valid_doc.length == 2

    # per-chunk expectations: (tag id, weight, mime type); blob is [idx]*3
    expected = (
        (3, 0., 'text/plain'),
        (4, 1., 'image/png'),
        (5, 2., 'image/png'),
    )
    for idx, (tag_id, weight, mime) in enumerate(expected):
        chunk = valid_doc.chunks[idx]
        assert chunk.tags['id'] == tag_id
        assert chunk.parent_id == valid_doc.id
        np.testing.assert_equal(chunk.blob, np.array([float(idx)] * 3))
        assert chunk.weight == weight
        assert chunk.length == 3
        assert chunk.mime_type == mime
def test_collect_matches2doc_ranker_driver_min_ranker(
        keep_source_matches_as_chunks):
    """With a min ranker the aggregated score of each parent is
    1 / (1 + min raw score among its source matches), so the parent with
    the smallest raw minimum ranks first."""
    doc = create_document_to_score_same_depth_level()
    driver = SimpleCollectMatchesRankDriver(
        docs=DocumentSet([doc]),
        keep_source_matches_as_chunks=keep_source_matches_as_chunks,
    )
    driver.attach(executor=MockMinRanker(), runtime=None)

    import sys
    # record the minimum raw score per parent before the driver aggregates
    min_per_parent = {'30' * 8: sys.maxsize, '20' * 8: sys.maxsize}
    for match in doc.matches:
        parent = match.parent_id
        if parent in min_per_parent and match.score.value < min_per_parent[parent]:
            min_per_parent[parent] = match.score.value
    assert min_per_parent['30' * 8] < min_per_parent['20' * 8]

    driver()

    dm = list(doc.matches)
    assert len(dm) == 2
    for match, parent in zip(dm, ('30' * 8, '20' * 8)):
        assert match.id == parent
        assert match.score.value == pytest.approx(
            1.0 / (1.0 + min_per_parent[parent]), 0.0000001)

    chunks_expected = 2 if keep_source_matches_as_chunks else 0
    for match in dm:
        # match score is computed w.r.t. the query doc's id
        assert match.score.ref_id == doc.id
        assert len(match.chunks) == chunks_expected
def test_chunk2doc_ranker_driver_max_ranker(keep_source_matches_as_chunks):
    """The max ranker orders matches by descending raw score: 70, 60, 50, 40."""
    doc = create_document_to_score()
    driver = SimpleChunk2DocRankDriver(
        docs=DocumentSet([doc]),
        keep_source_matches_as_chunks=keep_source_matches_as_chunks,
    )
    executor = MockMaxRanker()
    driver.attach(executor=executor, runtime=None)
    driver()

    # NOTE(review): executor here is always MockMaxRanker, so scale is
    # always 1 — the isinstance branch looks copied from a sibling test.
    scale = 1 if not isinstance(executor, MockPriceDiscountRanker) else DISCOUNT_VAL

    assert len(doc.matches) == 4
    expected = (('70', 7), ('60', 6), ('50', 5), ('40', 4))
    for match, (expected_id, base_score) in zip(doc.matches, expected):
        assert match.id == expected_id
        assert match.score.value == base_score * scale

    chunks_expected = 1 if keep_source_matches_as_chunks else 0
    for match in doc.matches:
        # match score is computed w.r.t. the query doc's id
        assert match.score.ref_id == doc.id
        assert len(match.chunks) == chunks_expected
def docs_to_index(num_docs):
    """Build a DocumentSet of `num_docs` docs with ids '1'..'num_docs'
    and content [idx * 5]."""
    return DocumentSet([
        Document(id=str(idx), content=np.array([idx * 5]))
        for idx in range(1, num_docs + 1)
    ])
def docset(docs):
    """Wrap the given docs in a DocumentSet."""
    wrapped = DocumentSet(docs)
    return wrapped
def docs_to_encode(num_docs):
    """Build a DocumentSet of `num_docs` docs whose content is [idx]
    for idx in 0..num_docs-1."""
    return DocumentSet([
        Document(content=np.array([idx])) for idx in range(num_docs)
    ])