コード例 #1
0
def test_collect_matches2doc_ranker_driver_max_ranker(
        keep_source_matches_as_chunks):
    """Run the collect-matches rank driver with ``MockMaxRanker`` and check the result.

    After the driver runs, the document is expected to hold exactly two
    matches: id ``'20'`` with score 40 followed by id ``'30'`` with score 20.

    :param keep_source_matches_as_chunks: flag forwarded to the driver; when
        truthy each resulting match is expected to carry 2 chunks, otherwise 0.
    """
    doc = create_document_to_score_same_depth_level()
    driver = SimpleCollectMatchesRankDriver(
        docs=DocumentSet([doc]),
        keep_source_matches_as_chunks=keep_source_matches_as_chunks,
    )
    ranker = MockMaxRanker()
    driver.attach(executor=ranker, runtime=None)
    driver()

    ranked = list(doc.matches)
    assert len(ranked) == 2

    # (expected id, expected score) in the order the matches must appear.
    expected = (('20', 40), ('30', 20))
    for match, (match_id, score) in zip(ranked, expected):
        assert match.id == match_id
        assert match.score.value == score

    # The expected chunk count does not depend on the match, so compute it once.
    chunks_per_match = 2 if keep_source_matches_as_chunks else 0
    for match in ranked:
        # the match score is computed with respect to doc.id
        assert match.score.ref_id == doc.id
        assert len(match.chunks) == chunks_per_match
コード例 #2
0
def test_chunk2doc_ranker_driver_min_ranker(keep_source_matches_as_chunks):
    """Run the chunk-to-doc rank driver with ``MockMinRanker`` and check the result.

    The driver is applied through ``_traverse_apply`` on a one-document set.
    Four matches are expected, ordered by id ``'40' * 8`` .. ``'70' * 8`` with
    scores approximately ``1 / (1 + k)`` for ``k`` in 4..7.

    :param keep_source_matches_as_chunks: flag forwarded to the driver; when
        truthy each resulting match is expected to carry 1 chunk, otherwise 0.
    """
    doc = create_document_to_score()
    driver = SimpleChunk2DocRankDriver(
        keep_source_matches_as_chunks=keep_source_matches_as_chunks)
    ranker = MockMinRanker()
    driver.attach(executor=ranker, runtime=None)
    driver._traverse_apply(DocumentSet([
        doc,
    ]))

    assert len(doc.matches) == 4

    # (expected id, expected score) in the order the matches must appear.
    expected = (
        ('40' * 8, 1 / (1 + 4)),
        ('50' * 8, 1 / (1 + 5)),
        ('60' * 8, 1 / (1 + 6)),
        ('70' * 8, 1 / (1 + 7)),
    )
    for position, (match_id, score) in enumerate(expected):
        match = doc.matches[position]
        assert match.id == match_id
        assert match.score.value == pytest.approx(score, 0.0001)

    # The expected chunk count does not depend on the match, so compute it once.
    chunks_per_match = 1 if keep_source_matches_as_chunks else 0
    for match in doc.matches:
        # the match score is computed with respect to doc.id
        assert match.score.ref_id == doc.id
        assert len(match.chunks) == chunks_per_match
コード例 #3
0
def documentset():
    """Build a ``DocumentSet`` of root documents with nested chunks and matches.

    ``DOCUMENTS_PER_LEVEL`` root documents are created at granularity 0 and
    adjacency 0. Each document recursively receives ``DOCUMENTS_PER_LEVEL``
    chunks while its granularity is below 2, and ``DOCUMENTS_PER_LEVEL``
    matches while its adjacency is below 2.

    :return: a ``DocumentSet`` wrapping the root documents.
    """
    granularity_limit = 2
    adjacency_limit = 2

    def _grow(node, granularity, adjacency):
        # Chunks are added before matches; each new child is expanded using
        # the granularity/adjacency values it reports itself.
        if granularity < granularity_limit:
            for _ in range(DOCUMENTS_PER_LEVEL):
                child = add_chunk(node)
                _grow(child, child.granularity, child.adjacency)
        if adjacency < adjacency_limit:
            for _ in range(DOCUMENTS_PER_LEVEL):
                child = add_match(node)
                _grow(child, child.granularity, child.adjacency)

    roots = []
    for _ in range(DOCUMENTS_PER_LEVEL):
        with Document() as root:
            root.granularity = 0
            root.adjacency = 0
            roots.append(root)
            _grow(root, 0, 0)
    return DocumentSet(roots)
コード例 #4
0
def test_chunk2doc_ranker_driver_mock_ranker(keep_source_matches_as_chunks):
    """Run the chunk-to-doc rank driver with ``MockLengthRanker`` and check the result.

    Four matches are expected, ordered by id ``'70'``, ``'60'``, ``'50'``,
    ``'40'`` with scores 7, 6, 5 and 4 respectively.

    :param keep_source_matches_as_chunks: flag forwarded to the driver; when
        truthy each resulting match is expected to carry 1 chunk, otherwise 0.
    """
    doc = create_document_to_score()
    driver = SimpleChunk2DocRankDriver(
        docs=DocumentSet([doc]),
        keep_source_matches_as_chunks=keep_source_matches_as_chunks,
    )
    ranker = MockLengthRanker()
    driver.attach(executor=ranker, runtime=None)
    driver()

    assert len(doc.matches) == 4

    # (expected id, expected score) in the order the matches must appear.
    expected = (('70', 7), ('60', 6), ('50', 5), ('40', 4))
    for position, (match_id, score) in enumerate(expected):
        match = doc.matches[position]
        assert match.id == match_id
        assert match.score.value == score

    # The expected chunk count does not depend on the match, so compute it once.
    chunks_per_match = 1 if keep_source_matches_as_chunks else 0
    for match in doc.matches:
        # the match score is computed with respect to doc.id
        assert match.score.ref_id == doc.id
        assert len(match.chunks) == chunks_per_match
コード例 #5
0
def test_batching_mix_multi_flow(crafter, mocker):
    """Index 15 documents through a single-crafter flow and validate the response.

    Each input document has text ``text-<i>`` and a 5-element embedding filled
    with ``i``. The response documents are expected to have text
    ``text-<i>-crafted`` and an unchanged embedding.

    :param crafter: passed as ``uses`` for the flow's ``crafter`` pod.
    :param mocker: provides the ``Mock`` used as the ``on_done`` callback.
    """
    doc_count = 15

    def _check_response(resp):
        indexed = resp.index.docs
        assert len(indexed) == doc_count
        for i, doc in enumerate(resp.index.docs):
            assert doc.text == f'text-{i}-crafted'
            expected_embedding = np.array([i] * 5)
            np.testing.assert_equal(
                NdArray(doc.embedding).value, expected_embedding)

    inputs = [
        Document(
            text=f'text-{i}',
            embedding=np.array([i] * 5),
        ) for i in range(doc_count)
    ]
    docs = DocumentSet(inputs)
    on_done = mocker.Mock()

    with Flow().add(name='crafter', uses=crafter) as flow:
        flow.index(inputs=docs, on_done=on_done)

    # The callback must fire exactly once before its payload is validated.
    on_done.assert_called_once()
    validate_callback(on_done, _check_response)
コード例 #6
0
def test_segment_driver():
    """Apply ``SimpleSegmentDriver`` with ``MockSegmenter`` and check the chunks.

    The parent document keeps ``length == 2``. Its first three chunks are
    expected to carry tag ids 3, 4, 5, blobs filled with 0.0, 1.0, 2.0,
    matching weights, ``length == 3``, and mime types ``text/plain`` for the
    first chunk and ``image/png`` for the other two.
    """
    valid_doc = Document()
    valid_doc.update_id()
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    driver = SimpleSegmentDriver()
    segmenter = MockSegmenter()
    driver.attach(executor=segmenter, pea=None)
    driver._apply_all(DocumentSet([valid_doc]))

    # The parent's own length must be left untouched by the driver.
    assert valid_doc.length == 2

    # One row per chunk: (tag id, blob values, weight, mime type).
    expectations = (
        (3, [0.0, 0.0, 0.0], 0., 'text/plain'),
        (4, [1.0, 1.0, 1.0], 1., 'image/png'),
        (5, [2.0, 2.0, 2.0], 2., 'image/png'),
    )
    for position, (tag_id, blob, weight, mime_type) in enumerate(expectations):
        chunk = valid_doc.chunks[position]
        assert chunk.tags['id'] == tag_id
        assert chunk.parent_id == valid_doc.id
        np.testing.assert_equal(chunk.blob, np.array(blob))
        assert chunk.weight == weight
        assert chunk.length == 3
        assert chunk.mime_type == mime_type
コード例 #7
0
def test_collect_matches2doc_ranker_driver_min_ranker(
        keep_source_matches_as_chunks):
    """Run the collect-matches rank driver with ``MockMinRanker`` and check the result.

    Before the driver runs, the lowest score among the matches whose parent is
    ``'30' * 8`` and among those whose parent is ``'20' * 8`` is recorded.
    After the driver runs, two matches are expected (``'30' * 8`` first, then
    ``'20' * 8``), each scored approximately ``1 / (1 + lowest)``.

    :param keep_source_matches_as_chunks: flag forwarded to the driver; when
        truthy each resulting match is expected to carry 2 chunks, otherwise 0.
    """
    doc = create_document_to_score_same_depth_level()
    driver = SimpleCollectMatchesRankDriver(
        docs=DocumentSet([doc]),
        keep_source_matches_as_chunks=keep_source_matches_as_chunks,
    )
    ranker = MockMinRanker()
    driver.attach(executor=ranker, runtime=None)
    import sys

    # Scan the matches before the driver runs to find the per-parent minimum.
    lowest_30 = sys.maxsize
    lowest_20 = sys.maxsize
    for match in doc.matches:
        if match.parent_id == '30' * 8 and match.score.value < lowest_30:
            lowest_30 = match.score.value
        if match.parent_id == '20' * 8 and match.score.value < lowest_20:
            lowest_20 = match.score.value

    assert lowest_30 < lowest_20
    driver()

    ranked = list(doc.matches)
    assert len(ranked) == 2

    # (expected id, per-parent minimum) in the order the matches must appear.
    expected = (('30' * 8, lowest_30), ('20' * 8, lowest_20))
    for match, (match_id, lowest) in zip(ranked, expected):
        assert match.id == match_id
        assert match.score.value == pytest.approx((1.0 / (1.0 + lowest)),
                                                  0.0000001)

    # The expected chunk count does not depend on the match, so compute it once.
    chunks_per_match = 2 if keep_source_matches_as_chunks else 0
    for match in ranked:
        # the match score is computed with respect to doc.id
        assert match.score.ref_id == doc.id
        assert len(match.chunks) == chunks_per_match
コード例 #8
0
def test_chunk2doc_ranker_driver_max_ranker(keep_source_matches_as_chunks):
    """Run the chunk-to-doc rank driver with ``MockMaxRanker`` and check the result.

    Four matches are expected, ordered by id ``'70'``, ``'60'``, ``'50'``,
    ``'40'`` with scores 7, 6, 5 and 4, each multiplied by ``DISCOUNT_VAL``
    when the executor is a ``MockPriceDiscountRanker`` instance.

    :param keep_source_matches_as_chunks: flag forwarded to the driver; when
        truthy each resulting match is expected to carry 1 chunk, otherwise 0.
    """
    doc = create_document_to_score()
    driver = SimpleChunk2DocRankDriver(
        docs=DocumentSet([doc]),
        keep_source_matches_as_chunks=keep_source_matches_as_chunks,
    )
    executor = MockMaxRanker()
    driver.attach(executor=executor, runtime=None)
    driver()

    # Scores are only scaled for the price-discount ranker.
    if isinstance(executor, MockPriceDiscountRanker):
        scale = DISCOUNT_VAL
    else:
        scale = 1

    assert len(doc.matches) == 4

    # (expected id, unscaled score) in the order the matches must appear.
    expected = (('70', 7), ('60', 6), ('50', 5), ('40', 4))
    for position, (match_id, base_score) in enumerate(expected):
        match = doc.matches[position]
        assert match.id == match_id
        assert match.score.value == base_score * scale

    # The expected chunk count does not depend on the match, so compute it once.
    chunks_per_match = 1 if keep_source_matches_as_chunks else 0
    for match in doc.matches:
        # the match score is computed with respect to doc.id
        assert match.score.ref_id == doc.id
        assert len(match.chunks) == chunks_per_match
コード例 #9
0
def docs_to_index(num_docs):
    """Build a ``DocumentSet`` of ``num_docs`` documents to index.

    Documents are numbered from 1 to ``num_docs``; each has its number as a
    string id and a one-element array ``[number * 5]`` as content.

    :param num_docs: how many documents to create.
    :return: a ``DocumentSet`` wrapping the created documents.
    """
    return DocumentSet([
        Document(id=str(idx), content=np.array([idx * 5]))
        for idx in range(1, num_docs + 1)
    ])
コード例 #10
0
def docset(docs):
    """Wrap ``docs`` in a ``DocumentSet`` and return it.

    :param docs: the documents passed straight to the ``DocumentSet`` constructor.
    :return: the resulting ``DocumentSet``.
    """
    wrapped = DocumentSet(docs)
    return wrapped
コード例 #11
0
def docs_to_encode(num_docs):
    """Build a ``DocumentSet`` of ``num_docs`` documents to encode.

    Documents are numbered from 0 to ``num_docs - 1``; each has a one-element
    array ``[number]`` as content.

    :param num_docs: how many documents to create.
    :return: a ``DocumentSet`` wrapping the created documents.
    """
    return DocumentSet([
        Document(content=np.array([idx])) for idx in range(num_docs)
    ])