Example #1
def test_batching_text_one_argument(stack, crafter):
    docs = DocumentArray([Document(text=f'text-{i}') for i in range(15)])
    texts, _ = docs.extract_docs('text', stack_contents=stack)

    crafted_docs = crafter.craft(texts)
    for i, crafted_doc in enumerate(crafted_docs):
        assert crafted_doc['text'] == f'text-{i}-crafted'
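
The `crafter` fixture these tests depend on is defined in the suite's conftest and not shown. A minimal, hypothetical stand-in that satisfies Example #1 (the multi-argument variants in Examples #7, #8 and #13 use analogous crafters taking several content lists):

import pytest

@pytest.fixture
def crafter():
    class DummyCrafter:
        def craft(self, texts):
            # append '-crafted' to every input text, one result dict per doc
            return [{'text': f'{t}-crafted'} for t in texts]
    return DummyCrafter()
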
Example #2
def test_match2docranker_batching_flow(ranker, mocker):
    NUM_DOCS_QUERIES = 15
    NUM_MATCHES = 10
    queries = DocumentArray([])
    for i in range(NUM_DOCS_QUERIES):
        query = Document(id=f'query-{i}')
        for j in range(NUM_MATCHES):
            m = Document(id=f'match-{i}-{j}', tags={'dummy_score': j})
            query.matches.append(m)
        queries.append(query)

    def validate_response(resp):
        assert len(resp.search.docs) == NUM_DOCS_QUERIES
        for i, query in enumerate(resp.search.docs):
            for j, match in enumerate(query.matches, 1):
                assert match.id == f'match-{i}-{NUM_MATCHES - j}'
                assert match.score.value == NUM_MATCHES - j

    mock = mocker.Mock()

    with Flow().add(name='ranker', uses=ranker) as f:
        f.search(inputs=queries, on_done=mock)

    mock.assert_called_once()
    validate_callback(mock, validate_response)
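
`validate_callback` (also used in Example #29) is a small helper imported from the test suite; a plausible sketch, assuming it merely replays each captured response through the validation function so assertion errors surface after the Flow has closed:

def validate_callback(mock, validate_func):
    # mock.call_args_list holds (args, kwargs) tuples for every call
    for args, _ in mock.call_args_list:
        validate_func(args[0])
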
Example #3
def test_get_content_multiple_fields_merge(stack, num_rows):
    fields = ['embedding', 'text']

    batch_size = 10
    embed_size = 20

    kwargs = {
        field: np.random.random((num_rows, embed_size))
        if field == 'embedding'
        else 'text'
        for field in fields
    }
    docs = DocumentArray([Document(**kwargs) for _ in range(batch_size)])

    contents, pts = docs.extract_docs(*fields, stack_contents=stack)

    assert len(contents) == len(fields)
    assert isinstance(contents, list)
    if stack:
        assert isinstance(contents[0], np.ndarray)
        assert isinstance(contents[1], np.ndarray)

    for content in contents:
        assert len(content) == batch_size

    if stack:
        assert contents[0].shape == (batch_size, num_rows, embed_size)
        assert contents[1].shape == (batch_size,)
    else:
        assert len(contents[0]) == batch_size
        assert len(contents[1]) == batch_size
        for c in contents[0]:
            assert c.shape == (num_rows, embed_size)
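
`stack` and `num_rows` are pytest fixtures (or parametrize arguments) shared by many tests in this section; the concrete values below are assumptions, but the shape is the usual one:

import pytest

@pytest.fixture(params=[True, False])
def stack(request):
    return request.param

@pytest.fixture(params=[1, 2, 3])
def num_rows(request):
    return request.param
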
Example #4
def test_batching_blob_one_argument(stack, crafter):
    docs = DocumentArray(
        [Document(blob=np.array([[i] * 5, [i] * 5])) for i in range(15)])
    texts, _ = docs.extract_docs('blob', stack_contents=stack)

    crafted_docs = crafter.craft(texts)
    for i, crafted_doc in enumerate(crafted_docs):
        np.testing.assert_equal(crafted_doc['blob'],
                                np.array([[i] * 5, [i] * 5]))
Example #5
def test_batching_text_one_argument(segmenter):
    docs = DocumentArray([Document(text=f'text-{i}') for i in range(15)])
    texts, _ = docs.extract_docs('text')

    chunks_sets = segmenter.segment(texts)
    for i, chunks in enumerate(chunks_sets):
        assert len(chunks) == NUM_CHUNKS
        for j, chunk in enumerate(chunks):
            assert chunk['text'] == f'text-{i}-chunk-{j}'
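
`NUM_CHUNKS` and the `segmenter` fixture come from the surrounding test module. A hypothetical segmenter consistent with the assertions above (the constant's value is assumed):

import pytest

NUM_CHUNKS = 3

@pytest.fixture
def segmenter():
    class DummySegmenter:
        def segment(self, texts):
            # split every text into NUM_CHUNKS labelled chunks
            return [
                [{'text': f'{t}-chunk-{j}'} for j in range(NUM_CHUNKS)]
                for t in texts
            ]
    return DummySegmenter()
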
Example #6
def test_union(docarray, document_factory):
    additional_docarray = DocumentArray([])
    for idx in range(4, 10):
        doc = document_factory.create(idx, f'test {idx}')
        additional_docarray.add(doc)
    union = docarray + additional_docarray
    for idx in range(0, 3):
        assert union[idx].id == docarray[idx].id
    for idx in range(0, 6):
        assert union[idx + 3].id == additional_docarray[idx].id
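
Example #6 relies on `docarray` and `document_factory` fixtures. Since the union test expects three pre-existing documents, a consistent sketch (ids and texts are assumptions) is:

import pytest
from jina import Document, DocumentArray  # import path varies by Jina version

@pytest.fixture
def document_factory():
    class DocumentFactory:
        def create(self, idx, text):
            return Document(id=str(idx), text=text)
    return DocumentFactory()

@pytest.fixture
def docarray(document_factory):
    return DocumentArray(
        [document_factory.create(i, f'test {i}') for i in range(3)])
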
Example #7
def test_batching_text_multi(stack, crafter):
    docs = DocumentArray(
        [Document(text=f'text-{i}', id=f'id-{i}') for i in range(15)])
    required_keys = ['text', 'id']
    text_ids, _ = docs.extract_docs(*required_keys, stack_contents=stack)

    crafted_docs = crafter.craft(*text_ids)

    for i, crafted_doc in enumerate(crafted_docs):
        assert crafted_doc['text'] == f'text-{i}-crafted'
        assert crafted_doc['id'] == f'id-{i}-crafted'
Example #8
def test_batching_mix_multi(stack, crafter):
    docs = DocumentArray([
        Document(text=f'text-{i}', embedding=np.array([i] * 5))
        for i in range(15)
    ])
    required_keys = ['text', 'embedding']
    text_ids, _ = docs.extract_docs(*required_keys, stack_contents=stack)

    crafted_docs = crafter.craft(*text_ids)

    for i, crafted_doc in enumerate(crafted_docs):
        assert crafted_doc['text'] == f'text-{i}-crafted'
        np.testing.assert_equal(crafted_doc['embedding'], np.array([i] * 5))
Example #9
def test_get_content_text_fields(stack, field):
    batch_size = 10

    kwargs = {field: 'text'}

    docs = DocumentArray([Document(**kwargs) for _ in range(batch_size)])

    contents, pts = docs.extract_docs(field, stack_contents=stack)
    if stack:
        assert isinstance(contents, np.ndarray)
        assert contents.shape == (batch_size,)
    assert len(contents) == batch_size
    for content in contents:
        assert content == 'text'
Example #10
def test_get_content_bytes_fields(stack, bytes_input, field):
    batch_size = 10

    kwargs = {field: bytes_input}

    docs = DocumentArray([Document(**kwargs) for _ in range(batch_size)])

    contents, pts = docs.extract_docs(field, stack_contents=stack)

    assert len(contents) == batch_size
    assert isinstance(contents, list)
    for content in contents:
        assert isinstance(content, bytes)
        assert content == bytes_input
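
`field` and `bytes_input` are supplied by parametrization that was stripped from these snippets; in the source they are almost certainly per-test `@pytest.mark.parametrize` decorations. A runnable illustration of the pattern for Example #10 (all values assumed; Example #9 would use text-valued fields such as 'id' instead):

import pytest

@pytest.mark.parametrize('field', ['buffer'])
@pytest.mark.parametrize('bytes_input', [b'hello bytes'])
def test_bytes_parametrization(field, bytes_input):
    assert isinstance(bytes_input, bytes)
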
Example #11
def test_batching_encode_text(encoder):
    docs = DocumentArray([Document(text=f'text-{i}') for i in range(15)])
    texts, _ = docs.all_contents

    embeds = encoder.encode(texts)

    assert embeds.shape == (15, 10)
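
The `encoder` fixture is defined elsewhere; the shape assertion implies one 10-dimensional embedding per document. A hypothetical stand-in that also satisfies the blob variant in Example #17:

import pytest
import numpy as np

@pytest.fixture
def encoder():
    class DummyEncoder:
        def encode(self, contents):
            # one 10-dim embedding per input item
            return np.random.random((len(contents), 10))
    return DummyEncoder()
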
Example #12
def test_as_blob_driver():
    docs = DocumentArray(random_docs(2))
    driver = MockPrediction2DocBlobDriver()
    driver._apply_all(docs)

    for d in docs:
        assert NdArray(d.blob).value.shape == (3, )
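
Several driver tests here call `random_docs`, a suite-level helper that is not shown. A rough, hypothetical stand-in (chunks are included because Examples #18 and #19 iterate over them; the real generator produces richer documents):

from jina import Document  # import path varies by Jina version

def random_docs(num_docs, num_chunks=2):
    for i in range(num_docs):
        doc = Document(id=str(i), text=f'doc-{i}')
        for j in range(num_chunks):
            doc.chunks.append(Document(id=f'{i}-{j}', text=f'chunk-{i}-{j}'))
        yield doc
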
Example #13
def test_batching_blob_multi(stack, crafter):
    docs = DocumentArray([
        Document(
            blob=np.array([[i] * 5, [i] * 5]),
            embedding=np.array([i] * 5),
        ) for i in range(15)
    ])
    required_keys = ['blob', 'embedding']
    text_ids, _ = docs.extract_docs(*required_keys, stack_contents=stack)

    crafted_docs = crafter.craft(*text_ids)

    for i, crafted_doc in enumerate(crafted_docs):
        np.testing.assert_equal(crafted_doc['blob'],
                                np.array([[i] * 5, [i] * 5]))
        np.testing.assert_equal(crafted_doc['embedding'], np.array([i] * 5))
Example #14
def test_segment_driver(segment_driver, text_segmenter_executor):
    valid_doc = Document()
    valid_doc.text = 'valid'
    valid_doc.mime_type = 'image/png'

    segment_driver.attach(executor=text_segmenter_executor, runtime=None)
    segment_driver._apply_all(DocumentArray([valid_doc]))

    assert valid_doc.chunks[0].tags['id'] == 3
    assert valid_doc.chunks[0].parent_id == valid_doc.id
    np.testing.assert_equal(valid_doc.chunks[0].blob, np.array([0.0, 0.0, 0.0]))
    assert valid_doc.chunks[0].weight == 0.0
    assert valid_doc.chunks[0].mime_type == 'text/plain'

    assert valid_doc.chunks[1].tags['id'] == 4
    assert valid_doc.chunks[1].parent_id == valid_doc.id
    np.testing.assert_equal(valid_doc.chunks[1].blob, np.array([1.0, 1.0, 1.0]))
    assert valid_doc.chunks[1].weight == 1.0
    assert valid_doc.chunks[1].mime_type == 'image/png'

    assert valid_doc.chunks[2].tags['id'] == 5
    assert valid_doc.chunks[2].parent_id == valid_doc.id
    np.testing.assert_equal(valid_doc.chunks[2].blob, np.array([2.0, 2.0, 2.0]))
    assert valid_doc.chunks[2].weight == 2.0
    assert valid_doc.chunks[2].mime_type == 'image/png'
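
`segment_driver` and `text_segmenter_executor` wrap a legacy Jina segment driver and segmenter executor, neither shown here. One plausible shape for the executor's output, mirroring the asserted values (tag ids 3 through 5, increasing blobs and weights, 'text/plain' for the first chunk only):

import numpy as np

class DummyTextSegmenter:
    def segment(self, text, *args, **kwargs):
        # three chunks per document
        return [
            {
                'tags': {'id': 3 + i},
                'blob': np.array([float(i)] * 3),
                'weight': float(i),
                'mime_type': 'text/plain' if i == 0 else 'image/png',
            }
            for i in range(3)
        ]
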
Example #15
def test_get_content(stack, num_rows, field):
    batch_size = 10
    embed_size = 20

    kwargs = {field: np.random.random((num_rows, embed_size))}

    docs = DocumentArray([Document(**kwargs) for _ in range(batch_size)])
    docs.append(Document())

    contents, pts = docs.extract_docs(field, stack_contents=stack)
    if stack:
        assert isinstance(contents, np.ndarray)
        assert contents.shape == (batch_size, num_rows, embed_size)
    else:
        assert len(contents) == batch_size
        for content in contents:
            assert content.shape == (num_rows, embed_size)
Example #16
def test_multi_label_predict_driver():
    docs = DocumentArray(random_docs(2))
    driver = MockMultiLabelPredictDriver(labels=['cat', 'dog', 'human'])
    driver._apply_all(docs)

    for d in docs:
        assert isinstance(d.tags['prediction'], ListValue)
        for t in d.tags['prediction']:
            assert t in {'cat', 'dog', 'human'}

    docs = DocumentArray(random_docs(2))
    driver = MockAllLabelPredictDriver(labels=['cat', 'dog', 'human'])
    driver._apply_all(docs)

    for d in docs:
        assert isinstance(d.tags['prediction'], ListValue)
        assert list(d.tags['prediction']) == ['cat', 'dog', 'human']
Example #17
def test_batching_encode_blob(encoder):
    docs = DocumentArray(
        [Document(blob=np.random.random((10, 20))) for _ in range(15)])
    blob, _ = docs.all_contents

    embeds = encoder.encode(blob)

    assert embeds.shape == (15, 10)
Example #18
def test_binary_predict_driver():
    docs = DocumentArray(random_docs(2))
    driver = MockBinaryPredictDriver()
    driver._apply_all(docs)

    for d in docs:
        assert d.tags['prediction'] in {'yes', 'no'}
        for c in d.chunks:
            assert c.tags['prediction'] in {'yes', 'no'}
Example #19
def test_one_hot_predict_driver():
    docs = DocumentArray(random_docs(2))
    driver = MockOneHotPredictDriver(labels=['cat', 'dog', 'human'])
    driver._apply_all(docs)

    for d in docs:
        assert d.tags['prediction'] in {'cat', 'dog', 'human'}
        for c in d.chunks:
            assert c.tags['prediction'] in {'cat', 'dog', 'human'}
Example #20
def test_doc_array_from_generator():
    NUM_DOCS = 100

    def generate():
        for _ in range(NUM_DOCS):
            yield Document()

    doc_array = DocumentArray(generate())
    assert len(doc_array) == NUM_DOCS
Example #21
def test_broken_document(segment_driver, text_segmenter_executor):
    segment_driver.attach(executor=text_segmenter_executor, runtime=None)

    invalid_doc = Document()
    invalid_doc.id = 1
    invalid_doc.text = 'invalid'

    with pytest.raises(AttributeError):
        segment_driver._apply_all([DocumentArray([invalid_doc])])
Example #22
def test_get_content_multiple_fields_text(stack, fields):
    batch_size = 10

    kwargs = {field: f'text-{field}' for field in fields}

    docs = DocumentArray([Document(**kwargs) for _ in range(batch_size)])

    contents, pts = docs.extract_docs(*fields, stack_contents=stack)

    assert len(contents) == len(fields)
    assert isinstance(contents, list)
    if stack:
        assert isinstance(contents[0], np.ndarray)
        assert isinstance(contents[1], np.ndarray)

    for content in contents:
        assert len(content) == batch_size
        if stack:
            assert content.shape == (batch_size,)
Example #23
def test_image_segmenter(segment_driver, image_segmenter_executor):
    blob1 = np.random.random((1, 32, 64))
    blob2 = np.random.random((1, 64, 32))
    docs = DocumentArray([Document(blob=blob1), Document(blob=blob2)])
    segment_driver.attach(executor=image_segmenter_executor, runtime=None)
    segment_driver._apply_all(docs)
    for doc in docs:
        assert len(doc.chunks) == 1
    np.testing.assert_equal(docs[0].chunks[0].blob, blob1)
    np.testing.assert_equal(docs[1].chunks[0].blob, blob2)
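
The `image_segmenter_executor` fixture only has to wrap each image blob in a single chunk to satisfy these assertions; a minimal hypothetical version:

class DummyImageSegmenter:
    def segment(self, blob, *args, **kwargs):
        # one chunk per document, carrying the original image blob unchanged
        return [{'blob': blob}]
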
Example #24
def docarray_with_scipy_sparse_embedding(docs):
    embedding = coo_matrix(
        (
            np.array([1, 2, 3, 4, 5, 6]),
            (np.array([0, 0, 0, 0, 0, 0]), np.array([0, 2, 2, 0, 1, 2])),
        ),
        shape=(1, 10),
    )
    for doc in docs:
        doc.embedding = embedding
    return DocumentArray(docs)
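
The helper above presupposes `from scipy.sparse import coo_matrix` and `import numpy as np`, neither of which is shown. A usage sketch (the three empty documents are for illustration only):

from scipy.sparse import coo_matrix
import numpy as np
from jina import Document, DocumentArray  # import path varies by Jina version

sparse_docs = docarray_with_scipy_sparse_embedding([Document() for _ in range(3)])
assert len(sparse_docs) == 3
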
Example #25
def test_get_content_multiple_fields_text_buffer(stack, bytes_input):
    batch_size = 10
    fields = ['id', 'buffer']
    kwargs = {'id': 'text', 'buffer': bytes_input}

    docs = DocumentArray([Document(**kwargs) for _ in range(batch_size)])

    contents, pts = docs.extract_docs(*fields, stack_contents=stack)

    assert len(contents) == len(fields)
    assert isinstance(contents, list)
    assert len(contents[0]) == batch_size
    if stack:
        assert isinstance(contents[0], np.ndarray)
        assert contents[0].shape == (batch_size,)
    assert isinstance(contents[1], list)
    assert isinstance(contents[1][0], bytes)

    for content in contents:
        assert len(content) == batch_size
Example #26
def test_array_get_from_slice_success(docs, document_factory):
    docarray = DocumentArray(docs)
    assert len(docarray[:1]) == 1
    assert len(docarray[:2]) == 2
    assert len(docarray[:3]) == 3
    assert len(docarray[:100]) == 3

    assert len(docarray[1:]) == 2
    assert len(docarray[2:]) == 1
    assert len(docarray[3:]) == 0
    assert len(docarray[100:]) == 0
Example #27
def test_match2docranker_batching(ranker):
    NUM_DOCS_QUERIES = 15
    NUM_MATCHES = 10

    old_matches_scores = []
    queries_metas = []
    matches_metas = []
    queries = DocumentArray([])
    for i in range(NUM_DOCS_QUERIES):
        old_match_scores = []
        match_metas = []
        query = Document(id=f'query-{i}')
        for j in range(NUM_MATCHES):
            m = Document(id=f'match-{i}-{j}', tags={'dummy_score': j})
            query.matches.append(m)
            old_match_scores.append(0)
            match_metas.append(m.get_attrs('tags__dummy_score'))
        queries.append(query)
        old_matches_scores.append(old_match_scores)
        queries_metas.append(None)
        matches_metas.append(match_metas)

    queries_scores = ranker.score(old_matches_scores, queries_metas,
                                  matches_metas)
    assert len(queries_scores) == NUM_DOCS_QUERIES

    for i, (query, matches_scores) in enumerate(zip(queries, queries_scores)):
        assert len(matches_scores) == NUM_MATCHES
        for j, (match, score) in enumerate(zip(query.matches, matches_scores)):
            match.score = NamedScore(value=j)
            assert score == j

        query.matches.sort(key=lambda x: x.score.value, reverse=True)

        for j, match in enumerate(query.matches, 1):
            assert match.id == f'match-{i}-{NUM_MATCHES - j}'
            assert match.score.value == NUM_MATCHES - j
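
The `ranker` fixture driving this test (and the Flow variant in Example #2) is a Match2DocRanker-style executor defined elsewhere. A hypothetical stand-in reproducing the asserted scores, assuming `get_attrs('tags__dummy_score')` returns a dict keyed by that attribute name (the Flow test would additionally need a real executor rather than a plain object):

import pytest

@pytest.fixture
def ranker():
    class DummyRanker:
        def score(self, old_matches_scores, queries_metas, matches_metas):
            # new score for each match = its 'dummy_score' tag
            return [
                [meta['tags__dummy_score'] for meta in match_metas]
                for match_metas in matches_metas
            ]
    return DummyRanker()
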
Example #28
def test_collect_matches2doc_ranker_driver_mock_ranker():
    doc = create_document_to_score_same_depth_level()
    driver = SimpleCollectMatchesRankDriver(docs=DocumentArray([doc]))
    executor = MockLengthRanker()
    driver.attach(executor=executor, runtime=None)
    driver()
    dm = list(doc.matches)
    assert len(dm) == 2
    assert dm[0].id == '20'
    assert dm[0].score.value == 3.0
    assert dm[1].id == '30'
    assert dm[1].score.value == 2.0
    for match in dm:
        # match score is computed w.r.t. doc.id
        assert match.score.ref_id == doc.id
Example #29
def test_batching_text_one_argument_flow(crafter, mocker):
    NUM_DOCS = 15

    def validate_response(resp):
        assert len(resp.index.docs) == NUM_DOCS
        for i, doc in enumerate(resp.index.docs):
            assert doc.text == f'text-{i}-crafted'

    docs = DocumentArray([Document(text=f'text-{i}') for i in range(NUM_DOCS)])
    mock = mocker.Mock()

    with Flow().add(name='crafter', uses=crafter) as f:
        f.index(inputs=docs, on_done=mock)

    mock.assert_called_once()
    validate_callback(mock, validate_response)
Example #30
def test_chunks_exist_already(segment_driver, text_segmenter_executor):
    document = Document(
        text='valid', chunks=[Document(text='test2'),
                              Document(text='test3')])
    # before segmentation
    assert len(document.chunks) == 2
    for chunk in document.chunks:
        assert chunk.parent_id == document.id
        assert chunk.siblings == 2
    segment_driver.attach(executor=text_segmenter_executor, runtime=None)
    segment_driver._apply_all(DocumentArray([document]))

    # after segmentation
    assert len(document.chunks) == 5
    for chunk in document.chunks:
        assert chunk.parent_id == document.id
        assert chunk.siblings == 5