Example #1
def build_document(chunk=None):
    d = Document()
    d.chunks.append(chunk)
    d.chunks[0].update_content_hash(
        exclude_fields=('parent_id', 'id', 'content_hash'))
    d.chunks[0].parent_id = 0
    d.update_content_hash(include_fields=('chunks',), exclude_fields=None)
    return d
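The example above hashes the chunk while excluding 'parent_id', so a chunk's hash should not change with its parent. A minimal sketch of that property, assuming the same Document API as in the snippet (the function name and the hex ids are hypothetical):

def demo_parent_id_excluded_from_hash():
    c1, c2 = Document(), Document()
    c1.text = c2.text = 'same chunk'
    # give the two chunks different (valid, 16-char hex) parent ids
    for c, pid in ((c1, 'a' * 16), (c2, 'b' * 16)):
        c.parent_id = pid
        c.update_content_hash(exclude_fields=('parent_id', 'id', 'content_hash'))
    # since 'parent_id' is excluded, the hashes should coincide
    assert c1.content_hash == c2.content_hash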
Example #2
def test_cache_content_driver_same_content(tmpdir):
    doc1 = Document(id=1)
    doc1.text = 'blabla'
    doc1.update_content_hash()
    docs1 = DocumentSet([doc1])

    doc2 = Document(id=2)
    doc2.text = 'blabla'
    doc2.update_content_hash()
    docs2 = DocumentSet([doc2])
    assert doc1.content_hash == doc2.content_hash

    driver = MockBaseCacheDriver()
    filename = None

    with DocIDCache(tmpdir, field=CONTENT_HASH_KEY) as executor:
        driver.attach(executor=executor, runtime=None)
        driver._traverse_apply(docs1)

        with pytest.raises(NotImplementedError):
            driver._traverse_apply(docs2)

        assert executor.size == 1
        filename = executor.save_abspath

    # update
    old_doc = Document(id=9999)
    old_doc.text = 'blabla'
    old_doc.update_content_hash()

    new_string = 'blabla-new'
    doc1.text = new_string
    doc1.update_content_hash()
    with BaseExecutor.load(filename) as executor:
        executor.update([UniqueId(1)], [doc1.content_hash])

    with BaseExecutor.load(filename) as executor:
        assert executor.query(doc1.content_hash) is True
        assert executor.query(old_doc.content_hash) is None

    # delete
    with BaseExecutor.load(filename) as executor:
        executor.delete([UniqueId(doc1.id)])

    with BaseExecutor.load(filename) as executor:
        assert executor.query(doc1.content_hash) is None
Example #3
def test_copy_construct():
    a = DocumentProto()
    b = Document(a, copy=True)
    a.id = '1' * 16
    assert b.id != '1' * 16

    b.id = '2' * 16
    assert a.id == '1' * 16
Example #4
def test_include_scalar():
    d1 = Document()
    d1.text = 'hello'
    dd1 = Document()
    d1.chunks.append(dd1)
    d1.update_content_hash(include_fields=('text', ), exclude_fields=None)

    d2 = Document()
    d2.text = 'hello'
    d2.update_content_hash(include_fields=('text', ), exclude_fields=None)

    assert d1.content_hash == d2.content_hash

    # change text should result in diff hash
    d2.text = 'world'
    d2.update_content_hash(include_fields=('text', ), exclude_fields=None)
    assert d1.content_hash != d2.content_hash
Example #5
def test_bad_good_doc_id():
    b = Document()
    with pytest.raises(BadDocID):
        b.id = 'hello'
    b.id = 'abcd' * 4
    b.id = 'de09' * 4
    b.id = 'af54' * 4
    b.id = 'abcdef0123456789'
Example #6
def test_cache_content_driver_same_content(tmpdir, test_metas):
    doc1 = Document(id='1')
    doc1.text = 'blabla'
    doc1.update_content_hash()
    docs1 = DocumentArray([doc1])

    doc2 = Document(id='2')
    doc2.text = 'blabla'
    doc2.update_content_hash()
    docs2 = DocumentArray([doc2])
    assert doc1.content_hash == doc2.content_hash

    driver = MockBaseCacheDriver()

    with DocCache(tmpdir, metas=test_metas,
                  fields=(CONTENT_HASH_KEY, )) as executor:
        driver.attach(executor=executor, runtime=None)
        driver._apply_all(docs1)

        with pytest.raises(NotImplementedError):
            driver._apply_all(docs2)

        assert executor.size == 1
        filename = executor.save_abspath

    # update
    old_doc = Document(id=9999)
    old_doc.text = 'blabla'
    old_doc.update_content_hash()

    new_string = 'blabla-new'
    doc1.text = new_string
    doc1.update_content_hash()
    with BaseExecutor.load(filename) as executor:
        executor.update(['1'], [doc1.content_hash])

    with BaseExecutor.load(filename) as executor:
        assert executor.query(doc1.content_hash) is True
        assert executor.query(old_doc.content_hash) is False

    # delete
    with BaseExecutor.load(filename) as executor:
        executor.delete([doc1.id])

    with BaseExecutor.load(filename) as executor:
        assert executor.query(doc1.content_hash) is False
Example #7
def updated_documents():
    docs = []
    for idx in range(3):
        with Document(text=f'updated_{idx}') as d:
            d.id = f'{idx:0>16}'
            d.embedding = np.random.random([10])
            docs.append(d)
    return DocumentSet(docs)
Example #8
def test_uri_get_set():
    a = Document()
    a.uri = 'https://abc.com/a.jpg'
    assert a.uri == 'https://abc.com/a.jpg'
    assert a.mime_type == 'image/jpeg'

    with pytest.raises(ValueError):
        a.uri = 'abcdefg'
Example #9
def test_document_sparse_attributes_scipy(scipy_sparse_matrix):
    d = Document()
    d.embedding = scipy_sparse_matrix
    d.blob = scipy_sparse_matrix
    np.testing.assert_array_equal(d.embedding.todense(),
                                  scipy_sparse_matrix.todense())
    np.testing.assert_array_equal(d.blob.todense(),
                                  scipy_sparse_matrix.todense())
Example #10
def test_document_pretty_dict():
    doc = Document(
        blob=np.array([[0, 1, 2], [2, 1, 0]]),
        embedding=np.array([1.0, 2.0, 3.0]),
        tags={'hello': 'world'},
    )
    chunk = Document(doc, copy=True)
    chunk.blob = np.array([[3, 4, 5], [5, 4, 3]])
    chunk.embedding = np.array([4.0, 5.0, 6.0])
    match = Document(doc, copy=True)
    match.blob = np.array([[6, 7, 8], [8, 7, 6]])
    match.embedding = np.array([7.0, 8.0, 9.0])
    doc.chunks.append(chunk)
    doc.matches.append(match)
    assert doc.tags == {'hello': 'world'}
    assert doc.blob.tolist() == [[0, 1, 2], [2, 1, 0]]
    assert doc.embedding.tolist() == [1.0, 2.0, 3.0]
    assert doc.chunks[0].tags == {'hello': 'world'}
    assert doc.chunks[0].blob.tolist() == [[3, 4, 5], [5, 4, 3]]
    assert doc.chunks[0].embedding.tolist() == [4.0, 5.0, 6.0]
    assert doc.matches[0].tags == {'hello': 'world'}
    assert doc.matches[0].blob.tolist() == [[6, 7, 8], [8, 7, 6]]
    assert doc.matches[0].embedding.tolist() == [7.0, 8.0, 9.0]

    d = doc.dict(prettify_ndarrays=True)
    assert d['blob'] == [[0, 1, 2], [2, 1, 0]]
    assert d['embedding'] == [1.0, 2.0, 3.0]
    assert d['tags'] == {'hello': 'world'}
    assert d['chunks'][0]['blob'] == [[3, 4, 5], [5, 4, 3]]
    assert d['chunks'][0]['embedding'] == [4.0, 5.0, 6.0]
    assert d['chunks'][0]['tags'] == {'hello': 'world'}
    assert d['matches'][0]['blob'] == [[6, 7, 8], [8, 7, 6]]
    assert d['matches'][0]['embedding'] == [7.0, 8.0, 9.0]
    assert d['matches'][0]['tags'] == {'hello': 'world'}

    d_reconstructed = Document(d)
    assert d_reconstructed.tags == {'hello': 'world'}
    assert d_reconstructed.blob.tolist() == [[0, 1, 2], [2, 1, 0]]
    assert d_reconstructed.embedding.tolist() == [1.0, 2.0, 3.0]
    assert d_reconstructed.chunks[0].tags == {'hello': 'world'}
    assert d_reconstructed.chunks[0].blob.tolist() == [[3, 4, 5], [5, 4, 3]]
    assert d_reconstructed.chunks[0].embedding.tolist() == [4.0, 5.0, 6.0]
    assert d_reconstructed.matches[0].tags == {'hello': 'world'}
    assert d_reconstructed.matches[0].blob.tolist() == [[6, 7, 8], [8, 7, 6]]
    assert d_reconstructed.matches[0].embedding.tolist() == [7.0, 8.0, 9.0]
Example #11
def test_doc_from_dict_cases(d_src, from_str):
    # regular case
    if from_str:
        d_src = json.dumps(d_src)
    d = Document(d_src)
    assert d.tags['hello'] == 'world'
    assert d.mime_type == 'txt'
    assert d.id == '123'
    assert d.parent_id == '456'
Example #12
def test_tag_compare_dict():
    d = Document()
    d.tags = {'hey': {'bye': 4}}
    assert d.tags == {'hey': {'bye': 4}}
    assert d.tags.dict() == {'hey': {'bye': 4}}

    d.tags = {'hey': [1, 2]}
    assert d.tags == {'hey': [1, 2]}
    assert d.tags.dict() == {'hey': [1, 2]}
Example #13
def test_doc_field_resolver(from_str):
    d_src = {'music_id': '123', 'hello': 'world', 'tags': {'good': 'bye'}}
    if from_str:
        d_src = json.dumps(d_src)
    d = Document(d_src)
    assert d.id != '123'
    assert d.tags['hello'] == 'world'
    assert d.tags['good'] == 'bye'
    assert d.tags['music_id'] == '123'

    d_src = {'music_id': '123', 'hello': 'world', 'tags': {'good': 'bye'}}
    if from_str:
        d_src = json.dumps(d_src)
    d = Document(d_src, field_resolver={'music_id': 'id'})
    assert d.id == '123'
    assert d.tags['hello'] == 'world'
    assert d.tags['good'] == 'bye'
    assert 'music_id' not in d.tags
Example #14
def test_document_sparse_attributes_pytorch(torch_sparse_matrix):
    d = Document()
    d.embedding = torch_sparse_matrix
    d.blob = torch_sparse_matrix

    np.testing.assert_array_equal(d.embedding.todense(),
                                  torch_sparse_matrix.to_dense().numpy())
    np.testing.assert_array_equal(d.blob.todense(),
                                  torch_sparse_matrix.to_dense().numpy())
Example #15
def test_doc_plot(tmpdir):
    docs = [
        Document(
            id='🐲',
            embedding=np.array([0, 0]),
            tags={
                'guardian': 'Azure Dragon',
                'position': 'East'
            },
        ),
        Document(
            id='🐦',
            embedding=np.array([1, 0]),
            tags={
                'guardian': 'Vermilion Bird',
                'position': 'South'
            },
        ),
        Document(
            id='🐢',
            embedding=np.array([0, 1]),
            tags={
                'guardian': 'Black Tortoise',
                'position': 'North'
            },
        ),
        Document(
            id='🐯',
            embedding=np.array([1, 1]),
            tags={
                'guardian': 'White Tiger',
                'position': 'West'
            },
        ),
    ]

    docs[0].chunks.append(docs[1])
    docs[0].chunks[0].chunks.append(docs[2])
    docs[0].matches.append(docs[3])

    assert docs[0]._mermaid_to_url('svg')
    docs[0].plot(inline_display=True, output=os.path.join(tmpdir, 'doc.svg'))
    assert os.path.exists(os.path.join(tmpdir, 'doc.svg'))
    docs[0].plot()
Example #16
def test_sparse_get_set():
    d = Document()
    assert d.content is None
    mat1 = coo_matrix(np.array([1, 2, 3]))
    d.content = mat1
    assert (d.content != mat1).nnz == 0
    mat2 = coo_matrix(np.array([3, 2, 1]))
    assert (d.content != mat2).nnz != 0
    d.blob = mat2
    assert (d.content != mat2).nnz == 0
Example #17
def test_uri_get_set():
    a = Document()
    a.uri = 'https://abc.com/a.jpg'
    assert a.uri == 'https://abc.com/a.jpg'
    assert a.mime_type == 'image/jpeg'
    a.uri = 'abcdefg'
    assert a.uri == 'abcdefg'
    a.content = 'abcdefg'
    assert a.text == 'abcdefg'
    assert not a.uri
Example #18
def test_cache_content_driver_same_id(tmp_path, test_metas):
    filename = os.path.join(tmp_path, 'DocCache.bin')
    doc1 = Document(id=1)
    doc1.text = 'blabla'
    doc1.update_content_hash()
    docs1 = DocumentSet([doc1])

    doc2 = Document(id=1)
    doc2.text = 'blabla2'
    doc2.update_content_hash()
    docs2 = DocumentSet([doc2])

    driver = MockBaseCacheDriver()

    with DocCache(filename, metas=test_metas, fields=(CONTENT_HASH_KEY,)) as executor:
        driver.attach(executor=executor, runtime=None)
        driver._apply_all(docs1)
        driver._apply_all(docs2)
        assert executor.size == 2
Example #19
def test_pb_obj2dict():
    document = Document()
    with document:
        document.text = 'this is text'
        document.tags['id'] = 'id in tags'
        document.tags['inner_dict'] = {'id': 'id in inner_dict'}
        with Document() as chunk:
            chunk.text = 'text in chunk'
            chunk.tags['id'] = 'id in chunk tags'
        document.chunks.add(chunk)
    res = document.get_attrs('text', 'tags', 'chunks')
    assert res['text'] == 'this is text'
    assert res['tags']['id'] == 'id in tags'
    assert res['tags']['inner_dict']['id'] == 'id in inner_dict'
    rcs = list(res['chunks'])
    assert len(rcs) == 1
    assert isinstance(rcs[0], Document)
    assert rcs[0].text == 'text in chunk'
    assert rcs[0].tags['id'] == 'id in chunk tags'
Example #20
def test_document_sparse_attributes_tensorflow(tf_sparse_matrix):
    import tensorflow as tf

    d = Document()
    d.embedding = tf_sparse_matrix
    d.blob = tf_sparse_matrix
    np.testing.assert_array_equal(d.embedding.todense(),
                                  tf.sparse.to_dense(tf_sparse_matrix))
    np.testing.assert_array_equal(d.blob.todense(),
                                  tf.sparse.to_dense(tf_sparse_matrix))
Example #21
def test_doc_content():
    d = Document()
    assert d.content is None
    d.text = 'abc'
    assert d.content == 'abc'
    c = np.random.random([10, 10])
    d.blob = c
    np.testing.assert_equal(d.content, c)
    d.buffer = b'123'
    assert d.buffer == b'123'
Example #22
def test_cache_content_driver_same_id(tmp_path, test_metas):
    filename = tmp_path / 'docidcache.bin'
    doc1 = Document(id=1)
    doc1.text = 'blabla'
    doc1.update_content_hash()
    docs1 = DocumentSet([doc1])

    doc2 = Document(id=1)
    doc2.text = 'blabla2'
    doc2.update_content_hash()
    docs2 = DocumentSet([doc2])

    driver = MockBaseCacheDriver()

    with DocIDCache(filename, metas=test_metas, field=CONTENT_HASH_KEY) as executor:
        driver.attach(executor=executor, runtime=None)
        driver._traverse_apply(docs1)
        driver._traverse_apply(docs2)
        assert executor.size == 2
Example #23
def test_doc_update_fields():
    a = Document()
    b = np.random.random([10, 10])
    c = {'tags': 'string', 'tag-tag': {'tags': 123.45}}
    d = [12, 34, 56]
    e = 'text-mod'
    a.update(embedding=b, tags=c, location=d, modality=e)
    np.testing.assert_equal(a.embedding, b)
    assert list(a.location) == d
    assert a.modality == e
    assert MessageToDict(a.tags) == c
Example #24
def test_doc_score():
    from jina.types.score import NamedScore

    doc = Document(text='text')

    score = NamedScore(op_name='operation', value=10.0, ref_id=doc.id)
    doc.score = score

    assert doc.score.op_name == 'operation'
    assert doc.score.value == 10.0
    assert doc.score.ref_id == doc.id
Example #25
def test_doc_plot():
    docs = [
        Document(
            id='🐲',
            embedding=np.array([0, 0]),
            tags={
                'guardian': 'Azure Dragon',
                'position': 'East'
            },
        ),
        Document(
            id='🐦',
            embedding=np.array([1, 0]),
            tags={
                'guardian': 'Vermilion Bird',
                'position': 'South'
            },
        ),
        Document(
            id='🐢',
            embedding=np.array([0, 1]),
            tags={
                'guardian': 'Black Tortoise',
                'position': 'North'
            },
        ),
        Document(
            id='🐯',
            embedding=np.array([1, 1]),
            tags={
                'guardian': 'White Tiger',
                'position': 'West'
            },
        ),
    ]

    docs[0].chunks.append(docs[1])
    docs[0].chunks[0].chunks.append(docs[2])
    docs[0].matches.append(docs[3])

    assert docs[0]._mermaid_to_url('svg')
Example #26
def random_docs(num_docs,
                chunks_per_doc=5,
                embed_dim=10,
                jitter=1) -> Iterator['Document']:
    c_id = 3 * num_docs  # avoid collision with docs
    for j in range(num_docs):
        with Document() as d:
            d.tags['id'] = j
            d.text = 'hello world'
            d.embedding = np.random.random(
                [embed_dim + np.random.randint(0, jitter)])
        for k in range(chunks_per_doc):
            with Document() as c:
                c.text = 'i\'m chunk %d from doc %d' % (c_id, j)
                c.embedding = np.random.random(
                    [embed_dim + np.random.randint(0, jitter)])
                c.tags['id'] = c_id
                c.tags['parent_id'] = j
                c_id += 1
            d.chunks.append(c)
        yield d
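A hedged usage sketch for the generator above (the demo function name is hypothetical; assumes numpy is imported as np, as the snippet requires):

def demo_random_docs():
    docs = list(random_docs(num_docs=2, chunks_per_doc=3, jitter=1))
    assert len(docs) == 2
    # every doc should carry the requested number of chunks
    assert all(len(d.chunks) == 3 for d in docs)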
Example #27
def test_siblings_needs_to_be_set_manually():
    document = Document()
    with document:
        document.text = 'this is text'
        for i in range(3):
            chunk = Document()
            chunk.text = 'text in chunk'
            document.chunks.append(chunk)
    for i in range(3):
        assert document.chunks[i].siblings == 0

    document = Document()
    with document:
        document.text = 'this is text'
        for i in range(3):
            chunk = Document()
            chunk.text = 'text in chunk'
            chunk.siblings = 3
            document.chunks.append(chunk)
    for i in range(3):
        assert document.chunks[i].siblings == 3
Example #28
def documents():
    docs = []
    # doc: 1
    # doc: 2
    # doc: 3
    # doc: 4
    # doc: 5
    for idx in range(5):
        with Document(text=str(idx + 1)) as d:
            docs.append(d)

    return docs
Example #29
def test_content_hash_not_dependent_on_chunks_or_matches():
    doc1 = Document()
    doc1.content = 'one'
    doc1.update_content_hash()

    doc2 = Document()
    doc2.content = 'one'
    doc2.update_content_hash()
    assert doc1.content_hash == doc2.content_hash

    doc3 = Document()
    doc3.content = 'one'
    for _ in range(3):
        with Document() as m:
            m.content = 'some chunk'
        doc3.chunks.append(m)
    doc3.update_content_hash()
    assert doc1.content_hash == doc3.content_hash

    doc4 = Document()
    doc4.content = 'one'
    for _ in range(3):
        with Document() as m:
            m.content = 'some match'
        doc4.matches.append(m)
    doc4.update_content_hash()
    assert doc1.content_hash == doc4.content_hash
Example #30
class JinaRequestModel(BaseModel):
    """
    Jina request model.

    The base model for a Jina REST request.
    """

    # To avoid an error while loading the request model schema on swagger, we've added an example.
    data: Union[
        List[PROTO_TO_PYDANTIC_MODELS.DocumentProto],
        List[Dict[str, Any]],
        List[str],
        List[bytes],
    ] = Field(..., example=[Document().dict()])
    request_size: Optional[int] = DEFAULT_REQUEST_SIZE
    mime_type: Optional[str] = ''
    queryset: Optional[List[PROTO_TO_PYDANTIC_MODELS.QueryLangProto]] = None
    data_type: DataInputType = DataInputType.AUTO

    @root_validator(pre=True, allow_reuse=True)
    def add_default_kwargs(cls, kwargs: dict):
        """
        Replicates jina.clients.base.BaseClient.add_default_kwargs for Pydantic

        :param kwargs: arguments passed to the Pydantic model
        :type kwargs: dict
        :return: kwargs
        """
        if ('top_k' in kwargs) and (kwargs['top_k'] is not None):
            # associate all VectorSearchDriver and SliceQL driver to use top_k
            topk_ql = [
                PROTO_TO_PYDANTIC_MODELS.QueryLangProto(
                    **{
                        'name': 'SliceQL',
                        'priority': 1,
                        'parameters': {
                            'end': kwargs['top_k']
                        },
                    }),
                PROTO_TO_PYDANTIC_MODELS.QueryLangProto(
                    **{
                        'name': 'VectorSearchDriver',
                        'priority': 1,
                        'parameters': {
                            'top_k': kwargs['top_k']
                        },
                    }),
            ]
            if 'queryset' not in kwargs:
                kwargs['queryset'] = topk_ql
            else:
                kwargs['queryset'].extend(topk_ql)

        return kwargs
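To illustrate the validator above, a minimal usage sketch (hypothetical; assumes pydantic v1 semantics, where an unknown field such as top_k is simply dropped after the pre-validator has seen it):

# passing top_k should make add_default_kwargs attach both query languages
req = JinaRequestModel(data=['hello world'], top_k=5)
assert req.queryset is not None
assert {q.name for q in req.queryset} == {'SliceQL', 'VectorSearchDriver'}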