def test_buffer_dam_add_or_update(tmpdir):
    """Verify add_or_update semantics of the memmap buffer pool: update in
    place, append when absent, LRU-evict when full, reuse freed slots."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=6)
    docs = list(random_docs(8))
    dam.extend(docs[:5])

    first = docs[0]
    first.content = 'new'
    # updating an id that is already buffered must not grow the buffer
    dam.buffer_pool.add_or_update(first.id, first)
    assert dam[0].content == first.content
    assert len(dam.buffer_pool.buffer) == 5

    # an unknown id is appended to the buffer
    dam.buffer_pool.add_or_update(docs[5].id, docs[5])
    assert len(dam.buffer_pool.buffer) == 6

    # buffer is now full: the least-recently-used entry is evicted
    # (docs[1] goes, since docs[0] was touched above)
    dam.buffer_pool.add_or_update(docs[6].id, docs[6])
    assert docs[6].id in dam.buffer_pool
    assert docs[1].id not in dam.buffer_pool

    # deleting an entry frees its slot, which the next insertion reuses
    del dam.buffer_pool[docs[4].id]
    assert 4 in dam.buffer_pool._empty
    dam.buffer_pool.add_or_update(docs[7].id, docs[7])
    assert dam.buffer_pool.doc_map[docs[7].id][0] == 4
class MyIndexer(Executor):
    """Simple indexer that stores documents in a memmapped array and serves
    exact nearest-neighbour search (cosine) over them."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # persisted under the executor workspace so the index survives restarts
        self._docs = DocumentArrayMemmap(self.workspace + '/indexer')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Store the incoming documents in the index.

        :param docs: documents to add to the index
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        """Append the single best match to each document in docs.

        :param docs: documents that are searched
        :param kwargs: other keyword arguments
        """
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=1,
        )
def test_setter_wrong_len(tmpdir):
    """Assigning fewer tag dicts than there are docs must raise."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    # one tag dict for 100 documents -> length mismatch
    with pytest.raises(ValueError, match='the number of tags in the'):
        dam.tags = [{'1': 2}]
def test_blobs_wrong_len(tmpdir):
    """Assigning fewer blobs than there are docs must raise."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    # only 2 blobs for 100 documents -> length mismatch
    wrong_blobs = np.ones((2, 10, 10))
    with pytest.raises(ValueError):
        dam.blobs = wrong_blobs
def test_texts_wrong_len(tmpdir):
    """Assigning fewer texts than there are docs must raise."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    # one text for 100 documents -> length mismatch
    with pytest.raises(ValueError):
        dam.texts = ['hello']
class MyIndexer(Executor):
    """Executor with basic exact search using cosine distance."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # index is memmapped inside the executor workspace
        self._docs = DocumentArrayMemmap(self.workspace + '/indexer')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Extend self._docs

        :param docs: DocumentArray containing Documents
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on=['/search', '/eval'])
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Append best matches to each document in docs

        :param docs: documents that are searched
        :param parameters: dictionary of pairs (parameter,value)
        :param kwargs: other keyword arguments
        """
        top_k = int(parameters['top_k'])
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=top_k,
        )
def test_scipy_dist(
    docarrays_for_embedding_distance_computation,
    normalization,
    metric,
    tmpdir,
    only_id,
):
    """Match scores computed against a DocumentArray and against a
    DocumentArrayMemmap (scipy backend) must be identical."""
    da1, da2 = docarrays_for_embedding_distance_computation
    da1_copy = copy.deepcopy(da1)
    da2_copy = copy.deepcopy(da2)

    # reference: in-memory DocumentArray
    da1.match(da2, metric=metric, limit=3, normalization=normalization, use_scipy=True)
    values_docarray = [m.scores[metric].value for d in da1 for m in d.matches]

    # same query against a memmap-backed copy of the index
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(da2_copy)
    da1_copy.match(
        dam,
        metric=metric,
        limit=3,
        normalization=normalization,
        use_scipy=True,
        only_id=only_id,
    )
    values_memmap = [m.scores[metric].value for d in da1_copy for m in d.matches]

    np.testing.assert_equal(values_docarray, values_memmap)
def test_match_handle_different_limit(get_two_docarray, limit, tmpdir):
    """`limit=None` and `limit=-1` mean "all matches"; otherwise exactly
    `limit` matches are attached."""
    da1, da2 = get_two_docarray
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(da2)

    da1.match(dam, limit=limit)

    if limit in (None, -1):
        expected_length = len(da2)
    else:
        expected_length = limit
    assert len(da1[0].matches) == expected_length
def test_embeddings_wrong_len(tmpdir):
    """Assigning fewer embedding rows than there are docs must raise."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])
    # only 2 rows for 100 documents -> length mismatch
    wrong_embeddings = np.ones((2, 10, 10))
    with pytest.raises(ValueError, match='the number of rows in the'):
        dam.embeddings = wrong_embeddings
def test_texts_getter_dam(tmpdir):
    """`.texts` must agree with `get_attributes('text')`."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document(text='hello') for _ in range(100)])

    via_property = dam.texts
    via_attributes = dam.get_attributes('text')

    assert len(via_property) == 100
    assert via_property == via_attributes
def test_blobs_setter_dam(tmpdir):
    """Bulk-assigned blobs must round-trip through the memmap, both via the
    `.blobs` getter and per-document `.blob`."""
    blobs = np.random.random((100, 10, 10))
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in blobs])

    dam.blobs = blobs

    np.testing.assert_almost_equal(dam.blobs, blobs)
    for expected, doc in zip(blobs, dam):
        np.testing.assert_almost_equal(expected, doc.blob)
def test_buffer_dam_clear(tmpdir):
    """After clearing the buffer pool no previously extended doc remains in it."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=5)
    docs = list(random_docs(5))
    dam.extend(docs)

    dam.buffer_pool.clear()

    assert all(doc.id not in dam.buffer_pool for doc in docs)
def test_traverse(tmpdir, mocker):
    """Traversing the 'c' (chunk) path yields granularity-1 docs; the mock
    guarantees the loop body actually ran at least once."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(random_docs(100))

    visited = mocker.Mock()
    for chunk in dam.traverse_flat(['c']):
        assert chunk.granularity == 1
        visited()
    visited.assert_called()
def test_embeddings_setter_dam(tmpdir):
    """Bulk-assigned embeddings must round-trip through the memmap, both via
    the `.embeddings` getter and per-document `.embedding`."""
    embeddings = np.random.random((100, 128))
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])

    dam.embeddings = embeddings

    np.testing.assert_almost_equal(dam.embeddings, embeddings)
    for expected, doc in zip(embeddings, dam):
        np.testing.assert_almost_equal(expected, doc.embedding)
def test_tags_setter_dam(tmpdir):
    """Bulk-assigned tags must round-trip, both via the `.tags` getter and
    per-document `.tags`."""
    dam = DocumentArrayMemmap(tmpdir)
    all_tags = [{'a': 2, 'c': 'd'} for _ in range(100)]
    dam.extend([Document() for _ in range(100)])

    dam.tags = all_tags

    assert dam.tags == all_tags
    for expected, doc in zip(all_tags, dam):
        assert expected == doc.tags
def test_memmap_update_in_memory(tmpdir):
    """With a buffer pool large enough to hold every doc, mutating the
    original Document objects is reflected when reading back from the dam."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=100)
    originals = list(random_docs(100))
    dam.extend(originals)

    for idx, original in enumerate(originals):
        original.content = f'new content {idx}'

    for idx, stored in enumerate(dam):
        assert stored.content == f'new content {idx}'
def test_texts_setter_dam(tmpdir):
    """Bulk-assigned texts must round-trip, both via the `.texts` getter and
    per-document `.text`."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend([Document() for _ in range(100)])

    all_texts = ['text' for _ in range(100)]
    dam.texts = all_texts

    assert dam.texts == all_texts
    for expected, doc in zip(all_texts, dam):
        assert expected == doc.text
def test_memmap_update_document(tmpdir):
    """Explicitly reassigning `dam[idx]` persists the updated content."""
    dam = DocumentArrayMemmap(tmpdir)
    originals = list(random_docs(100))
    dam.extend(originals)

    for idx, original in enumerate(originals):
        original.content = f'new content {idx}'
        dam[idx] = original

    for idx, stored in enumerate(dam):
        assert stored.content == f'new content {idx}'
def test_memmap_buffer_synched(tmpdir):
    """Setting `dam[i] = doc` keeps the buffer pool in sync, and later
    in-place mutation of `doc` is visible when reading back by id."""
    docs = list(random_docs(100))
    dam = DocumentArrayMemmap(tmpdir)
    dam.extend(docs[:50])

    for i, replacement in enumerate(docs[50:]):
        dam[i] = replacement
        # buffer pool holds the same doc that the dam serves at index i
        assert dam._buffer_pool[replacement.id].id == dam[i].id
        replacement.content = 'new'
        assert dam[replacement.id].content == 'new'
def test_error(tmpdir):
    """On an empty dam: string keys raise KeyError, int indices IndexError,
    for both read and delete."""
    dam = DocumentArrayMemmap(tmpdir)
    dam.clear()

    with pytest.raises(KeyError):
        dam['12']
    with pytest.raises(IndexError):
        dam[1]
    with pytest.raises(IndexError):
        del dam[1]
    with pytest.raises(KeyError):
        del dam['12']
def test_shuffle(tmpdir):
    """`shuffle` returns a DocumentArray that is a permutation of the dam:
    same ids, same length, different order."""
    da = DocumentArrayMemmap(tmpdir)
    da.extend(list(random_docs(100)))

    shuffled = da.shuffle()

    assert isinstance(shuffled, DocumentArray)
    assert len(shuffled) == len(da)

    original_ids = [d.id for d in da]
    shuffled_ids = [d.id for d in shuffled]
    # order changed but the multiset of ids is preserved
    assert original_ids != shuffled_ids
    assert sorted(original_ids) == sorted(shuffled_ids)
def test_convert_dm_to_dam(tmpdir, mocker):
    """A dam extended from a DocumentArray owns its data: clearing the
    source does not affect the dam's contents."""
    dam = DocumentArrayMemmap(tmpdir)
    da = DocumentArray(random_docs(100))
    dam.extend(da)
    da.clear()

    iterated = mocker.Mock()
    for doc in dam:
        assert doc
        iterated()
    iterated.assert_called()

    assert len(da) == 0
    assert len(dam) == 100
def test_buffer_dam_delete(tmpdir):
    """Deleting an evicted doc from the buffer raises KeyError, while
    `delete_if_exists` is a silent no-op."""
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=5)
    docs = list(random_docs(6))
    dam.extend(docs)

    evicted = docs[0]  # pool holds 5, so the first doc was pushed out
    with pytest.raises(KeyError):
        del dam.buffer_pool[evicted.id]

    # must not raise for a missing key
    dam.buffer_pool.delete_if_exists(evicted.id)
def doc_lists_to_doc_arrays(
    doc_lists, tmpdir, first_memmap, second_memmap, buffer_pool_size
):
    """Build two document collections from `doc_lists`, each either an
    in-memory DocumentArray or a memmap-backed one, per the flags."""
    doc_list1, doc_list2 = doc_lists

    if first_memmap:
        d1 = DocumentArrayMemmap(tmpdir / '1', buffer_pool_size=buffer_pool_size)
    else:
        d1 = DocumentArray()
    d1.extend(doc_list1)

    if second_memmap:
        d2 = DocumentArrayMemmap(tmpdir / '2', buffer_pool_size=buffer_pool_size)
    else:
        d2 = DocumentArray()
    d2.extend(doc_list2)

    return d1, d2
def test_batch_iterator_dam(tmpdir):
    """Batches of size 2 over 4 docs visit every doc exactly once, in order."""
    dam = DocumentArrayMemmap(tmpdir)
    for i in range(4):
        dam.append(Document(id=i))

    expected_ids = iter(range(4))
    for batch in batch_iterator(dam, 2):
        for doc in batch:
            assert int(doc.id) == next(expected_ids)

    # every expected id must have been consumed
    with pytest.raises(StopIteration):
        next(expected_ids)
def test_buffer_dam_getitem(tmpdir):
    """Buffer-pool lookup works by string id only; slices and ints raise."""
    dam = DocumentArrayMemmap(tmpdir)
    docs = list(random_docs(10))
    dam.extend(docs)

    # lookup by id returns the same document
    for doc in docs:
        buffered = dam.buffer_pool[doc.id]
        assert buffered.content_hash == doc.content_hash
        assert buffered.id == doc.id

    with pytest.raises(TypeError):
        dam.buffer_pool[1:5]
    with pytest.raises(TypeError):
        dam.buffer_pool[0]
class KeyValueIndexer(Executor):
    """Key-value store mapping document ids to full documents, used to
    enrich matches with their stored content."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # persisted under the executor workspace
        self._docs = DocumentArrayMemmap(self.workspace + '/kv-idx')

    @requests(on='/index')
    def index(self, docs: DocumentArray, **kwargs):
        """Store the incoming documents.

        :param docs: documents to store
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def query(self, docs: DocumentArray, **kwargs):
        """Replace each match's payload with the stored parent document.

        :param docs: documents whose matches are enriched
        :param kwargs: other keyword arguments
        """
        for doc in docs:
            for match in doc.matches:
                stored = self._docs[match.parent_id]
                match.update(stored)
def memmap_for_split(tmpdir):
    """Fixture helper: a dam with five docs tagged c, c, b, a, a (in order)."""
    da = DocumentArrayMemmap(tmpdir)
    for category in ('c', 'c', 'b', 'a', 'a'):
        da.append(Document(tags={'category': category}))
    return da
def test_memmap_delete_by_slice(tmpdir):
    """Slice deletion removes exactly the tail/head docs and nothing else."""
    dam = DocumentArrayMemmap(tmpdir)
    originals = list(random_docs(100))
    for doc in originals:
        doc.id = f'id_{doc.id}'
    dam.extend(originals)
    assert len(dam) == 100

    del dam[-5:]
    assert len(dam) == 95
    del dam[:5]
    assert len(dam) == 90

    # none of the ten deleted docs survives in the dam
    removed = originals[:5] + originals[-5:]
    remaining_ids = {d.id for d in dam}
    for doc in removed:
        assert doc.id not in remaining_ids
class DocVectorIndexer(Executor):
    """Vector indexer storing docs in a memmapped array named by
    `index_file_name`, serving cosine top-k search."""

    def __init__(self, index_file_name: str, **kwargs):
        super().__init__(**kwargs)
        # index lives in a per-name file inside the executor workspace
        self._docs = DocumentArrayMemmap(self.workspace + f'/{index_file_name}')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Store the incoming documents in the index.

        :param docs: documents to add
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Attach the top-k cosine matches to each query document.

        :param docs: query documents
        :param parameters: must contain 'top_k'
        :param kwargs: other keyword arguments
        """
        top_k = int(parameters['top_k'])
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=top_k,
        )