Example #1
0
def test_dbms_keyvalue(tmpdir, test_metas):
    """Exercise add / update / delete against the key-value DBMS indexer."""

    def _unpack(doc_list):
        # Split each doc into (id, embedding, serialized doc w/o embedding).
        return zip(*((d.id, d.embedding,
                      _doc_without_embedding(d).SerializeToString())
                     for d in doc_list))

    docs = list(get_documents(chunks=False, nr=10, same_content=True))
    ids, vecs, meta = _unpack(docs)

    with KeyValueDBMSIndexer(index_filename='dbms',
                             metas=test_metas) as indexer:
        indexer.add(ids, vecs, meta)
        assert indexer.size == len(docs)
        save_path = indexer.save_abspath

    new_docs = list(get_documents(chunks=False, nr=10, same_content=False))
    ids, vecs, meta = _unpack(new_docs)

    # updating with new content must keep the same number of entries
    with BaseDBMSIndexer.load(save_path) as indexer:
        indexer.update(ids, vecs, meta)
        assert indexer.size == len(docs)

    # deleting every original id must empty the index
    with BaseDBMSIndexer.load(save_path) as indexer:
        indexer.delete([d.id for d in docs])
        assert indexer.size == 0
Example #2
0
def assert_dump_data(dump_path, docs, shards, pea_id):
    """Verify one shard's dumped ids/vectors/metas and that indexers reload them."""
    per_shard = len(docs) // shards
    leftover = len(docs) % shards
    start = pea_id * per_shard
    stop = start + per_shard
    # the last shard absorbs the remainder of the integer division
    if pea_id == shards - 1:
        stop += leftover
    docs_expected = docs[start:stop]
    print(f'### pea {pea_id} has {len(docs_expected)} docs')

    ids_dump, vectors_dump = import_vectors(
        dump_path,
        str(pea_id),
    )
    np.testing.assert_equal(list(ids_dump),
                            [d.id for d in docs_expected])
    np.testing.assert_allclose(list(vectors_dump),
                               [d.embedding for d in docs_expected])

    _, metas_dump = import_metas(
        dump_path,
        str(pea_id),
    )
    np.testing.assert_equal(
        list(metas_dump),
        [_doc_without_embedding(d).SerializeToString() for d in docs_expected],
    )

    # assert with Indexers
    # TODO currently metas are only passed to the parent Compound, not to the inner components
    with TimeContext(f'### reloading {len(docs_expected)}'):
        # noinspection PyTypeChecker
        cp: CompoundQueryExecutor = BaseQueryIndexer.load_config(
            'indexer_query.yml',
            pea_id=pea_id,
            metas={
                'workspace': os.path.join(dump_path, 'new_ws'),
                'dump_path': dump_path,
            },
        )
    for component in cp.components:
        assert component.size == len(docs_expected)

    # test with the inner indexers separate from the Compound
    for idx, cfg in enumerate(['basic/query_np.yml', 'basic/query_kv.yml']):
        workspace = os.path.realpath(os.path.join(dump_path, f'new_ws-{idx}'))
        inner = BaseQueryIndexer.load_config(
            cfg,
            pea_id=pea_id,
            metas={
                'workspace': workspace,
                'dump_path': dump_path,
            },
        )
        assert inner.size == len(docs_expected)
Example #3
0
 def _validate_results_nonempty(resp):
     """Assert every returned doc has matches with intact embeddings, metas and tags.

     Relies on ``nr_search``, ``nr_docs`` and ``emb_size`` from the enclosing scope.
     """
     assert len(resp.docs) == nr_search
     for d in resp.docs:
         if nr_docs < 10:
             assert len(d.matches) == nr_docs
         else:
             # TODO does it return all of them no matter how many?
             assert len(d.matches) > 0
         for m in d.matches:
             assert m.embedding.shape[0] == emb_size
             assert _doc_without_embedding(m).SerializeToString() is not None
             assert 'hello world' in m.text
             # was an f-string with no placeholders (lint F541); same value
             assert 'tag data' in m.tags['tag_field']