Example #1
0
def validate_index_size(num_indexed_docs):
    """Assert that every ``*.bin`` index found on disk holds ``num_indexed_docs`` entries.

    Index files are collected from two directories: the ``chunk_indexer``
    component workspace derived from ``$JINA_REST_DIR`` and ``$JINA_REST_DIR``
    itself. At least one index file must be found.

    :param num_indexed_docs: the ``size`` each loaded index is expected to report
    """
    from jina.executors.compound import CompoundExecutor

    rest_dir = os.environ['JINA_REST_DIR']
    component_dir = Path(
        CompoundExecutor.get_component_workspace_from_compound_workspace(
            rest_dir, 'chunk_indexer', 0))
    root_dir = Path(rest_dir)

    # component workspace files first, then the ones sitting directly in the root
    candidates = [*component_dir.glob('*.bin'), *root_dir.glob('*.bin')]
    assert len(candidates) > 0

    for bin_path in candidates:
        loaded = BaseIndexer.load(str(bin_path))
        assert loaded.size == num_indexed_docs
Example #2
0
def check_indexers_size(chunks, nr_docs, field, tmp_path, same_content, shards,
                        post_op):
    """Assert that the cache and the sharded indexers hold the expected number of entries.

    The cache is loaded from ``tmp_path / 'cache.bin'``. For each of
    ``KV_IDX_FILENAME`` and ``VEC_IDX_FILENAME`` the sizes of all shard index
    files that exist on disk are summed, and the total is compared against the
    expectation derived from the remaining arguments.

    :param chunks: number of chunks per document
    :param nr_docs: number of documents
    :param field: cache field under test; ``'content_hash'`` gets special
        handling together with ``same_content``
    :param tmp_path: workspace root containing the cache and indexer files
    :param same_content: whether all documents carry the same content
    :param shards: number of shards to inspect per indexer
    :param post_op: ``'delete'`` expects empty indexes, ``'index2'`` expects a
        doubled count, any other value expects the plain count
    """
    from jina.executors.compound import CompoundExecutor

    bin_suffix = '.bin'

    cache_indexer_path = tmp_path / 'cache.bin'
    with BaseIndexer.load(cache_indexer_path) as cache:
        assert isinstance(cache, DocIDCache)
        cache_full_size = cache.size
        print(f'cache size {cache.size}')

    for indexer_fname in [KV_IDX_FILENAME, VEC_IDX_FILENAME]:
        compound_name = 'inc_docindexer' if KV_IDX_FILENAME in indexer_fname else 'inc_vecindexer'
        # Drop only a literal '.bin' suffix. ``str.rstrip('.bin')`` would strip
        # any trailing run of the characters '.', 'b', 'i', 'n' and could eat
        # into the file stem.
        if indexer_fname.endswith(bin_suffix):
            workspace_name = indexer_fname[:-len(bin_suffix)]
        else:
            workspace_name = indexer_fname

        indexers_full_size = 0
        for i in range(shards):
            # a single shard uses id 0, multiple shards are numbered from 1
            pea_id = i + 1 if shards > 1 else 0
            workspace_folder = CompoundExecutor.get_component_workspace_from_compound_workspace(
                tmp_path, compound_name, pea_id)
            indexer_path = os.path.join(
                BaseIndexer.get_shard_workspace(
                    workspace_folder=workspace_folder,
                    workspace_name=workspace_name,
                    pea_id=pea_id), f'{indexer_fname}')

            # in the configuration of content-hash / same_content=True
            # there aren't enough docs to satisfy batch size, only 1 shard will have it
            if os.path.exists(indexer_path):
                with BaseIndexer.load(indexer_path) as indexer:
                    if indexer_fname == KV_IDX_FILENAME:
                        assert isinstance(indexer, BinaryPbIndexer)
                    else:
                        assert isinstance(indexer, NumpyIndexer)
                    indexers_full_size += indexer.size

        if post_op == 'delete':
            assert indexers_full_size == 0
            assert cache_full_size == 0
        else:
            if field == 'content_hash' and same_content:
                if chunks > 0:
                    # one content from Doc, one from chunk
                    assert indexers_full_size == 2
                    assert cache_full_size == 2
                else:
                    assert indexers_full_size == 1
                    assert cache_full_size == 1
            else:
                nr_expected = (nr_docs + chunks * nr_docs) * 2 if post_op == 'index2' \
                    else nr_docs + chunks * nr_docs
                assert indexers_full_size == nr_expected
                assert cache_full_size == nr_expected