Example 1
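All eleven examples on this page are pytest tests from the same Jina test module, so they rely on module-level context the snippets omit. The sketch below reconstructs that context so the tests can run; the import paths follow the Jina 1.x layout, and the fixture and helper bodies are assumptions inferred from how the tests use them, not the original code.

import os
from pathlib import Path

import pytest

from jina import Document
from jina.clients import Client
from jina.executors import BaseExecutor
from jina.executors.indexers.keyvalue import BinaryPbIndexer
from jina.executors.indexers.vector import NumpyIndexer
from jina.flow import Flow

cur_dir = os.path.dirname(os.path.abspath(__file__))


@pytest.fixture(params=[False, True])
def restful(request):
    # run each test once over gRPC and once over REST
    return request.param


@pytest.fixture
def random_workspace(tmpdir):
    # assumption: the indexer YAMLs read their workspace from an env var
    os.environ['JINA_TEST_WORKSPACE'] = str(tmpdir)
    yield Path(str(tmpdir))
    del os.environ['JINA_TEST_WORKSPACE']


def get_duplicate_docs(num_docs=10, same_content=True):
    # pairs of consecutive docs share an id (and, with same_content=False,
    # pairs share a content string), so half of the docs are duplicates;
    # returns the docs plus the number of unique ones
    docs = []
    uniq = set()
    for idx in range(num_docs):
        group = idx // 2
        doc = Document()
        if same_content:
            doc.text = 'hello world'
            doc.id = str(group)
        else:
            doc.text = f'hello world {group}'
        docs.append(doc)
        uniq.add(doc.id if same_content else doc.text)
    return docs, len(uniq)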
def test_incremental_indexing_parallel_indexers(random_workspace, restful):
    total_docs = 1000
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    f = (
        Flow(restful=restful)
        .add(uses=os.path.join(cur_dir, 'uniq_vectorindexer.yml'), name='inc_vec')
        .add(uses=os.path.join(cur_dir, 'uniq_docindexer.yml'),
             name='inc_doc',
             needs=['gateway'])
        .add(needs=['inc_vec', 'inc_doc'])
    )
    with f:
        f.index(duplicate_docs[:500])

    # restart the Flow; the second pass must only add the docs not yet indexed
    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(
        random_workspace / 'inc_vecindexer' / 'vec_idx.bin'
    ) as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer._size == num_uniq_docs

    with BaseExecutor.load(
        random_workspace / 'inc_docindexer' / 'doc_idx.bin'
    ) as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer._size == num_uniq_docs
Example 2
def test_incremental_indexing_sequential_indexers_content_hash(
        random_workspace, restful):
    total_docs = 20
    duplicate_docs, _ = get_duplicate_docs(num_docs=total_docs,
                                           same_content=False)
    # contents repeat in pairs, so only half of the docs are unique by content
    num_uniq_docs = 10

    f = (
        Flow(restful=restful)
        .add(uses=os.path.join(cur_dir, 'uniq_vectorindexer_content_hash.yml'))
        .add(uses=os.path.join(cur_dir, 'uniq_docindexer_content_hash.yml'))
    )

    # sanity-check that the inputs are valid before starting the Flow
    Client.check_input(duplicate_docs[:10])
    Client.check_input(duplicate_docs)

    with f:
        f.index(duplicate_docs[:10])

    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'inc_vecindexer' /
                           'vec_idx.bin') as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer._size == num_uniq_docs

    with BaseExecutor.load(random_workspace / 'inc_docindexer' /
                           'doc_idx.bin') as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer._size == num_uniq_docs
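The `_content_hash` YAML variants used above key the de-duplication on a hash of `doc.content` instead of on `doc.id`, which is why the ten content-duplicates collapse even though every doc carries a distinct id. A toy, Jina-free rendering of that filtering rule:

def dedup_by_content(docs):
    # keep only the first doc seen for each content value -- the same
    # rule the content-hash cache applies in front of the indexers
    seen, unique = set(), []
    for doc in docs:
        key = hash(doc.content)
        if key not in seen:
            seen.add(key)
            unique.append(doc)
    return unique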
Example 3
def test_incremental_indexing_sequential_indexers_with_shards(random_workspace):
    total_docs = 1000
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    num_shards = 4
    f = (Flow()
         .add(uses=os.path.join(cur_dir, 'vectorindexer.yml'),
              uses_before='_unique',
              shards=num_shards,
              separated_workspace=True)
         .add(uses=os.path.join(cur_dir, 'docindexer.yml'),
              uses_before='_unique',
              shards=num_shards,
              separated_workspace=True))

    with f:
        f.index(duplicate_docs[:500])
        f.index(duplicate_docs)

    vect_idx_size = 0
    for shard_idx in range(num_shards):
        save_abspath = random_workspace / f'vec_idx-{shard_idx + 1}' / 'vec_idx.bin'
        with BaseExecutor.load(save_abspath) as vector_indexer:
            assert isinstance(vector_indexer, NumpyIndexer)
            vect_idx_size += vector_indexer._size
    assert vect_idx_size == num_uniq_docs

    doc_idx_size = 0
    for shard_idx in range(num_shards):
        save_abspath = random_workspace / f'doc_idx-{shard_idx + 1}' / 'doc_idx.bin'
        with BaseExecutor.load(save_abspath) as doc_indexer:
            assert isinstance(doc_indexer, BinaryPbIndexer)
            doc_idx_size += doc_indexer._size
    assert doc_idx_size == num_uniq_docs
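Two details worth noting here: with `separated_workspace=True` every shard persists under its own `{name}-{shard_id}` directory, and the shard ids are 1-based (`vec_idx-1` through `vec_idx-4`), whereas Example 10 runs shards without that flag and reads 0-based paths (`vec_idx-0` through `vec_idx-3`). Because each shard only sees a slice of the document stream, the per-shard `_size` values are arbitrary; only their sum is comparable to `num_uniq_docs`, which is exactly what the two loops assert.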
Example 4
def test_incremental_indexing_sequential_indexers(random_workspace, restful):
    total_docs = 20
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    f = (
        Flow(restful=restful)
        .add(uses=os.path.join(cur_dir, 'uniq_vectorindexer.yml'))
        .add(uses=os.path.join(cur_dir, 'uniq_docindexer.yml'))
    )

    # sanity-check that the inputs are valid before starting the Flow
    Client.check_input(duplicate_docs[:10])
    Client.check_input(duplicate_docs)

    with f:
        f.index(duplicate_docs[:10])

    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(
        random_workspace / 'inc_vecindexer' / 'vec_idx-0' / 'vec_idx.bin'
    ) as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer._size == num_uniq_docs

    with BaseExecutor.load(
        random_workspace / 'inc_docindexer' / 'doc_idx-0' / 'doc_idx.bin'
    ) as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer._size == num_uniq_docs
Example 5
def test_unique_indexing_docindexers_before(random_workspace):
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    f = (Flow().add(uses=os.path.join(cur_dir, 'docindexer.yml'),
                    uses_before='_unique'))

    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'doc_idx.bin') as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer.size == num_uniq_docs
Example 6
def test_unique_indexing_docindexers(random_workspace, restful, separated_workspace):
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    f = (Flow(restful=restful)
         .add(uses=os.path.join(cur_dir, 'uniq_docindexer.yml'),
              shards=1,
              separated_workspace=separated_workspace))

    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'doc_idx.bin') as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer.size == num_uniq_docs
Example 7
def test_unique_indexing_vecindexers(random_workspace, restful):
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    f = (Flow(restful=restful)
         .add(uses=os.path.join(cur_dir, 'uniq_vectorindexer.yml'), name='vec_idx'))

    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'vec_idx.bin') as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer.size == num_uniq_docs
Example 8
def test_unique_indexing_docindexers_before(random_workspace, restful):
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    # can't use the plain _unique here because its workspace would conflict with other tests
    f = (Flow(restful=restful)
         .add(uses=os.path.join(cur_dir, 'docindexer.yml'),
              uses_before=os.path.join(cur_dir, '_unique_doc.yml')))

    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'doc_idx.bin') as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer.size == num_uniq_docs
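For context on the comment above: the built-in `_unique` executor is a document-id cache placed in front of an indexer via `uses_before`; it tags ids it has already seen and filters those docs out of the request. Because every use of the stock YAML would share one cache name, and hence one on-disk workspace, these tests point `uses_before` at local copies (`_unique_doc.yml`, `_unique_vec.yml`) with distinct names. A toy stand-in showing the behavior, not Jina's driver-based implementation:

class IdDedupFilter:
    # assumption: like Jina's `_unique` cache, seen ids persist in the
    # executor's workspace, so the filter survives Flow restarts
    def __init__(self):
        self.seen_ids = set()

    def filter(self, docs):
        fresh = [d for d in docs if d.id not in self.seen_ids]
        self.seen_ids.update(d.id for d in fresh)
        return fresh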
Example 9
def test_unique_indexing_vecindexers_before(random_workspace):
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    # can't use the plain _unique here because its workspace would conflict with other tests
    f = (Flow().add(uses=os.path.join(cur_dir, 'vectorindexer.yml'),
                    uses_before=os.path.join(cur_dir, '_unique_vec.yml')))

    with f:
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'vec_idx.bin') as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer.size == num_uniq_docs
Example 10
def test_incremental_indexing_parallel_indexers_with_shards(random_workspace, restful):
    total_docs = 1000
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    num_shards = 4

    # can't use the plain _unique in uses_before because its workspace would conflict with other tests
    f = (
        Flow(restful=restful)
        .add(
            uses=os.path.join(cur_dir, 'vectorindexer.yml'),
            uses_before=os.path.join(cur_dir, '_unique_vec.yml'),
            shards=num_shards,
            name='inc_vec',
        )
        .add(
            uses=os.path.join(cur_dir, 'docindexer.yml'),
            uses_before=os.path.join(cur_dir, '_unique_doc.yml'),
            shards=num_shards,
            name='inc_doc',
            needs=['gateway'],
        )
        .add(needs=['inc_vec', 'inc_doc'])
    )

    with f:
        f.index(duplicate_docs[:500])

    with f:
        f.index(duplicate_docs)

    vect_idx_size = 0
    for shard_idx in range(num_shards):
        save_abspath = random_workspace / f'vec_idx-{shard_idx}' / 'vec_idx.bin'
        with BaseExecutor.load(save_abspath) as vector_indexer:
            assert isinstance(vector_indexer, NumpyIndexer)
            vect_idx_size += vector_indexer._size
    assert vect_idx_size == num_uniq_docs

    doc_idx_size = 0
    for shard_idx in range(num_shards):
        save_abspath = random_workspace / f'doc_idx-{shard_idx}' / 'doc_idx.bin'
        with BaseExecutor.load(save_abspath) as doc_indexer:
            assert isinstance(doc_indexer, BinaryPbIndexer)
            doc_idx_size += doc_indexer._size
    assert doc_idx_size == num_uniq_docs
Example 11
def test_incremental_indexing_sequential_indexers(random_workspace):
    total_docs = 20
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    f = (Flow()
         .add(uses=os.path.join(cur_dir, 'uniq_vectorindexer.yml'))
         .add(uses=os.path.join(cur_dir, 'uniq_docindexer.yml')))

    with f:
        f.index(duplicate_docs[:10])
        f.index(duplicate_docs)

    with BaseExecutor.load(random_workspace / 'vec_idx.bin') as vector_indexer:
        assert isinstance(vector_indexer, NumpyIndexer)
        assert vector_indexer._size == num_uniq_docs

    with BaseExecutor.load(random_workspace / 'doc_idx.bin') as doc_indexer:
        assert isinstance(doc_indexer, BinaryPbIndexer)
        assert doc_indexer._size == num_uniq_docs