def test_incremental_indexing_parallel_indexers(random_workspace, restful):
    """Index duplicates in two rounds through two indexers hanging off the gateway.

    The flow is opened twice: once for the leading 500 documents and once for
    the complete set. Both indexers are then loaded back from
    ``random_workspace`` and each must report ``num_uniq_docs`` entries.
    """
    total_docs = 1000
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    vec_yaml = os.path.join(cur_dir, 'uniq_vectorindexer.yml')
    doc_yaml = os.path.join(cur_dir, 'uniq_docindexer.yml')

    flow = Flow(restful=restful)
    flow = flow.add(uses=vec_yaml, name='inc_vec')
    # 'inc_doc' lists only the gateway in `needs`, not 'inc_vec'
    flow = flow.add(uses=doc_yaml, name='inc_doc', needs=['gateway'])
    # the closing pod lists both indexers in `needs`
    flow = flow.add(needs=['inc_vec', 'inc_doc'])

    with flow:
        flow.index(duplicate_docs[:500])

    with flow:
        flow.index(duplicate_docs)

    # (dumped index file, expected indexer class) checked in the same order
    # as the pods were added
    expectations = (
        (random_workspace / 'inc_vecindexer' / 'vec_idx.bin', NumpyIndexer),
        (random_workspace / 'inc_docindexer' / 'doc_idx.bin', BinaryPbIndexer),
    )
    for bin_path, indexer_cls in expectations:
        with BaseExecutor.load(bin_path) as indexer:
            assert isinstance(indexer, indexer_cls)
            assert indexer._size == num_uniq_docs
def test_incremental_indexing_sequential_indexers_content_hash(
    random_workspace, restful
):
    """Index duplicates in two rounds through the two ``*_content_hash`` indexers.

    The documents are generated with ``same_content=False``; the input is
    validated with ``Client.check_input`` before indexing. Each indexer loaded
    back from ``random_workspace`` must report ``num_uniq_docs`` entries.
    """
    total_docs = 20
    duplicate_docs, _ = get_duplicate_docs(num_docs=total_docs, same_content=False)
    # Original note: "because the content is % 2" -- presumably only half of
    # the generated documents carry distinct content; TODO confirm against
    # get_duplicate_docs.
    num_uniq_docs = 10

    flow = Flow(restful=restful)
    for yaml_name in (
        'uniq_vectorindexer_content_hash.yml',
        'uniq_docindexer_content_hash.yml',
    ):
        flow = flow.add(uses=os.path.join(cur_dir, yaml_name))

    # validate both batches before any flow is started
    Client.check_input(duplicate_docs[:10])
    Client.check_input(duplicate_docs)

    with flow:
        flow.index(duplicate_docs[:10])

    with flow:
        flow.index(duplicate_docs)

    vec_bin = random_workspace / 'inc_vecindexer' / 'vec_idx.bin'
    with BaseExecutor.load(vec_bin) as vec_indexer:
        assert isinstance(vec_indexer, NumpyIndexer)
        assert vec_indexer._size == num_uniq_docs

    doc_bin = random_workspace / 'inc_docindexer' / 'doc_idx.bin'
    with BaseExecutor.load(doc_bin) as pb_indexer:
        assert isinstance(pb_indexer, BinaryPbIndexer)
        assert pb_indexer._size == num_uniq_docs
def test_incremental_indexing_sequential_indexers_with_shards(random_workspace):
    """Index duplicates in two rounds through two sharded indexers in sequence.

    Both pods use ``uses_before='_unique'``, ``num_shards`` shards and
    ``separated_workspace=True``. After indexing, the sizes of all shard
    indexers of one kind are summed and the sum must equal ``num_uniq_docs``.
    """
    total_docs = 1000
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    num_shards = 4
    # settings shared by both indexer pods
    pod_settings = dict(
        uses_before='_unique',
        shards=num_shards,
        separated_workspace=True,
    )
    flow = Flow()
    flow = flow.add(uses=os.path.join(cur_dir, 'vectorindexer.yml'), **pod_settings)
    flow = flow.add(uses=os.path.join(cur_dir, 'docindexer.yml'), **pod_settings)

    # both rounds run inside the same flow session
    with flow:
        flow.index(duplicate_docs[:500])
        flow.index(duplicate_docs)

    # shard directories are looked up with a 1-based suffix here
    vec_total = 0
    for shard_idx in range(num_shards):
        shard_bin = random_workspace / f'vec_idx-{shard_idx + 1}' / 'vec_idx.bin'
        with BaseExecutor.load(shard_bin) as shard_indexer:
            assert isinstance(shard_indexer, NumpyIndexer)
            vec_total += shard_indexer._size
    assert vec_total == num_uniq_docs

    doc_total = 0
    for shard_idx in range(num_shards):
        shard_bin = random_workspace / f'doc_idx-{shard_idx + 1}' / 'doc_idx.bin'
        with BaseExecutor.load(shard_bin) as shard_indexer:
            assert isinstance(shard_indexer, BinaryPbIndexer)
            doc_total += shard_indexer._size
    assert doc_total == num_uniq_docs
def test_incremental_indexing_sequential_indexers(random_workspace, restful):
    """Index duplicates in two rounds through a vector indexer then a doc indexer.

    The input is validated with ``Client.check_input`` first. Each indexer is
    then loaded from its ``*-0`` sub-directory under ``random_workspace`` and
    must report ``num_uniq_docs`` entries.

    NOTE(review): another function with this exact name is defined further
    down in this file. If both live in one module, the later definition
    replaces this one and this test is never collected -- confirm which
    version is intended.
    """
    total_docs = 20
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    vec_yaml = os.path.join(cur_dir, 'uniq_vectorindexer.yml')
    doc_yaml = os.path.join(cur_dir, 'uniq_docindexer.yml')
    flow = Flow(restful=restful).add(uses=vec_yaml).add(uses=doc_yaml)

    Client.check_input(duplicate_docs[:10])
    Client.check_input(duplicate_docs)

    with flow:
        flow.index(duplicate_docs[:10])

    with flow:
        flow.index(duplicate_docs)

    # diagnostic output: shows which workspace the index files are read from
    print(f' random_workspace {random_workspace}')

    vec_bin = random_workspace / 'inc_vecindexer' / 'vec_idx-0' / 'vec_idx.bin'
    with BaseExecutor.load(vec_bin) as vec_indexer:
        assert isinstance(vec_indexer, NumpyIndexer)
        assert vec_indexer._size == num_uniq_docs

    doc_bin = random_workspace / 'inc_docindexer' / 'doc_idx-0' / 'doc_idx.bin'
    with BaseExecutor.load(doc_bin) as pb_indexer:
        assert isinstance(pb_indexer, BinaryPbIndexer)
        assert pb_indexer._size == num_uniq_docs
def test_unique_indexing_docindexers_before(random_workspace):
    """Index duplicates through a doc indexer that has '_unique' as ``uses_before``.

    The indexer loaded from ``random_workspace / 'doc_idx.bin'`` must be a
    ``BinaryPbIndexer`` whose ``size`` equals ``num_uniq_docs``.

    NOTE(review): another function with this exact name is defined further
    down in this file. If both live in one module, the later definition
    replaces this one and this test is never collected -- confirm which
    version is intended.
    """
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    indexer_yaml = os.path.join(cur_dir, 'docindexer.yml')
    flow = Flow().add(uses=indexer_yaml, uses_before='_unique')

    with flow:
        flow.index(duplicate_docs)

    dumped_index = random_workspace / 'doc_idx.bin'
    with BaseExecutor.load(dumped_index) as pb_indexer:
        assert isinstance(pb_indexer, BinaryPbIndexer)
        assert pb_indexer.size == num_uniq_docs
def test_unique_indexing_docindexers(random_workspace, restful, separated_workspace):
    """Index duplicates through a single-shard 'uniq_docindexer.yml' pod.

    ``separated_workspace`` is forwarded unchanged to the pod. The indexer
    loaded from ``random_workspace / 'doc_idx.bin'`` must be a
    ``BinaryPbIndexer`` whose ``size`` equals ``num_uniq_docs``.
    """
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    flow = Flow(restful=restful)
    flow = flow.add(
        uses=os.path.join(cur_dir, 'uniq_docindexer.yml'),
        shards=1,
        separated_workspace=separated_workspace,
    )

    with flow:
        flow.index(duplicate_docs)

    dumped_index = random_workspace / 'doc_idx.bin'
    with BaseExecutor.load(dumped_index) as pb_indexer:
        assert isinstance(pb_indexer, BinaryPbIndexer)
        assert pb_indexer.size == num_uniq_docs
def test_unique_indexing_vecindexers(random_workspace, restful):
    """Index duplicates through a 'uniq_vectorindexer.yml' pod named 'vec_idx'.

    The indexer loaded from ``random_workspace / 'vec_idx.bin'`` must be a
    ``NumpyIndexer`` whose ``size`` equals ``num_uniq_docs``.
    """
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    indexer_yaml = os.path.join(cur_dir, 'uniq_vectorindexer.yml')
    flow = Flow(restful=restful)
    flow = flow.add(uses=indexer_yaml, name='vec_idx')

    with flow:
        flow.index(duplicate_docs)

    dumped_index = random_workspace / 'vec_idx.bin'
    with BaseExecutor.load(dumped_index) as vec_indexer:
        assert isinstance(vec_indexer, NumpyIndexer)
        assert vec_indexer.size == num_uniq_docs
def test_unique_indexing_docindexers_before(random_workspace, restful):
    """Index duplicates through a doc indexer with '_unique_doc.yml' as ``uses_before``.

    The indexer loaded from ``random_workspace / 'doc_idx.bin'`` must be a
    ``BinaryPbIndexer`` whose ``size`` equals ``num_uniq_docs``.

    NOTE(review): an earlier function in this file carries the same name; if
    both live in one module, this later definition is the one that wins.
    """
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    # A dedicated YAML is used instead of the plain `_unique` shortcut because
    # the latter's workspace would conflict with other tests.
    unique_yaml = os.path.join(cur_dir, '_unique_doc.yml')
    indexer_yaml = os.path.join(cur_dir, 'docindexer.yml')
    flow = Flow(restful=restful)
    flow = flow.add(uses=indexer_yaml, uses_before=unique_yaml)

    with flow:
        flow.index(duplicate_docs)

    dumped_index = random_workspace / 'doc_idx.bin'
    with BaseExecutor.load(dumped_index) as pb_indexer:
        assert isinstance(pb_indexer, BinaryPbIndexer)
        assert pb_indexer.size == num_uniq_docs
def test_unique_indexing_vecindexers_before(random_workspace):
    """Index duplicates through a vector indexer with '_unique_vec.yml' as ``uses_before``.

    The indexer loaded from ``random_workspace / 'vec_idx.bin'`` must be a
    ``NumpyIndexer`` whose ``size`` equals ``num_uniq_docs``.
    """
    total_docs = 10
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    # A dedicated YAML is used instead of the plain `_unique` shortcut because
    # the latter's workspace would conflict with other tests.
    unique_yaml = os.path.join(cur_dir, '_unique_vec.yml')
    indexer_yaml = os.path.join(cur_dir, 'vectorindexer.yml')
    flow = Flow().add(uses=indexer_yaml, uses_before=unique_yaml)

    with flow:
        flow.index(duplicate_docs)

    dumped_index = random_workspace / 'vec_idx.bin'
    with BaseExecutor.load(dumped_index) as vec_indexer:
        assert isinstance(vec_indexer, NumpyIndexer)
        assert vec_indexer.size == num_uniq_docs
def test_incremental_indexing_parallel_indexers_with_shards(random_workspace, restful):
    """Index duplicates in two rounds through two sharded indexers off the gateway.

    Each indexer pod runs ``num_shards`` shards and has its own ``uses_before``
    YAML. After both rounds, the sizes of all shard indexers of one kind are
    summed and the sum must equal ``num_uniq_docs``.
    """
    total_docs = 1000
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    num_shards = 4

    # Dedicated YAMLs are used in uses_before instead of the plain `_unique`
    # shortcut because the latter's workspace would conflict with other tests.
    flow = Flow(restful=restful)
    flow = flow.add(
        uses=os.path.join(cur_dir, 'vectorindexer.yml'),
        uses_before=os.path.join(cur_dir, '_unique_vec.yml'),
        shards=num_shards,
        name='inc_vec',
    )
    # 'inc_doc' lists only the gateway in `needs`, not 'inc_vec'
    flow = flow.add(
        uses=os.path.join(cur_dir, 'docindexer.yml'),
        uses_before=os.path.join(cur_dir, '_unique_doc.yml'),
        shards=num_shards,
        name='inc_doc',
        needs=['gateway'],
    )
    # the closing pod lists both indexers in `needs`
    flow = flow.add(needs=['inc_vec', 'inc_doc'])

    with flow:
        flow.index(duplicate_docs[:500])

    with flow:
        flow.index(duplicate_docs)

    # shard directories are looked up with a 0-based suffix here
    vec_total = 0
    for shard_idx in range(num_shards):
        shard_bin = random_workspace / f'vec_idx-{shard_idx}' / 'vec_idx.bin'
        with BaseExecutor.load(shard_bin) as shard_indexer:
            assert isinstance(shard_indexer, NumpyIndexer)
            vec_total += shard_indexer._size
    assert vec_total == num_uniq_docs

    doc_total = 0
    for shard_idx in range(num_shards):
        shard_bin = random_workspace / f'doc_idx-{shard_idx}' / 'doc_idx.bin'
        with BaseExecutor.load(shard_bin) as shard_indexer:
            assert isinstance(shard_indexer, BinaryPbIndexer)
            doc_total += shard_indexer._size
    assert doc_total == num_uniq_docs
def test_incremental_indexing_sequential_indexers(random_workspace):
    """Index duplicates in two rounds through a vector indexer then a doc indexer.

    Both rounds run inside one flow session. Each indexer is then loaded from
    the top level of ``random_workspace`` and must report ``num_uniq_docs``
    entries.

    NOTE(review): an earlier function in this file carries the same name; if
    both live in one module, this later definition is the one that wins.
    """
    total_docs = 20
    duplicate_docs, num_uniq_docs = get_duplicate_docs(num_docs=total_docs)

    flow = Flow()
    for yaml_name in ('uniq_vectorindexer.yml', 'uniq_docindexer.yml'):
        flow = flow.add(uses=os.path.join(cur_dir, yaml_name))

    with flow:
        flow.index(duplicate_docs[:10])
        flow.index(duplicate_docs)

    vec_bin = random_workspace / 'vec_idx.bin'
    with BaseExecutor.load(vec_bin) as vec_indexer:
        assert isinstance(vec_indexer, NumpyIndexer)
        assert vec_indexer._size == num_uniq_docs

    doc_bin = random_workspace / 'doc_idx.bin'
    with BaseExecutor.load(doc_bin) as pb_indexer:
        assert isinstance(pb_indexer, BinaryPbIndexer)
        assert pb_indexer._size == num_uniq_docs