# Shared imports for the test excerpts below; helper executors and fixtures
# (DummyMarkExecutor, UpdateExecutor, docs, docs_to_index, validate_callback,
# ...) are provided by the surrounding test suite.
import collections
import os

import numpy as np
from scipy import sparse

from jina import Document, Flow

cur_dir = os.path.dirname(os.path.abspath(__file__))


def test_normal(docs):
    NUM_REPLICAS = 3
    NUM_SHARDS = 2
    doc_id_path = collections.OrderedDict()

    def handle_search_result(resp):
        for doc in resp.data.docs:
            doc_id_path[int(doc.id)] = (doc.tags['replica'], doc.tags['shard'])

    flow = Flow().add(
        name='pod1',
        uses=DummyMarkExecutor,
        replicas=NUM_REPLICAS,
        parallel=NUM_SHARDS,
    )
    with flow:
        flow.search(inputs=docs, request_size=1, on_done=handle_search_result)

    assert len(doc_id_path.keys()) == len(docs)

    num_used_replicas = len(set(map(lambda x: x[0], doc_id_path.values())))
    assert num_used_replicas == NUM_REPLICAS

    shards = collections.defaultdict(list)
    for replica, shard in doc_id_path.values():
        shards[replica].append(shard)

    assert len(shards.keys()) == NUM_REPLICAS
    for shard_list in shards.values():
        assert len(set(shard_list)) == NUM_SHARDS


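# `DummyMarkExecutor` is referenced but not defined in these excerpts. Below is
# a minimal sketch of what the assertions imply it does: stamp every document
# with the identity of the replica and shard that processed it. The
# `runtime_args.replica_id` / `runtime_args.shard_id` attribute names are an
# assumption about the runtime, not confirmed API.
from jina import Executor, requests


class DummyMarkExecutorSketch(Executor):
    @requests
    def mark(self, docs, **kwargs):
        for doc in docs:
            doc.tags['replica'] = self.runtime_args.replica_id
            doc.tags['shard'] = self.runtime_args.shard_id

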
def test_sparse_pipeline(mocker, docs_to_index):
    def validate(response):
        # the search request carries a single query doc
        assert len(response.data.docs) == 1
        for doc in response.data.docs:
            for i, match in enumerate(doc.matches):
                assert match.id == docs_to_index[i].id
                assert isinstance(match.embedding, sparse.coo_matrix)

    f = Flow().add(uses=DummyCSRSparseIndexEncoder)

    index_mock = mocker.Mock()
    mock = mocker.Mock()
    error_mock = mocker.Mock()
    with f:
        f.index(
            inputs=docs_to_index,
            on_done=index_mock,
        )
        f.search(
            inputs=docs_to_index[0],
            parameters={'doc': docs_to_index[0], 'top_k': 1},
            on_done=mock,
            on_error=error_mock,
        )

    index_mock.assert_called()
    # `mock` is attached only to the search request, so it fires exactly once
    mock.assert_called_once()
    validate_callback(mock, validate)
    error_mock.assert_not_called()


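# `DummyCSRSparseIndexEncoder` is not defined in this excerpt. A minimal sketch
# of what the test assumes it does (names and in-memory storage are
# illustrative, not the real implementation): give every indexed doc a scipy
# sparse embedding, keep the docs, and return them as matches on search.
from scipy import sparse as _sparse
from jina import DocumentArray, Executor, requests


class DummyCSRSparseIndexEncoderSketch(Executor):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._docs = DocumentArray()

    @requests(on='/index')
    def encode_and_index(self, docs, **kwargs):
        for doc in docs:
            # any fixed sparse matrix satisfies the isinstance check above
            doc.embedding = _sparse.coo_matrix([[0, 0, 1], [1, 0, 0]])
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs, parameters, **kwargs):
        top_k = int(parameters.get('top_k', len(self._docs)))
        for doc in docs:
            doc.matches.extend(self._docs[:top_k])

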
def test_normal(docs):
    NUM_REPLICAS = 3
    NUM_SHARDS = 2
    doc_id_path = collections.OrderedDict()

    def handle_search_result(resp):
        for doc in resp.data.docs:
            if int(doc.id) not in doc_id_path:
                doc_id_path[int(doc.id)] = []
            doc_id_path[int(doc.id)].append((doc.tags['replica'], doc.tags['shard']))

    flow = Flow().add(
        name='executor1',
        uses=DummyMarkExecutor,
        replicas=NUM_REPLICAS,
        shards=NUM_SHARDS,
    )
    with flow:
        flow.search(inputs=docs, request_size=1, on_done=handle_search_result)

    assert len(doc_id_path.keys()) == len(docs)

    replica_shards = [
        tag_item for tag_items in doc_id_path.values() for tag_item in tag_items
    ]
    replicas = [r for r, s in replica_shards]
    shards = [s for r, s in replica_shards]

    assert len(set(replicas)) == NUM_REPLICAS
    assert len(set(shards)) == NUM_SHARDS


def test_scale_after_rolling_update(
    docs, replicas, scale_to, expected_before_scale, expected_after_scale
):
    flow = Flow().add(
        name='executor1',
        uses=DummyMarkExecutor,
        replicas=replicas,
    )
    with flow:
        ret1 = flow.search(docs, return_results=True, request_size=1)
        flow.rolling_update('executor1', None)
        flow.scale('executor1', replicas=scale_to)
        ret2 = flow.search(docs, return_results=True, request_size=1)

    replica_ids = set()
    for r in ret1:
        for replica_id in r.docs.get_attributes('tags__replica'):
            replica_ids.add(replica_id)
    assert replica_ids == expected_before_scale

    replica_ids = set()
    for r in ret2:
        for replica_id in r.docs.get_attributes('tags__replica'):
            replica_ids.add(replica_id)
    assert replica_ids == expected_after_scale


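# `replicas`, `scale_to`, and the expected replica-id sets are injected as test
# arguments. A plausible parametrization (an assumption for illustration; the
# exact id representation depends on how tag values round-trip through the Flow):
import pytest

scale_cases = pytest.mark.parametrize(
    'replicas, scale_to, expected_before_scale, expected_after_scale',
    [
        (2, 3, {0, 1}, {0, 1, 2}),  # scale up: a new replica id appears
        (3, 2, {0, 1, 2}, {0, 1}),  # scale down: one replica id disappears
    ],
)  # would be applied as @scale_cases above test_scale_after_rolling_update

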
def test_override_uses_with(docs):
    flow = Flow().add(
        name='executor1',
        uses=UpdateExecutor,
        replicas=2,
        parallel=3,
    )
    with flow:
        # test rolling update does not hang
        ret1 = flow.search(docs, return_results=True)
        flow.rolling_update(
            'executor1',
            dump_path='/tmp/dump_path2/',
            uses_with={'argument1': 'version2', 'argument2': 'version2'},
        )
        ret2 = flow.search(docs, return_results=True)

    assert len(ret1) > 0
    assert len(ret1[0].docs) > 0
    for doc in ret1[0].docs:
        assert doc.tags['dump_path'] == '/tmp/dump_path1/'
        assert doc.tags['arg1'] == 'version1'
        assert doc.tags['arg2'] == 'version1'

    assert len(ret2) > 0
    assert len(ret2[0].docs) > 0
    for doc in ret2[0].docs:
        assert doc.tags['dump_path'] == '/tmp/dump_path2/'
        assert doc.tags['arg1'] == 'version2'
        assert doc.tags['arg2'] == 'version2'


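# `UpdateExecutor` is not shown here. A sketch of what the assertions above
# imply it does (the constructor defaults are inferred from the `version1` /
# `/tmp/dump_path1/` expectations; treat them as assumptions): record its
# current `dump_path` and arguments into every doc's tags, so a search reveals
# which configuration each replica runs after `rolling_update`.
from jina import Executor, requests


class UpdateExecutorSketch(Executor):
    def __init__(
        self,
        dump_path='/tmp/dump_path1/',
        argument1='version1',
        argument2='version1',
        **kwargs,
    ):
        super().__init__(**kwargs)
        self._dump_path = dump_path
        self._arg1 = argument1
        self._arg2 = argument2

    @requests
    def mark(self, docs, **kwargs):
        for doc in docs:
            doc.tags['dump_path'] = self._dump_path
            doc.tags['arg1'] = self._arg1
            doc.tags['arg2'] = self._arg2

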
def test_simple_run(docs):
    flow = Flow().add(
        name='pod1',
        replicas=2,
        parallel=3,
    )
    with flow:
        # test rolling update does not hang
        flow.search(docs)
        flow.rolling_update('pod1', None)
        flow.search(docs)


def test_simple_run(docs):
    flow = Flow().add(
        name='executor1',
        replicas=2,
        shards=3,
    )
    with flow:
        # test rolling update does not hang
        flow.search(docs)
        flow.rolling_update('executor1', None)
        flow.search(docs)


def test_override_config_params_parallel():
    flow = Flow(return_results=True).add(
        uses=os.path.join(cur_dir, 'default_config.yml'),
        uses_with={'param1': 50, 'param2': 30},
        uses_metas={'workspace': 'different_workspace'},
        parallel=2,
    )
    with flow:
        resps = flow.search(inputs=[Document()], return_results=True)
    doc = resps[0].docs[0]
    assert doc.tags['param1'] == 50
    assert doc.tags['param2'] == 30
    assert doc.tags['param3'] == 10  # not overridden
    assert doc.tags['name'] == 'name'  # not overridden
    assert doc.tags['workspace'] == 'different_workspace'


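# `default_config.yml` is not included in this excerpt. Based on the
# assertions above, it is assumed to look roughly like the sketch below
# (executor name and layout are illustrative, not the real file):
#
#   jtype: TagSettingExecutor      # hypothetical executor name
#   with:
#     param1: 10
#     param2: 10
#     param3: 10
#   metas:
#     name: name
#     workspace: default_workspace
#
# `uses_with` / `uses_metas` then override param1, param2, and workspace,
# while param3 and name keep their YAML defaults.

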
def test_override_config_params_shards(docker_image):
    flow = Flow(return_results=True).add(
        uses='docker://override-config-test',
        uses_with={'param1': 50, 'param2': 30},
        uses_metas={'workspace': 'different_workspace'},
        shards=2,
    )
    with flow:
        resps = flow.search(inputs=[Document()], return_results=True)
    doc = resps[0].docs[0]
    assert doc.tags['param1'] == 50
    assert doc.tags['param2'] == 30
    assert doc.tags['param3'] == 10  # not overridden
    assert doc.tags['name'] == 'name'  # not overridden
    assert doc.tags['workspace'] == 'different_workspace'


def test_custom_dockerfile():
    f = Flow().add(
        uses='DummyRedisIndexer',
        py_modules=[os.path.join(cur_dir, 'redis_executor.py')],
        upload_files=[
            os.path.join(cur_dir, '../../daemon/unit/models/good_ws_custom_dockerfile'),
        ],
        host='localhost:8000',
    )
    with f:
        f.index(
            inputs=(
                Document(text=f'{i}', embedding=np.random.rand(2, 3))
                for i in range(5)
            ),
        )
        resp = f.search(inputs=[Document(text='3')], return_results=True)
        assert resp[0].docs[0].matches[0].text == '3'
        assert resp[0].docs[0].matches[0].embedding.shape == (2, 3)


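# `DummyRedisIndexer` lives in the uploaded `redis_executor.py`, which is not
# shown here. A hedged sketch of what the test assumes it does: persist each
# doc's embedding to a local Redis keyed by text, and return an exact-text
# match on search. The Redis connection details are assumptions.
import pickle

import numpy as np
import redis
from jina import Document, Executor, requests


class DummyRedisIndexerSketch(Executor):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # assumes a Redis server reachable on the default local port
        self._db = redis.Redis(host='localhost', port=6379)

    @requests(on='/index')
    def index(self, docs, **kwargs):
        for doc in docs:
            # key by text; store the dense embedding as pickled bytes
            self._db.set(doc.text, pickle.dumps(doc.embedding))

    @requests(on='/search')
    def search(self, docs, **kwargs):
        for doc in docs:
            hit = self._db.get(doc.text)
            if hit is not None:
                doc.matches.append(
                    Document(text=doc.text, embedding=pickle.loads(hit))
                )

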
def _benchmark_qps() -> Dict[str, float]:
    """Benchmark Jina Core indexing and query.

    Returns:
        A dict mapping metric names (``index_time``, ``query_time``,
        ``index_qps``, ``query_qps``) to their measured values.
    """
    args = set_hw_parser().parse_args()
    args.workdir = os.path.join(os.getcwd(), 'original')
    args.num_query = 4096

    targets = {
        'index-labels': {
            'url': args.index_labels_url,
            'filename': os.path.join(args.workdir, 'index-labels'),
        },
        'query-labels': {
            'url': args.query_labels_url,
            'filename': os.path.join(args.workdir, 'query-labels'),
        },
        'index': {
            'url': args.index_data_url,
            'filename': os.path.join(args.workdir, 'index-original'),
        },
        'query': {
            'url': args.query_data_url,
            'filename': os.path.join(args.workdir, 'query-original'),
        },
    }

    # download the data
    Path(args.workdir).mkdir(parents=True, exist_ok=True)
    download_data(targets, args.download_proxy)

    try:
        f = Flow().add(uses=MyEncoder).add(workspace='./', uses=MyIndexer)
        with f:
            # do index
            log.info('Benchmarking index')
            st = time.perf_counter()
            f.index(
                index_generator(num_docs=targets['index']['data'].shape[0], target=targets),
                show_progress=True,
            )
            index_time = time.perf_counter() - st
            log.info(
                'Indexed %d docs within %.2f seconds',
                targets['index']['data'].shape[0],
                index_time,
            )

            # do query
            log.info('Benchmarking query')
            st = time.perf_counter()
            f.search(
                query_generator(num_docs=args.num_query, target=targets),
                shuffle=True,
                parameters={'top_k': args.top_k},
                show_progress=True,
            )
            query_time = time.perf_counter() - st
            log.info('Ran %d queries within %.2f seconds', args.num_query, query_time)
    except Exception as e:
        log.error(e)
        sys.exit(1)

    return {
        'index_time': index_time,
        'query_time': query_time,
        'index_qps': targets['index']['data'].shape[0] / index_time,
        'query_qps': args.num_query / query_time,
    }
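

# Hedged usage sketch: how the benchmark result might be consumed if this
# module were run directly (the __main__ guard is an addition, not original code).
if __name__ == '__main__':
    metrics = _benchmark_qps()
    for name, value in metrics.items():
        print(f'{name}: {value:.2f}')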