def test_flow_with_jump(self):
    """Build a flow with two parallel branches merged at the end, then
    round-trip it through YAML and dry-run both the original and the copy."""
    flow = Flow()
    # branch A: r1 -> r3 -> r5 -> r9 ; branch B: r2 -> r4 -> r6 -> r8
    flow = flow.add(name='r1', yaml_path='_forward')
    flow = flow.add(name='r2', yaml_path='_forward')
    flow = flow.add(name='r3', yaml_path='_forward', needs='r1')
    flow = flow.add(name='r4', yaml_path='_forward', needs='r2')
    flow = flow.add(name='r5', yaml_path='_forward', needs='r3')
    flow = flow.add(name='r6', yaml_path='_forward', needs='r4')
    flow = flow.add(name='r8', yaml_path='_forward', needs='r6')
    flow = flow.add(name='r9', yaml_path='_forward', needs='r5')
    # both branches join at r10
    flow = flow.add(name='r10', yaml_path='_merge', needs=['r9', 'r8'])

    with flow:
        flow.dry_run()

    flow.save_config('tmp.yml')
    # loading alone must not raise
    Flow.load_config('tmp.yml')
    with Flow.load_config('tmp.yml') as reloaded:
        reloaded.dry_run()
    self.add_tmpfile('tmp.yml')
def test_indexer_with_ref_indexer_compound_move(random_workspace_move, parallel, index_docs, mocker, uses_no_docker):
    """Index into a compound indexer, move the workspace to the query
    location, and verify a search still finds top_k matches."""
    top_k = 10

    with Flow.load_config(os.path.join(cur_dir, 'compound-index.yml')) as index_flow:
        index_flow.index(input_fn=index_docs, request_size=10)

    on_done_mock = mocker.Mock()

    # relocate the freshly written workspace to the query-side env path
    src = os.environ['JINA_TEST_INDEXER_WITH_REF_INDEXER']
    dst = os.environ['JINA_TEST_INDEXER_WITH_REF_INDEXER_QUERY']
    shutil.copytree(src, dst)
    shutil.rmtree(src)

    def validate_response(resp):
        assert len(resp.search.docs) == 1
        assert len(resp.search.docs[0].matches) == top_k

    query_document = Document()
    query_document.embedding = np.array([1, 1])

    with Flow.load_config(os.path.join(cur_dir, 'compound-query.yml')) as query_flow:
        query_flow.search(input_fn=[query_document], on_done=on_done_mock, top_k=top_k)

    on_done_mock.assert_called_once()
    validate_callback(on_done_mock, validate_response)
def main(task, num_docs):
    """CLI entry point: dispatch to the index, query, or dryrun flow.

    :param task: one of 'index', 'query', 'dryrun'
    :param num_docs: cap on the number of files to index
    """
    config()
    if task == 'index':
        workspace = os.environ['JINA_WORKSPACE']
        if os.path.exists(workspace):
            # refuse to index on top of an existing workspace to avoid
            # mixing old and new index data
            print(
                f'\n +---------------------------------------------------------------------------------+ \ \n | ������ | \ \n | The directory {workspace} already exists. Please remove it before indexing again. | \ \n | ������ | \ \n +---------------------------------------------------------------------------------+'
            )
            sys.exit(1)
        f = Flow.load_config('flows/index.yml')
        with f:
            # measure indexing throughput over the whole run
            with TimeContext(f'QPS: indexing {num_docs}', logger=f.logger):
                f.index_files('data/*.wav', batch_size=2, size=num_docs)
    elif task == 'query':
        f = Flow.load_config('flows/query.yml')
        with f:
            # no perf measurement here, as it opens the REST API and blocks
            f.block()
    elif task == 'dryrun':
        # just bring the query flow up and tear it down again
        f = Flow.load_config('flows/query.yml')
        with f:
            pass
    else:
        raise NotImplementedError(
            f'unknown task: {task}. A valid task is either `index` or `query` or `dryrun`.'
        )
def test_delete_kv(config, mocker, has_content):
    """KV index: index 10 docs, delete 3, and check search hit counts
    before and after the deletion."""
    flow_file = 'flow_kv.yml'

    def _make_validator(expected):
        # the validator both records the call on the mock and checks doc count
        def _validate(resp):
            cb_mock()
            assert len(resp.docs) == expected
        return _validate

    with Flow.load_config(flow_file) as index_flow:
        index_flow.index(input_fn=random_docs(0, 10))
    validate_index_size(10)

    cb_mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        # docs 2-4 are indexed (3 hits); ids 100-119 were never indexed
        search_flow.search(input_fn=chain(random_docs(2, 5), random_docs(100, 120)),
                           output_fn=_make_validator(3))
    cb_mock.assert_called_once()

    with Flow.load_config(flow_file) as index_flow:
        index_flow.delete(input_fn=random_docs(0, 3, has_content=has_content))
    validate_index_size(7)

    cb_mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        # of docs 2-3, only doc 3 survived the delete of 0-2
        search_flow.search(input_fn=random_docs(2, 4),
                           output_fn=_make_validator(1))
    cb_mock.assert_called_once()
def main(task, return_image, data_path, num_docs, batch_size, overwrite_workspace):
    """CLI entry point: index binary files or serve the query flow.

    :param task: 'index' or 'query'
    :param return_image: selects the query-flow YAML variant
    :param overwrite_workspace: if True, wipe an existing workspace first
    """
    config()
    if task == 'index':
        workspace = os.environ['WORKDIR']
        if os.path.exists(workspace):
            # NOTE(review): unlike sibling examples this only warns and does
            # NOT sys.exit(); indexing proceeds unless overwrite_workspace
            # triggers a clean below — confirm that is intended
            print(
                f'\n +---------------------------------------------------------------------------------+ \ \n | ������ | \ \n | The directory {workspace} already exists. Please remove it before indexing again. | \ \n | ������ | \ \n +---------------------------------------------------------------------------------+'
            )
            if overwrite_workspace:
                clean_workdir()
        f = Flow.load_config('flow-index.yml')
        with f:
            # read_mode='rb': files are ingested as raw bytes
            f.index_files(data_path, batch_size=batch_size, read_mode='rb',
                          size=num_docs)
    elif task == 'query':
        # pick the query flow variant based on whether images are returned
        f = Flow.load_config(f'flow-query-{return_image}.yml')
        with f:
            f.block()
def test_delete_vector(config, mocker, flow_file, has_content):
    """Vector index: index 10 docs, delete all of them, and verify the
    per-query match count drops from 9 to 0."""
    n_searches = 5

    def _make_validator(expected_matches):
        def _validate(resp):
            cb_mock()
            assert len(resp.docs) == n_searches
            for doc in resp.docs:
                assert len(doc.matches) == expected_matches
        return _validate

    with Flow.load_config(flow_file) as index_flow:
        index_flow.index(input_fn=random_docs(0, 10))
    validate_index_size(10)

    cb_mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        # with all 10 docs indexed, each query is expected to get 9 matches
        search_flow.search(input_fn=random_docs(0, n_searches),
                           output_fn=_make_validator(9))
    cb_mock.assert_called_once()

    with Flow.load_config(flow_file) as index_flow:
        index_flow.delete(input_fn=random_docs(0, 10, has_content=has_content))
    validate_index_size(0)

    cb_mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        search_flow.search(input_fn=random_docs(0, n_searches),
                           output_fn=_make_validator(0))
    cb_mock.assert_called_once()
def test_update_kv(config, mocker):
    """KV index: updating existing ids must keep the index size at 10
    and searches must keep resolving."""
    flow_file = 'flow_kv.yml'
    n_searches = 1
    original_docs = list(random_docs(0, 10))
    replacement_docs = list(random_docs(0, 10))

    def _validate(resp):
        cb_mock()
        assert len(resp.docs) == n_searches

    with Flow.load_config(flow_file) as index_flow:
        index_flow.index(input_fn=original_docs)
    validate_index_size(10)

    cb_mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        search_flow.search(input_fn=list(random_docs(0, n_searches)),
                           output_fn=_validate)
    cb_mock.assert_called_once()

    # same ids, fresh content — size must not change
    with Flow.load_config(flow_file) as index_flow:
        index_flow.update(input_fn=replacement_docs)
    validate_index_size(10)

    cb_mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        search_flow.search(input_fn=random_docs(0, n_searches),
                           output_fn=_validate)
    cb_mock.assert_called_once()
def test_delete_kv(config, mocker, as_string):
    """KV index (callback API): index 10 docs, delete 3 by explicit id,
    and verify the search hit counts before and after."""
    flow_file = 'flow_kv.yml'

    def _make_validator(expected):
        def _validate(resp):
            assert len(resp.docs) == expected
        return _validate

    with Flow.load_config(flow_file) as index_flow:
        index_flow.index(inputs=random_docs(0, 10))
    validate_index_size(10)

    cb_mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        # docs 2-4 are indexed (3 hits); ids 100-119 were never indexed
        search_flow.search(inputs=chain(random_docs(2, 5), random_docs(100, 120)),
                           on_done=cb_mock)
    cb_mock.assert_called_once()
    validate_callback(cb_mock, _make_validator(3))

    with Flow.load_config(flow_file) as index_flow:
        index_flow.delete(ids=get_ids_to_delete(0, 3, as_string))
    validate_index_size(7)

    cb_mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        # of docs 2-3, only doc 3 survived the delete of 0-2
        search_flow.search(inputs=random_docs(2, 4), on_done=cb_mock)
    cb_mock.assert_called_once()
    validate_callback(cb_mock, _make_validator(1))
def test_flow_identical(self):
    """A Flow loaded from YAML, built via the fluent API, and round-tripped
    through save_config/load_config must compare equal — and, once started,
    must wire up the expected ZMQ socket types on every pod."""
    with open(os.path.join(cur_dir, 'yaml/test-flow.yml')) as fp:
        a = Flow.load_config(fp)
    # the same topology built programmatically
    b = (Flow().add(name='chunk_seg', replicas=3).add(name='wqncode1', replicas=2).add(
        name='encode2', replicas=2, needs='chunk_seg').join(['wqncode1', 'encode2']))
    a.save_config('test2.yml')
    c = Flow.load_config('test2.yml')
    self.assertEqual(a, b)
    self.assertEqual(a, c)
    self.add_tmpfile('test2.yml')
    with a as f:
        # gateway connects out to the flow and receives results back
        self.assertEqual(f._pod_nodes['gateway'].head_args.socket_in, SocketType.PULL_CONNECT)
        self.assertEqual(f._pod_nodes['gateway'].tail_args.socket_out, SocketType.PUSH_CONNECT)
        # chunk_seg: replicas sit behind a router head; the tail publishes
        # because two pods (wqncode1, encode2) consume its output
        self.assertEqual(f._pod_nodes['chunk_seg'].head_args.socket_in, SocketType.PULL_BIND)
        self.assertEqual(f._pod_nodes['chunk_seg'].head_args.socket_out, SocketType.ROUTER_BIND)
        for arg in f._pod_nodes['chunk_seg'].peas_args['peas']:
            self.assertEqual(arg.socket_in, SocketType.DEALER_CONNECT)
            self.assertEqual(arg.socket_out, SocketType.PUSH_CONNECT)
        self.assertEqual(f._pod_nodes['chunk_seg'].tail_args.socket_in, SocketType.PULL_BIND)
        self.assertEqual(f._pod_nodes['chunk_seg'].tail_args.socket_out, SocketType.PUB_BIND)
        # wqncode1: subscribes to chunk_seg's PUB socket
        self.assertEqual(f._pod_nodes['wqncode1'].head_args.socket_in, SocketType.SUB_CONNECT)
        self.assertEqual(f._pod_nodes['wqncode1'].head_args.socket_out, SocketType.ROUTER_BIND)
        for arg in f._pod_nodes['wqncode1'].peas_args['peas']:
            self.assertEqual(arg.socket_in, SocketType.DEALER_CONNECT)
            self.assertEqual(arg.socket_out, SocketType.PUSH_CONNECT)
        self.assertEqual(f._pod_nodes['wqncode1'].tail_args.socket_in, SocketType.PULL_BIND)
        self.assertEqual(f._pod_nodes['wqncode1'].tail_args.socket_out, SocketType.PUSH_CONNECT)
        # encode2: also subscribes to chunk_seg's PUB socket
        self.assertEqual(f._pod_nodes['encode2'].head_args.socket_in, SocketType.SUB_CONNECT)
        self.assertEqual(f._pod_nodes['encode2'].head_args.socket_out, SocketType.ROUTER_BIND)
        for arg in f._pod_nodes['encode2'].peas_args['peas']:
            self.assertEqual(arg.socket_in, SocketType.DEALER_CONNECT)
            self.assertEqual(arg.socket_out, SocketType.PUSH_CONNECT)
        self.assertEqual(f._pod_nodes['encode2'].tail_args.socket_in, SocketType.PULL_BIND)
        self.assertEqual(f._pod_nodes['encode2'].tail_args.socket_out, SocketType.PUSH_CONNECT)
def test_dimensionality_search_wrong(tmp_path, mocker):
    """will fail because search docs have diff shape in embedding"""
    config_environ(path=tmp_path)
    flow_file = 'flow.yml'
    flow_query_file = 'flow.yml'
    # first batch uses the original shape, second batch a different one
    docs = list(random_docs_with_shapes(NR_DOCS_INDEX, START_SHAPE))
    docs_update = list(random_docs_with_shapes(NR_DOCS_INDEX, INDEX2_SHAPE, start=len(docs) + 1))
    all_docs_indexed = docs.copy()
    all_docs_indexed.extend(docs_update)
    # query docs use the second (mismatching) shape and unused ids
    docs_search = list(
        random_docs_with_shapes(
            NUMBER_OF_SEARCHES,
            INDEX2_SHAPE,
            start=len(docs) + len(docs_update) + 1)
    )
    f_index = Flow.load_config(flow_file)
    f_query = Flow.load_config(flow_query_file)

    def validate_result_factory(num_matches):
        # the returned validator also records the call on the outer mock
        def validate_results(resp):
            mock()
            assert len(resp.docs) == NUMBER_OF_SEARCHES
            for doc in resp.docs:
                assert len(doc.matches) == num_matches
        return validate_results

    with f_index:
        f_index.index(input_fn=docs)
    validate_index_size(NR_DOCS_INDEX, expected_indices=2)
    mock = mocker.Mock()
    with f_query:
        f_query.search(input_fn=docs_search,
                       # 0 because search docs have wrong shape
                       on_done=validate_result_factory(0))
    mock.assert_called_once()
    # this won't increase the index size as the ids are new
    with f_index:
        f_index.update(input_fn=docs_update)
    validate_index_size(NR_DOCS_INDEX, expected_indices=2)
    mock = mocker.Mock()
    with f_query:
        f_query.search(input_fn=docs_search,
                       # 0 because search docs have wrong shape
                       on_done=validate_result_factory(0))
    mock.assert_called_once()
    with f_index:
        f_index.delete(ids=[d.id for d in all_docs_indexed])
    validate_index_size(0, expected_indices=2)
    mock = mocker.Mock()
    with f_query:
        f_query.search(input_fn=docs_search,
                       on_done=validate_result_factory(0))
    mock.assert_called_once()
def test_update_vector(config, mocker, flow_file):
    """Chunked vector index: after an update, matches must carry the
    updated ids and none of the original ones."""
    num_searches = 10
    num_docs = 10
    num_chunks = 5
    docs_before = list(
        document_generator(start=0, num_docs=num_docs, num_chunks=num_chunks))
    # NOTE(review): the update batch uses start=10, num_docs=20 — confirm
    # against document_generator's signature that this covers the intended ids
    docs_updated = list(
        document_generator(start=10, num_docs=20, num_chunks=num_chunks))
    ids_before = list()
    ids_updated = list()

    def validate_result_factory(has_changed, num_matches):
        def validate_results(resp):
            assert len(resp.docs) == num_searches
            # NOTE(review): these appends run on every validator call, so the
            # id lists accumulate duplicates across calls; membership checks
            # below are unaffected, but this looks unintentional
            for d in docs_before:
                ids_before.append(d.id)
            for d in docs_updated:
                ids_updated.append(d.id)
            for doc in resp.docs:
                assert len(doc.matches) == num_matches
                if has_changed:
                    # matches must come exclusively from the updated batch
                    assert doc.id in ids_updated
                    assert doc.id not in ids_before
                else:
                    assert doc.id in ids_before
                    assert doc.id not in ids_updated
        return validate_results

    with Flow.load_config(flow_file) as index_flow:
        index_flow.index(input_fn=docs_before)
    validate_index_size(
        num_chunks * num_docs)  # num_docs per all its chunks, 50 in this case
    mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        search_flow.search(input_fn=document_generator(start=0,
                                                       num_docs=num_docs,
                                                       num_chunks=num_chunks),
                           on_done=mock)
    mock.assert_called_once()
    validate_callback(
        mock, validate_result_factory(has_changed=False, num_matches=TOP_K))
    with Flow.load_config(flow_file) as index_flow:
        index_flow.update(input_fn=docs_updated)
    validate_index_size(
        num_chunks * num_docs)  # num_docs per all its chunks, 50 in this case
    mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        search_flow.search(input_fn=document_generator(start=10,
                                                       num_docs=20,
                                                       num_chunks=num_chunks),
                           on_done=mock)
    mock.assert_called_once()
    validate_callback(
        mock, validate_result_factory(has_changed=True, num_matches=num_docs))
def test_wrong_mime_type(tmp_path, mocker):
    """we assign text to .text, 'image/jpeg' to .mime_type"""
    config_environ(path=tmp_path)
    flow_file = 'flow-parallel.yml'
    flow_query_file = 'flow.yml'
    docs = list(random_docs_image_mime_text_content(NR_DOCS_INDEX))
    docs_update = list(
        random_docs_image_mime_text_content(NR_DOCS_INDEX, start=len(docs) + 1))
    all_docs_indexed = docs.copy()
    all_docs_indexed.extend(docs_update)
    # query docs use ids beyond everything indexed
    docs_search = list(
        random_docs_image_mime_text_content(NUMBER_OF_SEARCHES,
                                            start=len(docs) + len(docs_update) + 1))
    f_index = Flow.load_config(flow_file)
    f_query = Flow.load_config(flow_query_file)

    def validate_result_factory(num_matches):
        # validator records the call on the outer mock and checks that the
        # pipeline corrected the mime type of every match to text/plain
        def validate_results(resp):
            mock()
            assert len(resp.docs) == NUMBER_OF_SEARCHES
            for doc in resp.docs:
                assert len(doc.matches) == num_matches
                for m in doc.matches:
                    assert m.mime_type == 'text/plain'
        return validate_results

    with f_index:
        f_index.index(input_fn=docs)
    validate_index_size(NR_DOCS_INDEX, expected_indices=2)
    mock = mocker.Mock()
    with f_query:
        f_query.search(input_fn=docs_search,
                       on_done=validate_result_factory(TOPK))
    mock.assert_called_once()
    # this won't increase the index size as the ids are new
    with f_index:
        f_index.update(input_fn=docs_update)
    validate_index_size(NR_DOCS_INDEX, expected_indices=2)
    mock = mocker.Mock()
    with f_query:
        f_query.search(input_fn=docs_search,
                       on_done=validate_result_factory(TOPK))
    mock.assert_called_once()
    with f_index:
        f_index.delete(ids=[d.id for d in all_docs_indexed])
    validate_index_size(0, expected_indices=2)
    mock = mocker.Mock()
    with f_query:
        f_query.search(input_fn=docs_search,
                       on_done=validate_result_factory(0))
    mock.assert_called_once()
def test_index_depth_0_search_depth_1(tmpdir, mocker):
    """Index whole texts at granularity 0, query with chunk-level snippets,
    and expect each query to resolve back to its full root document."""
    os.environ['JINA_TEST_LEVEL_DEPTH_WORKSPACE'] = str(tmpdir)
    # each entry is one root document made of comma-separated chunks
    full_texts = [
        'I am chunk 0 of doc 1, I am chunk 1 of doc 1, I am chunk 2 of doc 1',
        'I am chunk 0 of doc 2, I am chunk 1 of doc 2',
        'I am chunk 0 of doc 3, I am chunk 1 of doc 3, I am chunk 2 of doc 3, I am chunk 3 of doc 3',
    ]
    # one chunk-level query per root document (leading spaces are significant)
    chunk_queries = [
        ' I am chunk 1 of doc 1,',
        'I am chunk 0 of doc 2,',
        ' I am chunk 3 of doc 3',
    ]

    with Flow.load_config('flow-index.yml') as index_flow:
        index_flow.index(full_texts)

    cb_mock = mocker.Mock()

    def _check_granularity(resp):
        cb_mock()
        assert len(resp.docs) == 3
        for doc, query, full_text in zip(resp.docs, chunk_queries, full_texts):
            assert doc.granularity == 0
            assert len(doc.matches) == 3
            assert doc.matches[0].granularity == 0
            assert doc.text == query
            # the best match must be the query's own root document
            assert doc.matches[0].text == full_text

    with Flow.load_config('flow-query.yml') as search_flow:
        search_flow.search(
            input_fn=chunk_queries,
            on_done=_check_granularity,
            callback_on='body',
        )
    del os.environ['JINA_TEST_LEVEL_DEPTH_WORKSPACE']
    cb_mock.assert_called_once()
def test_flow_with_jump():
    """Build a flow with two parallel branches merged at r10 and assert the
    ZMQ socket type of every pod, both before and after a YAML round-trip."""

    def _validate(f):
        # gateway connects out to the flow and receives results back
        node = f._pod_nodes['gateway']
        assert node.head_args.socket_in == SocketType.PULL_CONNECT
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT
        # r1 publishes: both r2 (chained) and r3 (needs='r1') consume it
        node = f._pod_nodes['r1']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUB_BIND
        # r2 and r3 subscribe to r1's PUB socket
        node = f._pod_nodes['r2']
        assert node.head_args.socket_in == SocketType.SUB_CONNECT
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT
        node = f._pod_nodes['r3']
        assert node.head_args.socket_in == SocketType.SUB_CONNECT
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT
        # the rest of both branches use plain push/pull links
        node = f._pod_nodes['r4']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT
        node = f._pod_nodes['r5']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT
        node = f._pod_nodes['r6']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT
        node = f._pod_nodes['r8']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT
        node = f._pod_nodes['r9']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_BIND if False else SocketType.PUSH_CONNECT
        node = f._pod_nodes['r10']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_BIND
        # single-pea pods: the one pea doubles as both head and tail
        for name, node in f._pod_nodes.items():
            assert node.peas_args['peas'][0] == node.head_args
            assert node.peas_args['peas'][0] == node.tail_args

    f = (Flow().add(name='r1').add(name='r2').add(name='r3', needs='r1').add(
        name='r4', needs='r2').add(name='r5', needs='r3').add(
        name='r6', needs='r4').add(name='r8', needs='r6').add(
        name='r9', needs='r5').add(name='r10', needs=['r9', 'r8']))

    with f:
        _validate(f)

    f.save_config('tmp.yml')
    Flow.load_config('tmp.yml')

    with Flow.load_config('tmp.yml') as f:
        _validate(f)

    rm_files(['tmp.yml'])
def test_topk_override(config):
    """Search with a QueryLang driver that overrides top_k at request time."""
    # build the override queryset from the env-configured top_k
    override_top_k = int(os.environ['JINA_TOPK_OVERRIDE'])
    top_k_queryset = QueryLang(
        VectorSearchDriver(top_k=override_top_k, priority=1))

    with Flow.load_config('flow.yml') as index_flow:
        index_flow.index(input_fn=random_docs(100))

    with Flow.load_config('flow.yml') as search_flow:
        n_query_docs = int(os.environ['JINA_NDOCS'])
        search_flow.search(input_fn=random_docs(n_query_docs),
                           output_fn=validate_override_results,
                           queryset=[top_k_queryset])
def test_index_depth_0_search_depth_1(tmpdir, mocker, monkeypatch, restful):
    """Index whole texts at granularity 0, query with chunk-level snippets,
    and expect each query to resolve back to its full root document."""
    monkeypatch.setenv("RESTFUL", restful)
    monkeypatch.setenv("JINA_TEST_LEVEL_DEPTH_WORKSPACE", str(tmpdir))
    # each entry is one root document made of comma-separated chunks
    full_texts = [
        'I am chunk 0 of doc 1, I am chunk 1 of doc 1, I am chunk 2 of doc 1',
        'I am chunk 0 of doc 2, I am chunk 1 of doc 2',
        'I am chunk 0 of doc 3, I am chunk 1 of doc 3, I am chunk 2 of doc 3, I am chunk 3 of doc 3',
    ]
    # one chunk-level query per root document (leading spaces are significant)
    chunk_queries = [
        ' I am chunk 1 of doc 1,',
        'I am chunk 0 of doc 2,',
        ' I am chunk 3 of doc 3',
    ]

    with Flow.load_config('flow-index.yml') as index_flow:
        index_flow.index(full_texts)

    cb_mock = mocker.Mock()

    def _check_granularity(resp):
        cb_mock()
        assert len(resp.docs) == 3
        for doc, query, full_text in zip(resp.docs, chunk_queries, full_texts):
            assert doc.granularity == 0
            assert len(doc.matches) == 3
            assert doc.matches[0].granularity == 0
            assert doc.text == query
            # the best match must be the query's own root document
            assert doc.matches[0].text == full_text

    with Flow.load_config('flow-query.yml') as search_flow:
        search_flow.search(input_fn=chunk_queries,
                           on_done=_check_granularity,
                           on_error=lambda r: print(r))
    cb_mock.assert_called_once()
def test_delete_vector(config, mocker, flow_file):
    """Chunked docs: deleting every root and chunk id must empty the index."""
    n_searches = 10
    n_docs = 10
    n_chunks = 5

    def _make_validator(expected_matches):
        def _validate(resp):
            assert len(resp.docs) == n_searches
            for doc in resp.docs:
                assert len(doc.matches) == expected_matches
        return _validate

    with Flow.load_config(flow_file) as index_flow:
        index_flow.index(inputs=document_generator(
            start=0, num_docs=n_docs, num_chunks=n_chunks))
    # 5 chunks for each of the 10 docs
    validate_index_size(n_chunks * n_docs)

    cb_mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        search_flow.search(
            inputs=document_generator(start=0, num_docs=n_docs, num_chunks=n_chunks),
            on_done=cb_mock,
        )
    cb_mock.assert_called_once()
    validate_callback(cb_mock, _make_validator(TOP_K))

    # collect root-document ids plus every chunk id
    ids_to_remove = []
    for doc in document_generator(start=0, num_docs=n_docs, num_chunks=n_chunks):
        ids_to_remove.append(doc.id)
        ids_to_remove.extend(chunk.id for chunk in doc.chunks)

    with Flow.load_config(flow_file) as index_flow:
        index_flow.delete(ids=ids_to_remove)
    validate_index_size(0)

    cb_mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        search_flow.search(
            inputs=document_generator(start=0, num_docs=n_docs, num_chunks=n_chunks),
            on_done=cb_mock,
        )
    cb_mock.assert_called_once()
    validate_callback(cb_mock, _make_validator(0))
def run(task, top_k, indexer_query_type):
    """Index an fvecs dataset or query it and report Recall@top_k.

    :param task: 'index' or 'query'
    :param top_k: number of matches per query (also the recall cutoff)
    :param indexer_query_type: forwarded to query_config to pick the indexer
    :return: recall percentage (query task only), for use by tests
    """
    general_config()
    query_config(indexer_query_type)
    request_size = int(os.environ['JINA_REQUEST_SIZE'])
    dataset_name = os.environ['JINA_DATASET_NAME']
    data_dir = os.path.join(dataset_name, os.environ['JINA_TMP_DATA_DIR'])
    if task == 'index':
        data_path = os.path.join(data_dir, f'{dataset_name}_base.fvecs')
        data_func = index_generator(data_path)
        # materialize the generator so we can both count and index it
        data_func_list = list(data_func)
        with Flow.load_config('flow-index.yml') as flow:
            # NOTE(review): list(data_func_list) re-copies an existing list
            # just to take its length — len(data_func_list) would do
            with TimeContext(f'QPS: indexing {len(list(data_func_list))}', logger=flow.logger):
                flow.index(inputs=data_func_list, request_size=request_size)
    elif task == 'query':
        evaluation_results = defaultdict(float)

        # collect the last-seen value of every evaluation metric per op name
        def _get_evaluation_results(evaluation_results: dict, resp):
            for d in resp.search.docs:
                # NOTE: 'eval' shadows the builtin; harmless here
                for eval in d.evaluations:
                    evaluation_results[eval.op_name] = eval.value

        get_evaluation_results = partial(_get_evaluation_results, evaluation_results)
        data_path = os.path.join(data_dir, f'{dataset_name}_query.fvecs')
        groundtruth_path = os.path.join(data_dir, f'{dataset_name}_groundtruth.ivecs')
        query_input = list(evaluate_generator(data_path, groundtruth_path))
        with Flow.load_config('flow-query.yml') as flow:
            with TimeContext(f'QPS: query with {len(query_input)}', logger=flow.logger):
                flow.search(inputs=query_input, request_size=request_size,
                            on_done=get_evaluation_results, top_k=top_k)
        logger.info(f'evaluation: {list(evaluation_results)}')
        # assumes exactly one evaluation metric was collected — TODO confirm
        evaluation = evaluation_results[list(evaluation_results.keys())[0]]
        # return for test
        logger.info(f'Recall@{top_k} ==> {100 * evaluation}')
        return 100 * evaluation
    else:
        raise NotImplementedError(
            f'unknown task: {task}. A valid task is either `index` or `query`.'
        )
def test_flow_identical():
    """A Flow loaded from YAML, built via the fluent API, and round-tripped
    through save_config/load_config must compare equal — and, once started,
    must wire up the expected ZMQ socket types on every pod."""
    with open(os.path.join(cur_dir, '../yaml/test-flow.yml')) as fp:
        a = Flow.load_config(fp)
    # the same topology built programmatically
    b = (Flow()
         .add(name='chunk_seg', parallel=3)
         .add(name='wqncode1', parallel=2)
         .add(name='encode2', parallel=2, needs='chunk_seg')
         .join(['wqncode1', 'encode2']))
    a.save_config('test2.yml')
    c = Flow.load_config('test2.yml')
    assert a == b
    assert a == c
    with a as f:
        # gateway connects out to the flow and receives results back
        node = f._pod_nodes['gateway']
        assert node.head_args.socket_in == SocketType.PULL_CONNECT
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT
        # chunk_seg: parallel peas behind a router head; the tail publishes
        # because two pods (wqncode1, encode2) consume its output
        node = f._pod_nodes['chunk_seg']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.head_args.socket_out == SocketType.ROUTER_BIND
        for arg in node.peas_args['peas']:
            assert arg.socket_in == SocketType.DEALER_CONNECT
            assert arg.socket_out == SocketType.PUSH_CONNECT
        assert node.tail_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUB_BIND
        # wqncode1: subscribes to chunk_seg's PUB socket
        node = f._pod_nodes['wqncode1']
        assert node.head_args.socket_in == SocketType.SUB_CONNECT
        assert node.head_args.socket_out == SocketType.ROUTER_BIND
        for arg in node.peas_args['peas']:
            assert arg.socket_in == SocketType.DEALER_CONNECT
            assert arg.socket_out == SocketType.PUSH_CONNECT
        assert node.tail_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT
        # encode2: also subscribes to chunk_seg's PUB socket
        node = f._pod_nodes['encode2']
        assert node.head_args.socket_in == SocketType.SUB_CONNECT
        assert node.head_args.socket_out == SocketType.ROUTER_BIND
        for arg in node.peas_args['peas']:
            assert arg.socket_in == SocketType.DEALER_CONNECT
            assert arg.socket_out == SocketType.PUSH_CONNECT
        assert node.tail_args.socket_in == SocketType.PULL_BIND
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT
    rm_files(['test2.yml'])
def main(task, data_path, num_docs, batch_size, image_path, text_query, overwrite_workspace):
    """Entry point: index a dataset or run a single image+text query."""
    config()
    image_paths = [image_path]
    text_queries = [text_query]
    if task == 'index':
        if overwrite_workspace:
            # start from a clean workspace
            clean_workdir()
        with Flow.load_config('flow-index.yml') as f:
            f.index(index_generator(data_path, num_docs), batch_size=batch_size)
    elif task == 'query':
        with Flow.load_config('flow-query.yml') as f:
            f.search(input_fn=query_generator(image_paths, text_queries),
                     on_done=print_result)
def test_update_vector(config, mocker, flow_file):
    """Vector index: after an update, every match embedding must come from
    the replacement batch and none from the original batch."""
    n_searches = 1
    original_docs = list(random_docs(0, 10))
    replacement_docs = list(random_docs(0, 10))

    def _make_validator(has_changed):
        def _validate(resp):
            cb_mock()
            assert len(resp.docs) == n_searches
            # fingerprint embeddings so old matches can be told from new ones
            old_hashes = [hash(d.embedding.tobytes()) for d in original_docs]
            new_hashes = [hash(d.embedding.tobytes()) for d in replacement_docs]
            for doc in resp.docs:
                assert len(doc.matches) == 9
                for match in doc.matches:
                    fingerprint = hash(match.embedding.tobytes())
                    if has_changed:
                        assert fingerprint not in old_hashes
                        assert fingerprint in new_hashes
                    else:
                        assert fingerprint in old_hashes
                        assert fingerprint not in new_hashes
        return _validate

    with Flow.load_config(flow_file) as index_flow:
        index_flow.index(input_fn=original_docs)
    validate_index_size(10)

    cb_mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        search_flow.search(input_fn=list(random_docs(0, n_searches)),
                           output_fn=_make_validator(has_changed=False))
    cb_mock.assert_called_once()

    with Flow.load_config(flow_file) as index_flow:
        index_flow.update(input_fn=replacement_docs)
    validate_index_size(10)

    cb_mock = mocker.Mock()
    with Flow.load_config(flow_file) as search_flow:
        search_flow.search(input_fn=random_docs(0, n_searches),
                           output_fn=_make_validator(has_changed=True))
    cb_mock.assert_called_once()
def test_helloworld_flow_dry_run(tmpdir):
    """Both hello-world flows (index and query) must start and stop cleanly."""
    args = set_hw_parser().parse_args([])
    # expose the hello-world settings to the YAML flows via env vars
    os.environ['RESOURCE_DIR'] = resource_filename('jina', 'resources')
    os.environ['SHARDS'] = str(args.shards)
    os.environ['PARALLEL'] = str(args.parallel)
    os.environ['HW_WORKDIR'] = str(tmpdir)

    # run it!
    for flow_yml in ('helloworld.flow.index.yml', 'helloworld.flow.query.yml'):
        with Flow.load_config(
                resource_filename('jina', '/'.join(('resources', flow_yml)))):
            pass
def test_helloworld_flow(tmpdir):
    """Download the hello-world data and index it through the YAML flow."""
    args = set_hw_parser().parse_args([])
    # expose the hello-world settings to the YAML flow via env vars
    os.environ.update({
        'RESOURCE_DIR': resource_filename('jina', 'resources'),
        'SHARDS': str(args.shards),
        'PARALLEL': str(args.parallel),
        'HW_WORKDIR': str(tmpdir),
    })
    f = Flow.load_config(
        resource_filename('jina',
                          '/'.join(('resources', 'helloworld.flow.index.yml'))))
    targets = {
        'index': {
            'url': args.index_data_url,
            'filename': os.path.join(tmpdir, 'index-original'),
        },
        'query': {
            'url': args.query_data_url,
            'filename': os.path.join(tmpdir, 'query-original'),
        },
    }
    # download the data
    Path(tmpdir).mkdir(parents=True, exist_ok=True)
    download_data(targets)
    # run it!
    with f:
        f.index(_input_ndarray(targets['index']['data']),
                request_size=args.index_request_size)
def test_helloworld_flow(tmpdir):
    """Download the hello-world data and index it via py_client against the
    YAML flow's gateway."""
    args = set_hw_parser().parse_args([])
    # expose the hello-world settings to the YAML flow via env vars
    os.environ.update({
        'RESOURCE_DIR': resource_filename('jina', 'resources'),
        'SHARDS': str(args.shards),
        'PARALLEL': str(args.parallel),
        'HW_WORKDIR': str(tmpdir),
        'WITH_LOGSERVER': str(args.logserver),
    })
    f = Flow.load_config(
        resource_filename('jina',
                          '/'.join(('resources', 'helloworld.flow.index.yml'))))
    targets = {
        'index': {
            'url': args.index_data_url,
            'filename': os.path.join(tmpdir, 'index-original'),
        },
        'query': {
            'url': args.query_data_url,
            'filename': os.path.join(tmpdir, 'query-original'),
        },
    }
    # download the data
    Path(tmpdir).mkdir(parents=True, exist_ok=True)
    download_data(targets)
    # run it!
    with f:
        py_client(
            host=f.host,
            port_expose=f.port_expose,
        ).index(input_numpy(targets['index']['data']),
                batch_size=args.index_batch_size)
def test_flow_log_server(self):
    """Exercise every log-server HTTP endpoint, then verify the server is
    gone after the shutdown endpoint is hit."""
    f = Flow.load_config('../yaml/test_log_server.yml')
    with f:
        self.assertTrue(hasattr(JINA_GLOBAL.logserver, 'ready'))
        base = JINA_GLOBAL.logserver.address

        # Ready endpoint
        r = requests.get(base + '/status/ready', timeout=5)
        assert r.status_code == 200

        # YAML endpoint
        r = requests.get(base + '/data/yaml', timeout=5)
        self.assertTrue(r.text.startswith('!Flow'))
        assert r.status_code == 200

        # Pod endpoint
        r = requests.get(base + '/data/api/pod', timeout=5)
        self.assertTrue('pod' in r.json())
        assert r.status_code == 200

        # Shutdown endpoint
        r = requests.get(base + '/action/shutdown', timeout=5)
        assert r.status_code == 200

        # Check ready endpoint after shutdown, check if server stopped
        with self.assertRaises(requests.exceptions.ConnectionError):
            requests.get(base + '/status/ready', timeout=5)
def test_load_flow_from_yaml(self):
    """Load a flow from YAML and export it as a Docker swarm YAML file."""
    yaml_in = os.path.join(cur_dir, '../yaml/test-flow.yml')
    yaml_out = os.path.join(cur_dir, '../yaml/swarm-out.yml')
    with open(yaml_in) as fp:
        flow = Flow.load_config(fp)
    # the flow must be running while the swarm YAML is written
    with open(yaml_out, 'w') as fp, flow:
        flow.to_swarm_yaml(fp)
    self.add_tmpfile(yaml_out)
def test_flow_identical(self):
    """A flow loaded from YAML, one built via the fluent API, and one
    round-tripped through save_config must all compare equal."""
    with open('yaml/test-flow.yml') as fp:
        loaded = Flow.load_config(fp)
    # the same topology built programmatically
    built = (Flow()
             .add(name='chunk_seg', replicas=3)
             .add(name='wqncode1', replicas=2)
             .add(name='encode2', replicas=2, needs='chunk_seg')
             .join(['wqncode1', 'encode2']))
    loaded.save_config('test2.yml')
    reloaded = Flow.load_config('test2.yml')
    self.assertEqual(loaded, built)
    self.assertEqual(loaded, reloaded)
    self.add_tmpfile('test2.yml')
def test_flow_with_pod_envs():
    """Pods may declare their own env vars in the Flow YAML; each executor
    must see its pod-specific vars plus those inherited from the parent
    process, and nothing from other pods."""
    f = Flow.load_config('yaml/flow-with-envs.yml')

    # NOTE: these classes are instantiated by the Flow via the YAML config
    # (referenced by class name), so the assertions run inside the peas.
    class EnvChecker1(BaseExecutor):
        """Class used in Flow YAML"""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # pea/pod-specific
            assert os.environ['key1'] == 'value1'
            assert os.environ['key2'] == 'value2'
            # inherit from parent process
            assert os.environ['key_parent'] == 'value3'

    class EnvChecker2(BaseExecutor):
        """Class used in Flow YAML"""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # pea/pod-specific: this pod must NOT see the other pod's vars
            assert 'key1' not in os.environ
            assert 'key2' not in os.environ
            # inherit from parent process
            assert os.environ['key_parent'] == 'value3'

    # starting the flow triggers the executor constructors (and asserts)
    with f:
        pass
def test_flow_log_server():
    """Exercise every log-server HTTP endpoint, then verify the server is
    unreachable after the shutdown endpoint is hit."""
    f = Flow.load_config(str(cur_dir.parent / 'yaml' / 'test_log_server.yml'))
    with f:
        assert hasattr(JINA_GLOBAL.logserver, 'ready')
        base = JINA_GLOBAL.logserver.address

        # Ready endpoint
        resp = requests.get(base + '/status/ready', timeout=5)
        assert resp.status_code == 200

        # YAML endpoint
        resp = requests.get(base + '/data/yaml', timeout=5)
        assert resp.text.startswith('!Flow')
        assert resp.status_code == 200

        # Pod endpoint
        resp = requests.get(base + '/data/api/pod', timeout=5)
        assert 'pod' in resp.json()
        assert resp.status_code == 200

        # Shutdown endpoint
        resp = requests.get(base + '/action/shutdown', timeout=5)
        assert resp.status_code == 200

        # Check ready endpoint after shutdown, check if server stopped
        with pytest.raises((requests.exceptions.ConnectionError,
                            requests.exceptions.ReadTimeout)):
            requests.get(base + '/status/ready', timeout=5)
def test_load_flow_from_yaml():
    """Load a flow from YAML and export it as a Docker swarm YAML file."""
    yaml_in = os.path.join(cur_dir, '../yaml/test-flow.yml')
    yaml_out = os.path.join(cur_dir, '../yaml/swarm-out.yml')
    with open(yaml_in) as fp:
        flow = Flow.load_config(fp)
    # the flow must be running while the swarm YAML is written
    with open(yaml_out, 'w') as fp, flow:
        flow.to_swarm_yaml(fp)
    rm_files([yaml_out])