def test_flow_yaml_dump():
    """A Flow's optimize_level survives a save/load round-trip through YAML."""
    flow = Flow(optimize_level=FlowOptimizeLevel.IGNORE_GATEWAY, no_gateway=True)
    flow.save_config('test1.yml')
    reloaded = Flow.load_config('test1.yml')
    assert flow.args.optimize_level == reloaded.args.optimize_level
    rm_files(['test1.yml'])
def test_query_multi_modal_text():
    """Search the multimodal query Flow with a plain-text Document."""
    query_doc = Document(text='It makes sense to first define what we mean by multimodality before going into more fancy terms.')
    query_flow = Flow.load_config('flows/query.yml')
    with query_flow:
        query_flow.post('/search', inputs=query_doc, on_done=assert_result)
def test_query_multi_modal_image():
    """Search the multimodal query Flow with an image file."""
    query_flow = Flow.load_config('flows/query.yml')
    with query_flow:
        query_flow.post(
            '/search',
            inputs=search_generator(data_path='toy_data/photo-1.png'),
            read_mode='r',
            on_done=assert_result,
        )
def test_flow_with_pod_envs():
    """Per-pod env vars declared in the Flow YAML reach only their own executor."""
    flow = Flow.load_config('yaml/flow-with-envs.yml')

    # NOTE: class names below are referenced by the YAML file — do not rename.
    class EnvChecker1(BaseExecutor):
        """Class used in Flow YAML"""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # pod-specific env vars must be visible inside this executor
            assert os.environ['key1'] == 'value1'
            assert os.environ['key2'] == 'value2'
            # env inherited from the parent process
            assert os.environ['key_parent'] == 'value3'

    class EnvChecker2(BaseExecutor):
        """Class used in Flow YAML"""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # the other pod's env vars must NOT leak into this executor
            assert 'key1' not in os.environ
            assert 'key2' not in os.environ
            # env inherited from the parent process
            assert os.environ['key_parent'] == 'value3'

    # entering the Flow instantiates both executors, firing the asserts above
    with flow:
        pass
def test_query_text(tmpdir_factory):
    """Query the Flow with a text Document and validate chunk/parent matches."""

    def _check(response):
        docs = response.docs
        # exactly one result doc, split into two chunks
        assert len(docs) == 1
        assert len(docs[0].chunks) == 2
        parents = docs[0].matches
        parent_ids = parents.get_attributes('id')
        assert len(parents) > 0
        for chunk in docs[0].chunks:
            # top_k = 5 was requested via parameters
            assert len(chunk.matches) == 5
            ids = chunk.matches.get_attributes('id')
            # match ids within a chunk must be unique
            assert len(ids) == len(set(ids))
            for match in chunk.matches:
                assert match.text is not None
                assert match.location is not None
                assert match.parent_id in parent_ids
                # each match text must occur inside its parent document's text
                parent = parents[parent_ids.index(match.parent_id)]
                assert match.text in parent.text

    flow = Flow.load_config('flows/query.yml')
    with flow:
        query = Document(
            content='looked through every window then. hello world.',
            mime_type='text/plain',
        )
        results = flow.post(
            '/search', inputs=query, parameters={'top_k': 5}, return_results=True
        )
        _check(results[0])
def test_flow_identity_override():
    """identity is per-pod by default, shared when set on the Flow, and the
    gateway keeps its own identity when pods are patched after load."""
    f = Flow().add().add(shards=2).add(shards=2)
    with f:
        # no override: every pod gets a distinct identity
        assert len({pod.args.identity for _, pod in f}) == f.num_pods

    f = Flow(identity='123456').add().add(shards=2).add(shards=2)
    with f:
        # Flow-level identity is propagated to all pods
        assert len({pod.args.identity for _, pod in f}) == 1

    y = '''
!Flow
version: '1.0'
executors:
  - name: hello
  - name: world
    shards: 3
'''
    f = Flow.load_config(y)
    for _, pod in f:
        pod.args.identity = '1234'
    with f:
        # the gateway is re-created on start, so two identities remain
        assert len({pod.args.identity for _, pod in f}) == 2
        for _, pod in f:
            if pod.args.identity != '1234':
                assert pod.name == 'gateway'
def main(task, num_docs):
    """Entry point: dispatch to the indexing/querying routine named by ``task``.

    :param task: one of 'index', 'query', 'query_text', 'query_image',
        'query_pdf', 'query_restful', 'dryrun'
    :param num_docs: number of PDF files to index (only used when task=='index')
    """
    config()
    if task == 'index':
        workspace = os.environ['JINA_WORKSPACE']
        # refuse to index on top of an existing workspace to avoid corrupting it
        if os.path.exists(workspace):
            print(
                f'\n +---------------------------------------------------------------------------------+ \ \n | ������ | \ \n | The directory {workspace} already exists. Please remove it before indexing again. | \ \n | ������ | \ \n +---------------------------------------------------------------------------------+'
            )
            sys.exit(1)
        pdf_files = glob.glob(os.path.join(PDF_DATA_PATH, '*.pdf'))
        index(pdf_files[:num_docs])
    if task == 'query':
        query()
    if task == 'query_text':
        query_text()
    if task == 'query_image':
        query_image()
    if task == 'query_pdf':
        query_pdf()
    if task == 'query_restful':
        # serve the multimodal query Flow (REST) until interrupted
        f = Flow.load_config('flows/query-multimodal.yml')
        with f:
            f.block()
    if task == "dryrun":
        dryrun()
def test_dump(tmpdir, nr_docs, emb_size, shards):
    """Index docs into the DBMS Flow, dump them to disk, and validate the dump."""
    docs = list(get_documents(nr=nr_docs, index_start=0, emb_size=emb_size))
    assert len(docs) == nr_docs

    dump_path = os.path.join(str(tmpdir), 'dump_dir')
    os.environ['DBMS_WORKSPACE'] = os.path.join(str(tmpdir), 'index_ws')
    print('DBMS_WORKSPACE ', os.environ['DBMS_WORKSPACE'])

    with Flow.load_config('flow_dbms.yml') as flow_dbms:
        with TimeContext(f'### indexing {len(docs)} docs'):
            flow_dbms.index(docs)
        with TimeContext(f'### dumping {len(docs)} docs'):
            flow_dbms.dump('indexer_dbms', dump_path, shards=shards, timeout=-1)
        dir_size = path_size(dump_path)
        print(f'### dump path size: {dir_size} MBs')

    idx_bin = os.path.join(os.environ['DBMS_WORKSPACE'], 'psql-0', 'psql.bin')
    with BaseExecutor.load(idx_bin) as idx:
        assert idx.size == nr_docs

    # assert data dumped is correct, shard by shard
    for pea_id in range(shards):
        assert_dump_data(dump_path, docs, shards, pea_id)

    # clean up so the following tests start from an empty index
    with BaseExecutor.load(idx_bin) as idx:
        idx.delete([d.id for d in docs])
def test_flow_identity_override():
    """identity is per-pod by default, shared when set on the Flow, and the
    gateway keeps its own identity when pods are patched after load."""
    f = Flow().add().add(parallel=2).add(parallel=2)
    with f:
        # no override: each pod gets a distinct identity
        assert len({pod.args.identity for _, pod in f}) == f.num_pods

    f = Flow(identity='123456').add().add(parallel=2).add(parallel=2)
    with f:
        # Flow-level identity is propagated to all pods
        assert len({pod.args.identity for _, pod in f}) == 1

    y = '''
!Flow
version: '1.0'
pods:
  - uses: _pass
  - uses: _pass
    parallel: 3
'''
    f = Flow.load_config(y)
    for _, pod in f:
        pod.args.identity = '1234'
    with f:
        # the gateway is re-created on start, so two identities remain
        assert len({pod.args.identity for _, pod in f}) == 2
        for _, pod in f:
            if pod.args.identity != '1234':
                assert pod.name == 'gateway'
def test_flow_with_jump(tmpdir):
    """A Flow with branching/merging topology survives a YAML round-trip."""
    f = (
        Flow()
        .add(name='r1')
        .add(name='r2')
        .add(name='r3', needs='r1')
        .add(name='r4', needs='r2')
        .add(name='r5', needs='r3')
        .add(name='r6', needs='r4')
        .add(name='r8', needs='r6')
        .add(name='r9', needs='r5')
        .add(name='r10', needs=['r9', 'r8'])
    )
    with f:
        _validate_flow(f)

    yml_path = os.path.join(str(tmpdir), 'tmp.yml')
    f.save_config(yml_path)

    # loading must work both standalone and as a context manager
    Flow.load_config(yml_path)
    with Flow.load_config(yml_path) as f:
        _validate_flow(f)
def flow(request, temp_workspace):
    """Fixture: build the conditional-dump Flow either in Python or from YAML."""
    if request.param == 'python':
        workspace = os.environ['TEMP_WORKSPACE']
        return (
            Flow()
            .add(name='first')
            .add(
                uses=ConditionDumpExecutor,
                uses_metas={'name': 'exec1'},
                workspace=workspace,
                name='exec1',
                needs=['first'],
                # exec1 only receives docs tagged with type == 1
                when={'tags__type': {'$eq': 1}},
            )
            .add(
                uses=ConditionDumpExecutor,
                workspace=workspace,
                uses_metas={'name': 'exec2'},
                name='exec2',
                needs='first',
                # exec2 only receives docs tagged with type > 1
                when={'tags__type': {'$gt': 1}},
            )
            .needs_all('joiner')
        )
    return Flow.load_config(os.path.join(cur_dir, 'flow.yml'))
def query_image():
    """Search the image-only index with a sample photo; results go to get_pdf."""
    flow = Flow.load_config('flows/query-only-image.yml')
    with flow:
        print('image search:')
        flow.search(
            input_fn=search_generator(data_path='toy_data/photo-1.png'),
            read_mode='r',
            on_done=get_pdf,
        )
def test_flow_identical(tmpdir):
    """YAML-defined, programmatically-built and round-tripped Flows are equal,
    and wire up the expected ZMQ socket types."""
    with open(os.path.join(cur_dir, '../yaml/test-flow.yml')) as fp:
        a = Flow.load_config(fp)

    b = (
        Flow()
        .add(name='chunk_seg', parallel=3)
        .add(name='wqncode1', parallel=2)
        .add(name='encode2', parallel=2, needs='chunk_seg')
        .join(['wqncode1', 'encode2'])
    )

    a.save_config(os.path.join(str(tmpdir), 'test2.yml'))
    c = Flow.load_config(os.path.join(str(tmpdir), 'test2.yml'))

    assert a == b
    assert a == c

    def _assert_peas(node):
        # every inner pea receives via DEALER and sends via PUSH
        for arg in node.peas_args['peas']:
            assert arg.socket_in == SocketType.DEALER_CONNECT
            assert arg.socket_out == SocketType.PUSH_CONNECT

    with a as f:
        node = f._pod_nodes['gateway']
        assert node.head_args.socket_in == SocketType.PULL_CONNECT
        assert node.tail_args.socket_out == SocketType.PUSH_CONNECT

        node = f._pod_nodes['chunk_seg']
        assert node.head_args.socket_in == SocketType.PULL_BIND
        assert node.head_args.socket_out == SocketType.ROUTER_BIND
        _assert_peas(node)
        assert node.tail_args.socket_in == SocketType.PULL_BIND
        # chunk_seg fans out to two downstream pods, hence PUB on the tail
        assert node.tail_args.socket_out == SocketType.PUB_BIND

        # both downstream pods subscribe to chunk_seg and share the same wiring
        for name in ('wqncode1', 'encode2'):
            node = f._pod_nodes[name]
            assert node.head_args.socket_in == SocketType.SUB_CONNECT
            assert node.head_args.socket_out == SocketType.ROUTER_BIND
            _assert_peas(node)
            assert node.tail_args.socket_in == SocketType.PULL_BIND
            assert node.tail_args.socket_out == SocketType.PUSH_CONNECT
def test_load_legacy_and_v1():
    """Legacy and v1 Flow YAML syntaxes load; an unknown version raises."""
    for good_yaml in (
        'yaml/flow-legacy-syntax.yml',
        'yaml/flow-v1-syntax.yml',
        # unknown minor version: should fall back to the v1 parser
        'yaml/flow-v1.0-syntax.yml',
    ):
        Flow.load_config(good_yaml)

    with pytest.raises(BadFlowYAMLVersion):
        Flow.load_config('yaml/flow-v99-syntax.yml')
def test_dump_load_build(monkeypatch):
    """A built Flow dumped via JAML and rebuilt keeps user-set args; gateway
    args are only carried over when JINA_FULL_CLI is set."""
    f: Flow = Flow.load_config('''
jtype: Flow
with:
  name: abc
  port_expose: 12345
  protocol: http
executors:
  - name: executor1
    port_in: 45678
    parallel: 2
  - name: executor2
    uses: docker://exec
    host: 1.2.3.4
  - name: executor3
    uses: docker://exec
    parallel: 2
''').build()
    f['gateway'].args.runs_in_docker = True
    f['executor1'].args.runs_in_docker = True

    # round-trip WITHOUT JINA_FULL_CLI set
    f1: Flow = Flow.load_config(JAML.dump(f)).build()
    # gateway doesn't have custom args set, as env was not set
    assert not f1['gateway'].args.runs_in_docker
    # these were passed by the user, so they survive the round-trip
    assert f1['executor1'].args.runs_in_docker
    assert f.port_expose == f1.port_expose
    assert f.protocol == f1.protocol
    assert f['executor1'].args.port_in == f1['executor1'].args.port_in
    assert f['executor2'].args.host == f1['executor2'].args.host
    # these ports were assigned during `load_config`
    assert f['executor2'].args.port_in == f1['executor2'].args.port_in
    assert f['executor3'].args.port_out == f1['executor3'].args.port_out
    # gateway args are not preserved when `JINA_FULL_CLI` is not set
    assert f['gateway'].args.port_in != f1['gateway'].args.port_in
    assert f['gateway'].args.port_out != f1['gateway'].args.port_out

    # round-trip WITH JINA_FULL_CLI set
    monkeypatch.setenv('JINA_FULL_CLI', 'true')
    f2: Flow = Flow.load_config(JAML.dump(f)).build()
    assert f2['gateway'].args.runs_in_docker
    # user-passed executor args still survive
    assert f2['executor1'].args.runs_in_docker
    assert f.port_expose == f2.port_expose
    # gateway args (set during build) are now carried over too
    assert f['gateway'].args.port_in == f2['gateway'].args.port_in
    assert f['gateway'].args.port_out == f2['gateway'].args.port_out
    assert f['gateway'].args.port_ctrl == f2['gateway'].args.port_ctrl
def add(
    self,
    args: Namespace,
    port_mapping: Optional[PortMappings] = None,
    envs: Optional[Dict] = None,
    **kwargs,
) -> PartialStoreItem:
    """Starts a Flow in `partial-daemon`.

    :param args: namespace args for the flow
    :param port_mapping: ports to be set
    :param envs: environment variables to be passed into partial flow
        (defaults to no extra variables)
    :param kwargs: keyword args
    :return: Item describing the Flow object
    """
    # FIX: `envs` previously defaulted to a mutable `{}`, which Python shares
    # across all calls of this method; use None as the sentinel instead.
    try:
        if not args.uses:
            raise ValueError(
                'uses yaml file was not specified in flow definition')
        elif not Path(args.uses).is_file():
            raise ValueError(f'uses {args.uses} not found in workspace')

        self.object: Flow = Flow.load_config(args.uses).build()
        self.object.workspace_id = jinad_args.workspace_id
        self.object.workspace = __partial_workspace__
        self.object.env = {'HOME': __partial_workspace__, **(envs or {})}

        for deployment in self.object._deployment_nodes.values():
            # NOTE(review): result currently unused; call kept in case
            # `update_runtime_cls` has side effects on `deployment.args`
            runtime_cls = update_runtime_cls(deployment.args,
                                             copy=True).runtime_cls
            # only replicated deployments need their head-pod ports remapped
            if port_mapping and (hasattr(deployment.args, 'replicas')
                                 and deployment.args.replicas > 1):
                for pod_args in [deployment.pod_args['head']]:
                    if pod_args.name in port_mapping.pod_names:
                        for port_name in Ports.__fields__:
                            self._set_pod_ports(pod_args, port_mapping,
                                                port_name)
                deployment.update_worker_pod_args()
        self.object = self.object.__enter__()
    except Exception as e:
        # undo a partially-started Flow before propagating the error
        if hasattr(self, 'object'):
            self.object.__exit__(type(e), e, e.__traceback__)
        self._logger.error(f'{e!r}')
        raise
    else:
        with open(args.uses) as yaml_file:
            yaml_source = yaml_file.read()
        self.item = PartialFlowItem(
            arguments={
                'port_expose': self.object.port_expose,
                'protocol': self.object.protocol.name,
                **vars(self.object.args),
            },
            yaml_source=yaml_source,
        )
        self._logger.success(f'Flow is created successfully!')
        return self.item
def main(index_num_docs, evaluate_num_docs, request_size, data_set, model_name, evaluation_mode):
    """Optionally index, then run evaluation queries and print the MRR score."""
    config(model_name)

    if index_num_docs > 0:
        with Flow.load_config('flow-index.yml') as f:
            f.use_rest_gateway()
            f.index(
                input_fn=input_index_data(index_num_docs, request_size, data_set),
                request_size=request_size,
            )

    eval_flow = Flow.load_config('flow-query.yml').add(
        name='evaluator', uses='yaml/evaluate.yml'
    )
    with eval_flow as flow_eval:
        flow_eval.search(
            input_fn=evaluation_generator(
                evaluate_num_docs, request_size, data_set, mode=evaluation_mode
            ),
            on_done=print_evaluation_score,
        )
        # sum_of_score / num_of_searches are accumulated by the callback above
        print(f'MeanReciprocalRank is: {sum_of_score / num_of_searches}')
def index(pdf_files):
    """Index the given PDF files through the indexing Flow, one doc per request."""
    flow = Flow.load_config('flows/index.yml')
    with flow:
        with TimeContext(f'QPS: indexing {len(pdf_files)}', logger=flow.logger):
            from jina.clients.helper import pprint_routes

            flow.index(
                inputs=index_generator(data_path=pdf_files),
                read_mode='r',
                on_done=pprint_routes,
                request_size=1,
            )
def index(data_set, num_docs, request_size):
    """Index ``num_docs`` documents from ``data_set``, timing the throughput."""
    with Flow.load_config('flow-index.yml') as flow:
        with TimeContext(f'QPS: indexing {num_docs}', logger=flow.logger):
            flow.index(
                inputs=input_index_data(num_docs, request_size, data_set),
                request_size=request_size,
            )
def test_load_flow_from_yaml():
    """A YAML-defined Flow can emit a docker-swarm YAML while running."""
    with open(cur_dir.parent / 'yaml' / 'test-flow.yml') as fp:
        flow = Flow.load_config(fp)

    out_path = str(cur_dir.parent / 'yaml' / 'swarm-out.yml')
    with flow:
        with open(out_path, 'w') as fp:
            flow.to_swarm_yaml(fp)
    rm_files([out_path])
def query_pdf():
    """Search the PDF-only index with a sample PDF; results go to get_pdf."""
    flow = Flow.load_config('flows/query-only-pdf.yml')
    with flow:
        print('pdf search:')
        flow.search(
            input_fn=search_generator(data_path='toy_data/blog2-pages-1.pdf'),
            read_mode='r',
            on_done=get_pdf,
        )
def port_expose(self) -> str:
    """
    Sets `port_expose` for the Flow started in `mini-jinad`.
    NOTE: this port needs to be exposed before starting `mini-jinad`, hence set here.

    :return: port_expose
    """
    flow = Flow.load_config(str(self.localpath()))
    # fall back to a random free port when the YAML does not pin one
    return flow.port_expose or random_port()
def index(num_docs):
    """Index ``num_docs`` generated documents through the indexing Flow.

    :param num_docs: number of documents to feed to the Flow
    :raises ValueError: if the JINA_DATA_FILE environment variable is not set
    """
    # FIX: `os.environ.get('JINA_DATA_FILE', None)` used to feed None into
    # os.path.join, crashing with an opaque TypeError when the env var was
    # unset; fail early with a clear message instead.
    data_file = os.environ.get('JINA_DATA_FILE')
    if data_file is None:
        raise ValueError('the JINA_DATA_FILE environment variable is not set')

    flow = Flow.load_config('flows/index.yml')
    with flow:
        input_docs = input_generator(num_docs=num_docs)
        data_path = os.path.join(os.path.dirname(__file__), data_file)
        flow.logger.info(f'Indexing {data_path}')
        flow.post(on='/index', inputs=input_docs, request_size=10,
                  show_progress=True)
def query_text():
    """Search the text-only index with a hard-coded query; results go to get_pdf."""
    flow = Flow.load_config('flows/query-only-text.yml')
    with flow:
        query = Document()
        # query taken from blog1; alternatives for other blogs:
        #   'We all know about CRUD[1]. Every app out there does it.'   (blog2)
        #   'Developing a Jina app often means writing YAML configs.'   (blog3)
        query.text = 'It makes sense to first define what we mean by multimodality before going into morefancy terms.'
        print('text search:')
        flow.search(input_fn=query, on_done=get_pdf)
def test_flow_identical(tmpdir):
    """YAML-defined, programmatically-built and round-tripped Flows are equal."""
    with open(os.path.join(cur_dir, '../../../yaml/test-flow.yml')) as fp:
        from_yaml = Flow.load_config(fp)

    programmatic = (
        Flow()
        .add(name='chunk_seg', shards=3)
        .add(name='wqncode1', shards=2)
        .add(name='encode2', shards=2, needs='chunk_seg')
        .join(['wqncode1', 'encode2'])
    )

    from_yaml.save_config(os.path.join(str(tmpdir), 'test2.yml'))
    round_tripped = Flow.load_config(os.path.join(str(tmpdir), 'test2.yml'))

    assert from_yaml == programmatic
    assert from_yaml == round_tripped

    with from_yaml as f:
        _validate_flow(f)
def query(num_doc, target: dict):
    """Query the index with generated docs and render the HTML result page."""
    flow = Flow.load_config('flows/query.yml')
    with flow:
        with TimeContext(f'QPS: query with {num_doc}', logger=flow.logger):
            flow.search(
                query_generator(num_doc, target),
                shuffle=True,
                size=128,
                on_done=print_result,
                request_size=32,
                top_k=TOP_K,
            )
    write_html(os.path.join(os.getenv('JINA_WORKDIR'), 'hello-world.html'))
def test_flow_yaml_dump():
    """logserver_config and optimize_level survive a YAML round-trip."""
    flow = Flow(
        logserver_config=str(cur_dir.parent / 'yaml' / 'test-server-config.yml'),
        optimize_level=FlowOptimizeLevel.IGNORE_GATEWAY,
        no_gateway=True,
    )
    flow.save_config('test1.yml')
    reloaded = Flow.load_config('test1.yml')
    assert flow.args.logserver_config == reloaded.args.logserver_config
    assert flow.args.optimize_level == reloaded.args.optimize_level
    rm_files(['test1.yml'])
def test_add_needs_inspect(tmpdir):
    """A Flow with inspect + needs built in Python equals its YAML counterpart."""
    f1 = (
        Flow()
        .add(name='pod0', needs='gateway')
        .add(name='pod1', needs='gateway')
        .inspect()
        .needs(['pod0', 'pod1'])
    )
    with f1:
        f1.index_ndarray(np.random.random([5, 5]), on_done=print)

    f2 = Flow.load_config('yaml/flow-v1.0-syntax.yml')
    with f2:
        f2.index_ndarray(np.random.random([5, 5]), on_done=print)

    assert f1 == f2
def test_add_needs_inspect(tmpdir):
    """A Flow with inspect + needs built in Python equals its YAML counterpart."""
    f1 = (
        Flow()
        .add(name='executor0', needs='gateway')
        .add(name='executor1', needs='gateway')
        .inspect()
        .needs(['executor0', 'executor1'])
    )
    with f1:
        _ = f1.index(from_ndarray(np.random.random([5, 5])))

    f2 = Flow.load_config('yaml/flow-v1.0-syntax.yml')
    with f2:
        _ = f2.index(from_ndarray(np.random.random([5, 5])))

    assert f1 == f2
def index(num_docs: int):
    """Index up to ``num_docs`` images found under IMAGE_SRC."""
    # never request more documents than there are image files on disk
    available = len(glob(os.path.join(os.getcwd(), IMAGE_SRC), recursive=True))
    num_docs = min(num_docs, available)

    with Flow.load_config('flows/index.yml') as flow:
        document_generator = from_files(IMAGE_SRC, size=num_docs)
        flow.post(
            on='/index',
            inputs=DocumentArray(document_generator),
            request_size=64,
            read_mode='rb',
        )