def main(task, batch_size, top_k):
    """Run the ``index`` or ``query`` task over the siftsmall ``.fvecs`` files.

    :param task: ``'index'`` or ``'query'``; selects which flow is loaded.
    :param batch_size: forwarded to ``index_ndarray`` when indexing.
    :param top_k: forwarded to ``search_ndarray`` and to ``save_topk`` when
        querying.
    :raises NotImplementedError: if ``task`` is neither ``'index'`` nor
        ``'query'``.
    :raises KeyError: if the ``TMP_DATA_DIR`` environment variable is unset.
    """
    # Side effect: publishes the value returned by get_random_ws() through the
    # WORKDIR environment variable.
    os.environ['WORKDIR'] = get_random_ws(os.environ['TMP_DATA_DIR'])
    if task == 'index':
        data_path = os.path.join(os.environ['TMP_DATA_DIR'], 'siftsmall_base.fvecs')
        if os.path.exists(data_path):
            # NOTE(review): this tests the *input* file that is read just
            # below, yet the banner talks about a directory that must be
            # removed -- presumably the workspace was meant; confirm.
            # NOTE(review): only a warning is printed; indexing still proceeds.
            # NOTE(review): the U+FFFD characters and the `\ ` sequences in the
            # literal look like mis-decoded characters and collapsed line
            # continuations -- restore them from the original source if
            # available.
            print(
                f'\n +---------------------------------------------------------------------------------+ \ \n | ������ | \ \n | The directory {data_path} already exists. Please remove it before indexing again. | \ \n | ������ | \ \n +---------------------------------------------------------------------------------+'
            )
        flow = Flow().load_config('flow-index.yml')
        with flow.build() as fl:
            fl.index_ndarray(read_data(data_path), batch_size=batch_size)
    elif task == 'query':
        data_path = os.path.join(os.environ['TMP_DATA_DIR'], 'siftsmall_query.fvecs')
        flow = Flow().load_config('flow-query.yml')
        with flow.build() as fl:
            # Callback handed to the search call; it passes each result to
            # save_topk together with the query_results.txt path under
            # TMP_DATA_DIR.
            ppr = lambda x: save_topk(
                x,
                os.path.join(os.environ['TMP_DATA_DIR'], 'query_results.txt'
                    ), top_k)
            fl.search_ndarray(read_data(data_path), output_fn=ppr, top_k=top_k)
    else:
        raise NotImplementedError(
            f'unknown task: {task}. A valid task is either `index` or `query`.'
            )
def main(task, num_docs, top_k, path):
    """Index the bundled JPG data, or query with bundled or user-supplied images.

    :param task: ``'index'`` or ``'query'``.
    :param num_docs: forwarded to ``read_data`` when indexing.
    :param top_k: forwarded to the search call when querying.
    :param path: optional JPG file or directory to query with; when falsy the
        user is prompted and the bundled ``jpg`` data is used unless they
        answer ``y`` (in which case the process exits with status 0).
    :raises NotImplementedError: if ``task`` is neither ``'index'`` nor
        ``'query'``.
    """
    # Side effect: the generated workspace is published via TMP_WORKSPACE.
    os.environ['TMP_WORKSPACE'] = get_random_ws(os.environ['TMP_DATA_DIR'])
    data_path = os.path.join(os.environ['TMP_DATA_DIR'], 'jpg')

    if task not in ('index', 'query'):
        raise NotImplementedError(
            f'unknown task: {task}. A valid task is either `index` or `query`.')

    if task == 'index':
        with Flow().load_config('flow-index.yml').build() as fl:
            fl.index(raw_bytes=read_data(data_path, num_docs), batch_size=2)
        return

    # --- query ---
    if path:
        # Caller supplied their own image(s): read those instead.
        reader = read_custom_data
        data_path = path
    else:
        # Show the command line for querying with custom images and let the
        # user bail out so they can re-run with it.
        cmd_prompt = '\033[{}mpython {} -t query -p <JPG file or directory>\033[0m'.format(32, sys.argv[0])
        answer = input(f'You can specify a JPG file or directory you own to query: {cmd_prompt}\nDo you want? Please input y or n: ')
        if answer == 'y':
            sys.exit(0)
        reader = read_data

    def on_result(resp):
        # Hands each search result to save_topk with the PNG output path.
        return save_topk(
            resp, os.path.join(os.environ['TMP_DATA_DIR'], 'query_results.png'))

    with Flow().load_config('flow-query.yml').build() as fl:
        fl.search(reader(data_path, 5), callback=on_result, top_k=top_k)
def main(task, num_docs, top_k):
    """Index or query the ``jpg`` data found under ``TMP_DATA_DIR``.

    :param task: ``'index'`` or ``'query'``.
    :param num_docs: forwarded to ``read_data`` when indexing.
    :param top_k: forwarded to the search call when querying.
    :raises NotImplementedError: if ``task`` is neither ``'index'`` nor
        ``'query'``.
    """
    # Side effect: the generated workspace is published via TMP_WORKSPACE.
    os.environ['TMP_WORKSPACE'] = get_random_ws(os.environ['TMP_DATA_DIR'])
    images_dir = os.path.join(os.environ['TMP_DATA_DIR'], 'jpg')

    if task not in ('index', 'query'):
        raise NotImplementedError(
            f'unknown task: {task}. A valid task is either `index` or `query`.')

    if task == 'index':
        with Flow().load_config('flow-index.yml').build() as fl:
            fl.index(raw_bytes=read_data(images_dir, num_docs), batch_size=2)
        return

    def on_result(resp):
        # Hands each search result to save_topk with the PNG output path.
        return save_topk(
            resp, os.path.join(os.environ['TMP_DATA_DIR'], 'query_results.png'))

    with Flow().load_config('flow-query.yml').build() as fl:
        # The query set is read with a fixed count of 5.
        fl.search(read_data(images_dir, 5), callback=on_result, top_k=top_k)
def main(task, num_docs, top_k):
    """Index ``character-lines.csv`` or answer interactive sentence queries.

    :param task: ``'index'`` or ``'query'``.
    :param num_docs: forwarded to ``read_data`` when indexing.
    :param top_k: forwarded to the search call (as ``topk``) when querying.
    :raises NotImplementedError: if ``task`` is neither ``'index'`` nor
        ``'query'``.
    """
    # Side effect: the generated workspace is published via TMP_WORKSPACE.
    os.environ['TMP_WORKSPACE'] = get_random_ws(os.environ['TMP_DATA_DIR'])
    csv_path = os.path.join(os.environ['TMP_DATA_DIR'], 'character-lines.csv')

    if task not in ('index', 'query'):
        raise NotImplementedError(
            f'unknown task: {task}. A valid task is either `index` or `query`.'
        )

    if task == 'index':
        with Flow().load_config('flow-index.yml').build() as fl:
            fl.index(buffer=read_data(csv_path, num_docs), batch_size=8)
        print('done')
        return

    with Flow().load_config('flow-query.yml').build() as fl:
        # Keep prompting until the user submits an empty line.
        for text in iter(lambda: input('please type a sentence: '), ''):
            def show(resp):
                # Pairs each result with the sentence that produced it.
                return print_topk(resp, text)

            fl.search(read_query_data(text), callback=show, topk=top_k)
def main(task, batch_size, top_k):
    """Index or query the siftsmall ``.fvecs`` files under ``TMP_DATA_DIR``.

    :param task: ``'index'`` or ``'query'``.
    :param batch_size: forwarded to ``index_ndarray`` when indexing.
    :param top_k: forwarded to ``search_ndarray`` and ``save_topk`` when
        querying.
    :raises NotImplementedError: if ``task`` is neither ``'index'`` nor
        ``'query'``.
    """
    # Side effect: the generated workspace is published via TMP_WORKSPACE.
    os.environ['TMP_WORKSPACE'] = get_random_ws(os.environ['TMP_DATA_DIR'])

    if task == 'index':
        base_file = os.path.join(os.environ['TMP_DATA_DIR'],
                                 'siftsmall_base.fvecs')
        with Flow().load_config('flow-index.yml').build() as fl:
            fl.index_ndarray(read_data(base_file), batch_size=batch_size)
        return

    if task == 'query':
        query_file = os.path.join(os.environ['TMP_DATA_DIR'],
                                  'siftsmall_query.fvecs')

        def on_result(resp):
            # Hands each search result to save_topk with the text output path.
            return save_topk(
                resp,
                os.path.join(os.environ['TMP_DATA_DIR'], 'query_results.txt'),
                top_k)

        with Flow().load_config('flow-query.yml').build() as fl:
            fl.search_ndarray(read_data(query_file), output_fn=on_result,
                              top_k=top_k)
        return

    raise NotImplementedError(
        f'unknown task: {task}. A valid task is either `index` or `query`.'
    )
def main(task, num_docs, top_k):
    """Index the urbandict JSON file or answer interactive definition queries.

    :param task: ``'index'`` or ``'query'``.
    :param num_docs: forwarded to ``read_data`` when indexing.
    :param top_k: forwarded to the search call (as ``topk``) when querying.
    :raises NotImplementedError: if ``task`` is neither ``'index'`` nor
        ``'query'``.
    """
    workspace_path = '/tmp/jina/urbandict'
    # Side effect: the generated workspace is published via TMP_WORKSPACE and
    # echoed to stdout.
    os.environ['TMP_WORKSPACE'] = get_random_ws(workspace_path)
    print(os.environ['TMP_WORKSPACE'])
    data_fn = os.path.join(workspace_path, 'urbandict-word-defs.json')

    if task not in ('index', 'query'):
        raise NotImplementedError(
            f'unknown task: {task}. A valid task is either `index` or `query`.')

    if task == 'index':
        with Flow().load_config('flow-index.yml').build() as fl:
            fl.index(buffer=read_data(data_fn, num_docs), batch_size=16)
        return

    with Flow().load_config('flow-query.yml').build() as fl:
        # Keep prompting until the user submits an empty line.
        for text in iter(lambda: input('word definition: '), ''):
            def show(resp):
                # Pairs each result with the text that produced it.
                return print_topk(resp, text)

            fl.search(read_query_data(text), callback=show, topk=top_k)
def f2():
    """Index the raw bytes of the matched GIF files through a log-server Flow.

    Builds a Flow with ``logserver=True`` (configured from
    ``test-server-config.yml``) containing the ``gif2chunk2.yml`` pod, then
    indexes the contents of the files matched by ``GIF_BLOB`` in batches of 8.

    ``replicas``, ``num_docs`` and ``GIF_BLOB`` are not defined in this block;
    they are resolved from the enclosing scope.
    """
    f = Flow(logserver=True, logserver_config='test-server-config.yml').add(
        yaml_path='gif2chunk2.yml', replicas=replicas)

    def bytes_gen():
        """Lazily yield the binary contents of at most ``num_docs`` files."""
        for g in glob.glob(GIF_BLOB)[:num_docs]:
            # Each file is opened only when the consumer asks for the next
            # item and is closed again before the generator resumes.
            with open(g, 'rb') as fp:
                yield fp.read()

    with f.build() as fl:
        fl.index(bytes_gen(), batch_size=8)
def f1():
    """Index the encoded *paths* of the matched GIF files.

    Unlike reading the files, this feeds the Flow the path strings returned by
    ``glob`` after ``str.encode()``; the file contents are not opened here.

    ``replicas``, ``num_docs`` and ``GIF_BLOB`` are not defined in this block;
    they are resolved from the enclosing scope.
    """
    flow = Flow().add(yaml_path='gif2chunk.yml', replicas=replicas)

    # The glob and slice run right away; only the encoding is deferred.
    matched_paths = glob.glob(GIF_BLOB)[:num_docs]
    encoded_paths = (p.encode() for p in matched_paths)

    with flow.build() as fl:
        fl.index(encoded_paths, batch_size=128)