def index(num_docs):
    """Index the file at ``$DATA_DIR/$DATA_FILE`` with the ``flow-index.yml`` Flow.

    Args:
        num_docs: forwarded to ``config(num_docs, mode='index')``. The number
            of lines actually indexed is read from ``MAX_NUM_DOCS``.
    """
    # NOTE(review): config() presumably sets the environment variables read
    # below (e.g. MAX_NUM_DOCS) -- confirm against its definition.
    config(num_docs, mode='index')

    data_dir = os.environ['DATA_DIR']
    data_file = os.environ['DATA_FILE']
    data_path = os.path.join(data_dir, data_file)

    index_flow = Flow().load_config('flow-index.yml')
    with index_flow:
        max_docs = int(os.environ['MAX_NUM_DOCS'])
        index_flow.index_lines(filepath=data_path, batch_size=8, size=max_docs)
def index(documents, size, batch_size=32):
    """Index ``documents`` with the Flow loaded from ``INDEX_FLOW_YAML``.

    Args:
        documents: passed to ``index_lines`` as ``lines``.
        size: passed to ``index_lines`` as ``size``.
        batch_size: passed to ``index_lines`` as ``batch_size`` (default 32).
    """
    index_flow = Flow().load_config(INDEX_FLOW_YAML)
    request_options = {
        'lines': documents,
        'size': size,
        'batch_size': batch_size,
    }
    with index_flow:
        index_flow.index_lines(**request_options)
    print('- Indexing completed!')
def index_documents():
    """Index the file named by ``JINA_DATA_FILE`` with the index Flow.

    The Flow is loaded from ``INDEX_FLOW_FILE_PATH`` and ``NUM_DOCS`` is
    passed as ``size``.
    """
    index_flow = Flow().load_config(INDEX_FLOW_FILE_PATH)
    with index_flow:
        data_file = os.environ['JINA_DATA_FILE']
        index_flow.index_lines(filepath=data_file, batch_size=8, size=NUM_DOCS)
def main(task, num_docs, top_k):
    """Run one task: ``index``, ``query`` or ``query_restful``.

    Args:
        task: name of the task to run.
        num_docs: passed as ``size`` when indexing.
        top_k: passed as ``topk`` when querying interactively.

    Raises:
        NotImplementedError: if ``task`` is none of the three known tasks.
    """
    workspace_path = '/tmp/jina/urbandict'
    os.environ['TMP_WORKSPACE'] = get_random_ws(workspace_path)
    print(f'{os.environ["TMP_WORKSPACE"]}')

    # WASHED_DATA_DIR, when set, overrides the default CSV location.
    fallback_csv = os.path.join(workspace_path, 'urbandict-word-defs.csv')
    data_fn = os.environ.get('WASHED_DATA_DIR', fallback_csv)

    if task not in ('index', 'query', 'query_restful'):
        raise NotImplementedError(
            f'unknown task: {task}. A valid task is `index` or `query` or `query_restful`.')

    if task == 'index':
        index_flow = Flow().load_config('flow-index.yml')
        with index_flow:
            index_flow.index_lines(filepath=data_fn, size=num_docs, batch_size=16)
        return

    query_flow = Flow().load_config('flow-query.yml')

    if task == 'query_restful':
        query_flow.use_rest_gateway()
        with query_flow:
            query_flow.block()
        return

    with query_flow:
        while True:
            text = input('word definition: ')
            if not text:
                # An empty line ends the interactive session.
                break

            def ppr(x):
                return print_topk(x, text)

            query_flow.search_lines(lines=[text], output_fn=ppr, topk=top_k)
def index(num_docs):
    """Index the file named by ``JINA_DATA_FILE`` with ``flow-index.yml``.

    Args:
        num_docs: passed to ``index_lines`` as ``size``.
    """
    index_flow = Flow().load_config("flow-index.yml")
    with index_flow:
        data_file = os.environ["JINA_DATA_FILE"]
        index_flow.index_lines(filepath=data_file, batch_size=8, size=num_docs)
def index(num_docs):
    """Index the file named by ``JINA_DATA_FILE`` with ``flows/index.yml``.

    The value of ``JINA_DATA_FILE`` is joined onto the directory that
    contains this module.

    Args:
        num_docs: passed to ``index_lines`` as ``size``.

    Raises:
        KeyError: if ``JINA_DATA_FILE`` is not set.
    """
    # Resolve the input before the Flow is created: a missing variable now
    # fails immediately with a KeyError that names it, instead of an opaque
    # TypeError from os.path.join(..., None) after the Flow has been entered.
    data_file = os.environ['JINA_DATA_FILE']
    data_path = os.path.join(os.path.dirname(__file__), data_file)

    f = Flow().load_config("flows/index.yml")
    with f:
        f.index_lines(filepath=data_path, batch_size=16, read_mode='r', size=num_docs)
def index_documents():
    """Index the file named by ``JINA_DATA_FILE`` with the index Flow.

    The value of ``JINA_DATA_FILE`` is joined onto the directory that
    contains this module. The Flow is loaded from ``INDEX_FLOW_FILE_PATH``
    and ``NUM_DOCS`` is passed as ``size``.

    Raises:
        KeyError: if ``JINA_DATA_FILE`` is not set.
    """
    # Resolve the input before the Flow is created: a missing variable now
    # fails immediately with a KeyError that names it, instead of an opaque
    # TypeError from os.path.join(..., None) after the Flow has been entered.
    data_file = os.environ['JINA_DATA_FILE']
    data_path = os.path.join(os.path.dirname(__file__), data_file)

    f = Flow().load_config(INDEX_FLOW_FILE_PATH)
    with f:
        f.index_lines(filepath=data_path, batch_size=16, read_mode='r', size=NUM_DOCS)
def index(num_docs):
    """Index ``JINA_DATA_FILE_1`` and then ``JINA_DATA_FILE_2``.

    Args:
        num_docs: passed to ``index_lines`` as ``size`` for each file.
    """
    index_flow = Flow().load_config('flow-index.yml')

    # The Flow is entered once per file; the second pass re-uses the same
    # index to append new data.
    for env_key in ('JINA_DATA_FILE_1', 'JINA_DATA_FILE_2'):
        with index_flow:
            data_file = os.environ[env_key]
            print(f'Indexing file {data_file}')
            index_flow.index_lines(filepath=data_file, batch_size=8, size=num_docs)
def main(task, num_docs, top_k):
    """Run one task: ``index`` or ``query``.

    Args:
        task: name of the task to run.
        num_docs: passed as ``size`` when indexing.
        top_k: passed as ``topk`` when querying.

    Raises:
        NotImplementedError: if ``task`` is neither ``index`` nor ``query``.
    """
    data_dir = os.environ['TMP_DATA_DIR']
    os.environ['TMP_WORKSPACE'] = get_random_ws(data_dir)
    data_path = os.path.join(os.environ['TMP_DATA_DIR'], 'character-lines.csv')

    if task not in ('index', 'query'):
        raise NotImplementedError(
            f'unknown task: {task}. A valid task is either `index` or `query`.')

    if task == 'index':
        index_flow = Flow().load_config('flow-index.yml')
        with index_flow:
            index_flow.index_lines(filepath=data_path, size=num_docs, batch_size=8)
        print('done')
        return

    query_flow = Flow().load_config('flow-query.yml')
    with query_flow:
        # Keep prompting until the user submits an empty line.
        for text in iter(lambda: input('please type a sentence: '), ''):

            def ppr(x):
                return print_topk(x, text)

            query_flow.search(read_query_data(text), callback=ppr, topk=top_k)
def index(num_docs):
    """Index ``JINA_DATA_FILE_1`` and then ``JINA_DATA_FILE_2``.

    Each variable's value is joined onto the directory that contains this
    module.

    Args:
        num_docs: passed to ``index_lines`` as ``size`` for each file.
    """
    here = os.path.dirname(__file__)
    index_flow = Flow().load_config('flows/index.yml')

    # One Flow session per file, in order.
    for env_key in ('JINA_DATA_FILE_1', 'JINA_DATA_FILE_2'):
        with index_flow:
            file_name = os.environ[env_key]
            print(f'Indexing {file_name}')
            data_path = os.path.join(here, file_name)
            index_flow.index_lines(
                filepath=data_path,
                request_size=16,
                read_mode='r',
                size=num_docs,
            )
def test_sentencier_en_trim_spaces(self):
    """
    Trimming all spaces at the beginning and end of the chunks.
    Keeping extra spaces inside chunks.
    Ignoring chunks with only spaces.
    """
    sentencizer = Sentencizer()
    text = ' This , text is... . Amazing !!'

    chunks = []
    for piece in sentencizer.craft(text, 0):
        chunks.append(piece['text'])
    locs = []
    for piece in sentencizer.craft(text, 0):
        locs.append(piece['location'])

    self.assertListEqual(chunks, ["This , text is...", "Amazing"])

    # The reported locations index into the untrimmed input text.
    first_loc = locs[0]
    self.assertEqual(text[first_loc[0]:first_loc[1]], ' This , text is...')
    second_loc = locs[1]
    self.assertEqual(text[second_loc[0]:second_loc[1]], ' Amazing')

    def validate(req):
        doc_chunks = req.docs[0].chunks
        self.assertEqual(doc_chunks[0].text, 'This , text is...')
        self.assertEqual(req.docs[0].chunks[1].text, 'Amazing')

    flow = Flow().add(yaml_path='!Sentencizer')
    with flow:
        flow.index_lines([text], output_fn=validate, callback_on_body=True)
def test_sentencizer_en_trim_spaces():
    """
    Trimming all spaces at the beginning and end of the chunks.
    Keeping extra spaces inside chunks.
    Ignoring chunks with only spaces.
    """
    sentencizer = Sentencizer()
    text = ' This , text is... . Amazing !!'

    # Segment once and derive both views from the same result.
    segments = list(sentencizer.segment(text))
    chunks = [i['text'] for i in segments]
    locs = [i['location'] for i in segments]

    # These were previously written as `assert value, message`, which only
    # checks truthiness and can never fail on a wrong value.
    assert chunks == ["This , text is...", "Amazing"]
    assert text[locs[0][0]:locs[0][1]] == ' This , text is...'
    assert text[locs[1][0]:locs[1][1]] == ' Amazing'

    def validate(req):
        assert req.docs[0].chunks[0].text == 'This , text is...'
        assert req.docs[0].chunks[1].text == 'Amazing'

    f = Flow().add(uses='!Sentencizer')
    with f:
        f.index_lines([' This , text is... . Amazing !!'], on_done=validate, callback_on_body=True, line_format='csv')
def index_documents():
    """Index three batches in sequence and check the index size after each.

    The Flow loaded from ``INDEX_FLOW_FILE_PATH`` is entered and closed once
    per batch; ``assert_index_size`` runs after each close.
    """
    index_flow = Flow().load_config(INDEX_FLOW_FILE_PATH)

    # (environment variable naming the file, expected index size afterwards).
    # The third stage repeats the second file and expects the size to stay
    # at 100.
    stages = (
        ('JINA_DATA_FILE_1', 50),
        ('JINA_DATA_FILE_2', 100),
        ('JINA_DATA_FILE_2', 100),
    )
    for env_key, expected_size in stages:
        with index_flow:
            data_file = os.environ[env_key]
            index_flow.index_lines(filepath=data_file, batch_size=8, size=NUM_DOCS)
        assert_index_size(expected_size)
def main(task, num_docs, top_k):
    """Run one task: ``index``, ``query`` or ``query_restful``.

    Args:
        task: name of the task to run.
        num_docs: passed as ``size`` when indexing.
        top_k: passed as ``top_k`` when querying interactively.

    Raises:
        NotImplementedError: if ``task`` is none of the three known tasks.
    """
    workspace_path = '/tmp/jina/urbandict'
    os.environ['WORKDIR'] = get_random_ws(workspace_path)

    # WASHED_DATA_DIR, when set, overrides the default CSV location.
    fallback_csv = os.path.join(workspace_path, 'urbandict-word-defs.csv')
    data_fn = os.environ.get('WASHED_DATA_DIR', fallback_csv)

    if task not in ('index', 'query', 'query_restful'):
        raise NotImplementedError(
            f'unknown task: {task}. A valid task is `index` or `query` or `query_restful`.'
        )

    if task == 'index':
        workspace = os.environ['WORKDIR']
        if os.path.exists(workspace):
            # NOTE(review): this only prints a warning; indexing still runs
            # afterwards -- confirm that is intended.
            banner = (
                '\n +---------------------------------------------------------------------------------+ '
                '\n | 🤖🤖🤖 | '
                f'\n | The directory {workspace} already exists. Please remove it before indexing again. | '
                '\n | 🤖🤖🤖 | '
                '\n +---------------------------------------------------------------------------------+'
            )
            print(banner)
        index_flow = Flow().load_config('flow-index.yml')
        with index_flow:
            index_flow.index_lines(filepath=data_fn, size=num_docs, batch_size=16)
        return

    query_flow = Flow().load_config('flow-query.yml')

    if task == 'query_restful':
        query_flow.use_rest_gateway()
        with query_flow:
            query_flow.block()
        return

    with query_flow:
        while True:
            text = input('word definition: ')
            if not text:
                # An empty line ends the interactive session.
                break

            def ppr(x):
                return print_topk(x, text)

            query_flow.search_lines(lines=[text], output_fn=ppr, top_k=top_k)
def main(task, top_k, num_docs):
    """Run one task: ``index`` or ``query``.

    Args:
        task: name of the task to run.
        top_k: passed as ``topk`` when querying.
        num_docs: passed as ``size`` when indexing.

    Raises:
        NotImplementedError: if ``task`` is neither ``index`` nor ``query``.
    """
    if task not in ('index', 'query'):
        raise NotImplementedError(
            f'unknown task: {task}. A valid task is either `index` or `query`.'
        )

    if task == 'index':
        data_fn = os.path.join(workspace_path, "pre_web_text_zh_valid.json")
        index_flow = Flow().load_config('flow-index.yml')
        with index_flow:
            # NOTE: an earlier variant fed lines=read_data(data_fn, num_docs)
            # instead of a filepath.
            index_flow.index_lines(filepath=data_fn, size=num_docs, batch_size=32)
        return

    query_flow = Flow().load_config('flow-query.yml')
    with query_flow:
        while True:
            title = input('请输入问题: ')
            if not title:
                # An empty line ends the interactive session.
                break
            item = {'title': title}

            def ppr(x):
                return print_topk(x)

            query_flow.search(read_query_data(item), output_fn=ppr, topk=top_k)
def main(task, top_k, num_docs=100):
    """Run one task: ``index`` or ``query``.

    Args:
        task: name of the task to run.
        top_k: passed as ``top_k`` when querying.
        num_docs: passed as ``size`` when indexing. Defaults to 100, the
            value that was previously hard-coded, so existing callers are
            unaffected.

    Raises:
        NotImplementedError: if ``task`` is neither ``index`` nor ``query``.
    """
    if task == 'index':
        data_fn = os.path.join(workspace_path, "pre_news2016zh_valid.json")
        flow = Flow().load_config('flow-index.yml')
        with flow:
            flow.index_lines(filepath=data_fn, size=num_docs, batch_size=32)
    elif task == 'query':
        flow = Flow().load_config('flow-query.yml')
        with flow:
            while True:
                content = input('请输入新闻内容: ')
                if not content:
                    # An empty line ends the interactive session.
                    break
                item = {'content': content}

                def ppr(x):
                    return print_topk(x)

                flow.search(read_query_data(item), callback=ppr, top_k=top_k)
    else:
        raise NotImplementedError(
            f'unknown task: {task}. A valid task is either `index` or `query`.'
        )
def index(workspace, file, num_docs):
    """
    Index a file ready for searching.

    Args:
        workspace: exported as the ``WORKSPACE`` environment variable.
        file: passed to ``index_lines`` as ``filepath``.
        num_docs: passed to ``index_lines`` as ``size``; converted with
            ``int()`` unless it is ``None``.

    Raises:
        ImportError: if ``jina`` cannot be imported. The original import
            error is attached as ``__cause__``.
    """
    try:
        from jina.flow import Flow
    except ImportError as err:
        # Chain the original error so the real reason stays visible.
        raise ImportError(
            "Jina is not installed. Did you install the package with .[app] extras?"
        ) from err

    from stock import search, utils

    if num_docs is not None:
        num_docs = int(num_docs)

    yml = utils.resource_filename("flow-index.yml")
    os.environ["SRC"] = os.path.dirname(yml)
    os.environ["WORKSPACE"] = workspace
    search.set_config()

    # Log before loading so the message is emitted even if loading fails.
    logger.debug(f"Loading config from {yml}")
    flow = Flow().load_config(yml)

    with flow:
        flow.index_lines(filepath=file, batch_size=8, size=num_docs)
def index(num_docs, max_docs, data_file):
    """Index ``data_file`` with the ``flow-index.yml`` Flow.

    Args:
        num_docs: accepted but not used by this function.
        max_docs: passed to ``index_lines`` as ``size``.
        data_file: passed to ``index_lines`` as ``filepath``.
    """
    index_flow = Flow().load_config('flow-index.yml')
    request_options = {
        'filepath': data_file,
        'batch_size': 8,
        'size': max_docs,
    }
    with index_flow:
        index_flow.index_lines(**request_options)