コード例 #1
0
def index(num_docs):
    """Index the configured data file with the flow from ``flow-index.yml``.

    ``num_docs`` is handed to ``config`` (with ``mode='index'``). The input
    file is ``$DATA_DIR/$DATA_FILE`` and the ``size`` passed to
    ``index_lines`` is read from the ``MAX_NUM_DOCS`` environment variable.
    """
    config(num_docs, mode='index')
    env = os.environ
    source_file = os.path.join(env['DATA_DIR'], env['DATA_FILE'])
    index_flow = Flow().load_config('flow-index.yml')
    with index_flow:
        # NOTE(review): MAX_NUM_DOCS is presumably exported by config() above
        # -- confirm against its definition.
        doc_limit = int(env['MAX_NUM_DOCS'])
        index_flow.index_lines(filepath=source_file,
                               batch_size=8,
                               size=doc_limit)
コード例 #2
0
ファイル: app.py プロジェクト: madhukar01/examples
def index(documents, size, batch_size=32):
    """Index ``documents`` with the flow described by ``INDEX_FLOW_YAML``.

    ``documents`` is passed to ``index_lines`` as ``lines``; ``size`` and
    ``batch_size`` are forwarded unchanged. A confirmation line is printed
    once the flow context has been left.
    """
    index_flow = Flow().load_config(INDEX_FLOW_YAML)
    request_options = dict(lines=documents, size=size, batch_size=batch_size)
    with index_flow:
        index_flow.index_lines(**request_options)
    print('- Indexing completed!')
コード例 #3
0
def index_documents():
    """Index the file named by ``JINA_DATA_FILE``.

    The flow is loaded from ``INDEX_FLOW_FILE_PATH`` and ``NUM_DOCS`` is
    passed to ``index_lines`` as ``size``.
    """
    index_flow = Flow().load_config(INDEX_FLOW_FILE_PATH)
    with index_flow:
        source_file = os.environ['JINA_DATA_FILE']
        index_flow.index_lines(filepath=source_file, batch_size=8, size=NUM_DOCS)
コード例 #4
0
ファイル: app.py プロジェクト: rutulgandhi05/examples
def main(task, num_docs, top_k):
    """Run one of the urbandict tasks: ``index``, ``query`` or ``query_restful``.

    A workspace obtained from ``get_random_ws`` is exported as
    ``TMP_WORKSPACE`` (and printed) before the task is dispatched.

    :param task: name of the task to run.
    :param num_docs: passed to ``index_lines`` as ``size`` when indexing.
    :param top_k: passed to ``search_lines`` as ``topk`` when querying.
    :raises NotImplementedError: for any other ``task`` value.
    """
    workspace_path = '/tmp/jina/urbandict'
    os.environ['TMP_WORKSPACE'] = get_random_ws(workspace_path)
    print(f'{os.environ["TMP_WORKSPACE"]}')
    default_csv = os.path.join(workspace_path, 'urbandict-word-defs.csv')
    data_fn = os.environ.get('WASHED_DATA_DIR', default_csv)

    if task == 'index':
        index_flow = Flow().load_config('flow-index.yml')
        with index_flow:
            index_flow.index_lines(filepath=data_fn, size=num_docs, batch_size=16)
        return

    if task == 'query':
        query_flow = Flow().load_config('flow-query.yml')
        with query_flow:
            # An empty answer at the prompt ends the interactive session.
            for text in iter(lambda: input('word definition: '), ''):
                def show_results(resp):
                    print_topk(resp, text)

                query_flow.search_lines(lines=[text], output_fn=show_results, topk=top_k)
        return

    if task == 'query_restful':
        rest_flow = Flow().load_config('flow-query.yml')
        rest_flow.use_rest_gateway()
        with rest_flow:
            rest_flow.block()
        return

    raise NotImplementedError(
        f'unknown task: {task}. A valid task is `index` or `query` or `query_restful`.')
コード例 #5
0
def index(num_docs):
    """Index the file named by ``JINA_DATA_FILE`` using ``flow-index.yml``.

    :param num_docs: passed to ``index_lines`` as ``size``.
    """
    index_flow = Flow().load_config("flow-index.yml")
    with index_flow:
        source_file = os.environ["JINA_DATA_FILE"]
        index_flow.index_lines(filepath=source_file, batch_size=8, size=num_docs)
コード例 #6
0
def index(num_docs):
    """Index the file named by ``JINA_DATA_FILE`` using ``flows/index.yml``.

    The value of ``JINA_DATA_FILE`` is joined onto the directory that
    contains this module.

    :param num_docs: passed to ``index_lines`` as ``size``.
    :raises KeyError: if ``JINA_DATA_FILE`` is not set.
    """
    # Resolve the input before touching the flow: an unset variable now fails
    # immediately with a KeyError naming it, rather than with a TypeError from
    # os.path.join(<dir>, None) raised after the flow has been started.
    data_path = os.path.join(os.path.dirname(__file__),
                             os.environ['JINA_DATA_FILE'])

    f = Flow().load_config("flows/index.yml")
    with f:
        f.index_lines(filepath=data_path,
                      batch_size=16,
                      read_mode='r',
                      size=num_docs)
コード例 #7
0
def index_documents():
    """Index the file named by ``JINA_DATA_FILE``.

    The flow is loaded from ``INDEX_FLOW_FILE_PATH`` and ``NUM_DOCS`` is
    passed to ``index_lines`` as ``size``. The value of ``JINA_DATA_FILE`` is
    joined onto the directory that contains this module.

    :raises KeyError: if ``JINA_DATA_FILE`` is not set.
    """
    # Resolve the input before touching the flow: an unset variable now fails
    # immediately with a KeyError naming it, rather than with a TypeError from
    # os.path.join(<dir>, None) raised after the flow has been started.
    data_path = os.path.join(os.path.dirname(__file__),
                             os.environ['JINA_DATA_FILE'])

    f = Flow().load_config(INDEX_FLOW_FILE_PATH)
    with f:
        f.index_lines(filepath=data_path,
                      batch_size=16,
                      read_mode='r',
                      size=NUM_DOCS)
コード例 #8
0
ファイル: app.py プロジェクト: saman-moeinsadat/examples
def index(num_docs):
    """Index two data files, one after the other, with ``flow-index.yml``.

    The files are named by ``JINA_DATA_FILE_1`` and ``JINA_DATA_FILE_2``.

    :param num_docs: passed to ``index_lines`` as ``size`` for each file.
    """
    index_flow = Flow().load_config('flow-index.yml')

    # The flow is entered once per file; the second pass re-uses the same
    # index so that its data is appended to what the first pass stored.
    for env_key in ('JINA_DATA_FILE_1', 'JINA_DATA_FILE_2'):
        with index_flow:
            print(f'Indexing file {os.environ[env_key]}')
            index_flow.index_lines(filepath=os.environ[env_key],
                                   batch_size=8,
                                   size=num_docs)
コード例 #9
0
ファイル: app.py プロジェクト: vinnu1812/examples
def main(task, num_docs, top_k):
    """Run the ``index`` or ``query`` task on the character-lines data.

    A workspace obtained from ``get_random_ws($TMP_DATA_DIR)`` is exported as
    ``TMP_WORKSPACE`` before the task is dispatched.

    :param task: name of the task to run.
    :param num_docs: passed to ``index_lines`` as ``size`` when indexing.
    :param top_k: passed to ``search`` as ``topk`` when querying.
    :raises NotImplementedError: for any other ``task`` value.
    """
    os.environ['TMP_WORKSPACE'] = get_random_ws(os.environ['TMP_DATA_DIR'])
    data_path = os.path.join(os.environ['TMP_DATA_DIR'], 'character-lines.csv')

    if task == 'index':
        index_flow = Flow().load_config('flow-index.yml')
        with index_flow:
            index_flow.index_lines(filepath=data_path, size=num_docs, batch_size=8)
        print('done')
        return

    if task == 'query':
        query_flow = Flow().load_config('flow-query.yml')
        with query_flow:
            while True:
                text = input('please type a sentence: ')
                if not text:
                    # An empty sentence ends the interactive session.
                    break

                def show_results(resp):
                    print_topk(resp, text)

                query_flow.search(read_query_data(text), callback=show_results, topk=top_k)
        return

    raise NotImplementedError(
        f'unknown task: {task}. A valid task is either `index` or `query`.')
コード例 #10
0
def index(num_docs):
    """Index two data files, one after the other, with ``flows/index.yml``.

    The files are named by ``JINA_DATA_FILE_1`` and ``JINA_DATA_FILE_2``;
    each value is joined onto the directory that contains this module.

    :param num_docs: passed to ``index_lines`` as ``size`` for each file.
    """
    index_flow = Flow().load_config('flows/index.yml')

    # The flow is entered once per file.
    for env_key in ('JINA_DATA_FILE_1', 'JINA_DATA_FILE_2'):
        with index_flow:
            file_name = os.environ[env_key]
            print(f'Indexing {file_name}')
            data_path = os.path.join(os.path.dirname(__file__), file_name)
            index_flow.index_lines(filepath=data_path,
                                   request_size=16,
                                   read_mode='r',
                                   size=num_docs)
コード例 #11
0
ファイル: split.py プロジェクト: alfred297/Pooja-AI
    def test_sentencier_en_trim_spaces(self):
        """
        Leading and trailing spaces are trimmed from every chunk,
        extra spaces inside a chunk are kept,
        and chunks consisting only of spaces are ignored.
        """
        raw_text = '  This ,  text is...  . Amazing !!'
        crafter = Sentencizer()
        chunk_texts = [piece['text'] for piece in crafter.craft(raw_text, 0)]
        spans = [piece['location'] for piece in crafter.craft(raw_text, 0)]

        self.assertListEqual(chunk_texts, ["This ,  text is...", "Amazing"])
        # The locations index into the untrimmed input, so the slices still
        # carry the leading spaces.
        first_span = spans[0]
        self.assertEqual(raw_text[first_span[0]:first_span[1]], '  This ,  text is...')
        second_span = spans[1]
        self.assertEqual(raw_text[second_span[0]:second_span[1]], ' Amazing')

        def validate(req):
            doc_chunks = req.docs[0].chunks
            self.assertEqual(doc_chunks[0].text, 'This ,  text is...')
            self.assertEqual(doc_chunks[1].text, 'Amazing')

        # Run the same input through a flow and check the chunks it produces.
        flow = Flow().add(yaml_path='!Sentencizer')
        with flow:
            flow.index_lines(['  This ,  text is...  . Amazing !!'],
                             output_fn=validate,
                             callback_on_body=True)
コード例 #12
0
def test_sentencizer_en_trim_spaces():
    """
    Trimming all spaces at the beginning and end of the chunks.
    Keeping extra spaces inside chunks.
    Ignoring chunks with only spaces.
    """
    sentencizer = Sentencizer()
    text = '  This ,  text is...  . Amazing !!'
    chunks = [i['text'] for i in sentencizer.segment(text)]
    locs = [i['location'] for i in sentencizer.segment(text)]
    # These must be real ``==`` comparisons: ``assert value, other`` only
    # checks that ``value`` is truthy and treats ``other`` as the failure
    # message, so it can never catch a wrong segmentation.
    assert chunks == ["This ,  text is...", "Amazing"]
    # Locations index into the untrimmed input, hence the leading spaces.
    assert text[locs[0][0]:locs[0][1]] == '  This ,  text is...'
    assert text[locs[1][0]:locs[1][1]] == ' Amazing'

    def validate(req):
        assert req.docs[0].chunks[0].text == 'This ,  text is...'
        assert req.docs[0].chunks[1].text == 'Amazing'

    # Run the same input through a flow and check the chunks it produces.
    f = Flow().add(uses='!Sentencizer')
    with f:
        f.index_lines(['  This ,  text is...  . Amazing !!'], on_done=validate, callback_on_body=True, line_format='csv')
コード例 #13
0
def index_documents():
    """Index data in three rounds and check the index size after each one.

    The flow is loaded from ``INDEX_FLOW_FILE_PATH`` and ``NUM_DOCS`` is
    passed to ``index_lines`` as ``size`` in every round.
    """
    index_flow = Flow().load_config(INDEX_FLOW_FILE_PATH)

    # (environment variable naming the data file, index size expected afterwards)
    rounds = (
        ('JINA_DATA_FILE_1', 50),
        # a new set of docs
        ('JINA_DATA_FILE_2', 100),
        # the same set of docs as in the second round: size is expected to stay at 100
        ('JINA_DATA_FILE_2', 100),
    )

    for env_key, expected_size in rounds:
        # Leaving the ``with`` block closes the flow before the size check.
        with index_flow:
            index_flow.index_lines(filepath=os.environ[env_key],
                                   batch_size=8,
                                   size=NUM_DOCS)
        assert_index_size(expected_size)
コード例 #14
0
ファイル: app.py プロジェクト: saman-moeinsadat/examples
def main(task, num_docs, top_k):
    """Run one of the urbandict tasks: ``index``, ``query`` or ``query_restful``.

    A workspace obtained from ``get_random_ws`` is exported as ``WORKDIR``
    before the task is dispatched.

    :param task: name of the task to run.
    :param num_docs: passed to ``index_lines`` as ``size`` when indexing.
    :param top_k: passed to ``search_lines`` as ``top_k`` when querying.
    :raises NotImplementedError: for any other ``task`` value.
    """
    workspace_path = '/tmp/jina/urbandict'
    os.environ['WORKDIR'] = get_random_ws(workspace_path)
    fallback_csv = os.path.join(workspace_path, 'urbandict-word-defs.csv')
    data_fn = os.environ.get('WASHED_DATA_DIR', fallback_csv)

    if task == 'index':
        workspace = os.environ['WORKDIR']
        if os.path.exists(workspace):
            # Only a warning banner: indexing still proceeds afterwards.
            print(
                f'\n +---------------------------------------------------------------------------------+ \
                    \n |                                   🤖🤖🤖                                        | \
                    \n | The directory {workspace} already exists. Please remove it before indexing again. | \
                    \n |                                   🤖🤖🤖                                        | \
                    \n +---------------------------------------------------------------------------------+'
            )
        index_flow = Flow().load_config('flow-index.yml')
        with index_flow:
            index_flow.index_lines(filepath=data_fn, size=num_docs, batch_size=16)
        return

    if task == 'query':
        query_flow = Flow().load_config('flow-query.yml')
        with query_flow:
            while True:
                text = input('word definition: ')
                if not text:
                    # An empty answer ends the interactive session.
                    break

                def show_results(resp):
                    print_topk(resp, text)

                query_flow.search_lines(lines=[text], output_fn=show_results, top_k=top_k)
        return

    if task == 'query_restful':
        rest_flow = Flow().load_config('flow-query.yml')
        rest_flow.use_rest_gateway()
        with rest_flow:
            rest_flow.block()
        return

    raise NotImplementedError(
        f'unknown task: {task}. A valid task is `index` or `query` or `query_restful`.'
    )
コード例 #15
0
def main(task, top_k, num_docs):
    """Run the ``index`` or ``query`` task on the web-text question data.

    :param task: name of the task to run.
    :param top_k: passed to ``search`` as ``topk`` when querying.
    :param num_docs: passed to ``index_lines`` as ``size`` when indexing.
    :raises NotImplementedError: for any other ``task`` value.
    """
    if task == 'index':
        data_fn = os.path.join(workspace_path, "pre_web_text_zh_valid.json")
        index_flow = Flow().load_config('flow-index.yml')
        with index_flow:
            index_flow.index_lines(filepath=data_fn, size=num_docs, batch_size=32)
        return

    if task == 'query':
        query_flow = Flow().load_config('flow-query.yml')
        with query_flow:
            while True:
                title = input('请输入问题: ')
                if not title:
                    # An empty question ends the interactive session.
                    break
                query_item = {'title': title}

                def show_results(resp):
                    print_topk(resp)

                query_flow.search(read_query_data(query_item), output_fn=show_results, topk=top_k)
        return

    raise NotImplementedError(
        f'unknown task: {task}. A valid task is either `index` or `query`.'
    )
コード例 #16
0
def main(task, top_k):
    """Run the ``index`` or ``query`` task on the news data.

    Indexing always requests ``size=100``.

    :param task: name of the task to run.
    :param top_k: passed to ``search`` as ``top_k`` when querying.
    :raises NotImplementedError: for any other ``task`` value.
    """
    if task == 'index':
        data_fn = os.path.join(workspace_path, "pre_news2016zh_valid.json")
        index_flow = Flow().load_config('flow-index.yml')
        with index_flow:
            index_flow.index_lines(filepath=data_fn, size=100, batch_size=32)
        return

    if task == 'query':
        query_flow = Flow().load_config('flow-query.yml')
        with query_flow:
            # An empty answer at the prompt ends the interactive session.
            for content in iter(lambda: input('请输入新闻内容: '), ''):
                query_item = {'content': content}

                def show_results(resp):
                    print_topk(resp)

                query_flow.search(read_query_data(query_item), callback=show_results, top_k=top_k)
        return

    raise NotImplementedError(
        f'unknown task: {task}. A valid task is either `index` or `query`.'
    )
コード例 #17
0
def index(workspace, file, num_docs):
    """
    Index a file so that it can be searched afterwards.

    ``workspace`` is exported as the ``WORKSPACE`` environment variable and
    the directory of the bundled ``flow-index.yml`` as ``SRC`` before the flow
    is loaded. ``num_docs`` is converted with ``int`` unless it is ``None``
    and then passed to ``index_lines`` as ``size``.

    Raises ImportError when ``jina`` cannot be imported.
    """
    try:
        from jina.flow import Flow
    except ImportError:
        raise ImportError(
            "Jina is not installed. Did you install the package with .[app] extras?"
        )
    from stock import search, utils

    if num_docs is not None:
        num_docs = int(num_docs)

    flow_yml = utils.resource_filename("flow-index.yml")
    os.environ["SRC"] = os.path.dirname(flow_yml)
    os.environ["WORKSPACE"] = workspace

    search.set_config()
    index_flow = Flow().load_config(flow_yml)
    logger.debug(f"Loading config from {flow_yml}")
    with index_flow:
        index_flow.index_lines(filepath=file, batch_size=8, size=num_docs)
コード例 #18
0
ファイル: app.py プロジェクト: carlosb1/examples-python
def index(num_docs, max_docs, data_file):
    """Index ``data_file`` with the flow from ``flow-index.yml``.

    ``max_docs`` is passed to ``index_lines`` as ``size``; ``num_docs`` is
    accepted but not used by this function.
    """
    index_flow = Flow().load_config('flow-index.yml')
    request_options = dict(filepath=data_file, batch_size=8, size=max_docs)
    with index_flow:
        index_flow.index_lines(**request_options)