def test_flow_with_modalities(tmpdir):
    """Index three documents through a two-branch flow and check both indexes.

    Each input document carries a ``title: ...`` part and a ``body: ...``
    part in its text. The flow sends the segmenter output through two
    encoder/indexer branches (``encoder1 -> indexer1`` and
    ``encoder2 -> indexer2``) and joins them. After indexing, the test checks
    the vector file and the key-value index written by each branch.

    :param tmpdir: pytest temporary directory; its path is exported through
        ``JINA_TEST_FLOW_MULTIMODE_WORKSPACE`` and the index files are read
        back from it.
    """
    # NOTE(review): presumably the YAML configs read this variable to decide
    # where to write their index files -- confirm against the yaml/ directory.
    os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE'] = str(tmpdir)

    # The try/finally guarantees the variable is removed even when an
    # assertion fails, so a failure here cannot leak state into other tests.
    try:
        def input_fn():
            doc1 = jina_pb2.Document()
            doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
            doc1.id = uid.new_doc_id(doc1)

            doc2 = jina_pb2.Document()
            doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
            doc2.id = uid.new_doc_id(doc2)

            doc3 = jina_pb2.Document()
            doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
            doc3.id = uid.new_doc_id(doc3)

            return [doc1, doc2, doc3]

        # encoder2 explicitly branches off the crafter, so the two
        # encoder/indexer pairs form parallel branches that are joined at the end.
        flow = Flow().add(name='crafter', uses='!MockSegmenter'). \
            add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml')). \
            add(name='indexer1', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'), needs=['encoder1']). \
            add(name='encoder2', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'), needs=['crafter']). \
            add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml')). \
            join(['indexer1', 'indexer2'])

        with flow:
            flow.index(input_fn=input_fn, override_doc_id=False)

        # First branch: three 3-dimensional vectors, all zeros.
        with open(tmpdir.join('vec1.gz'), 'rb') as fp:
            result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(
                result,
                np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]))

        # Second branch: three 3-dimensional vectors, all ones.
        with open(tmpdir.join('vec2.gz'), 'rb') as fp:
            result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(
                result,
                np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]))

        chunkIndexer1 = BinaryPbIndexer.load(tmpdir.join('kvidx1.bin'))
        assert chunkIndexer1.size == 3
        d_id = list(chunkIndexer1.query_handler.header.keys())[0]

        # query() returns a serialized document that is parsed back here.
        query_doc = jina_pb2.Document()
        query_doc.ParseFromString(chunkIndexer1.query(d_id))
        assert query_doc.text == 'title: this is mode1 from doc1'
        assert query_doc.modality == 'mode1'

        chunkIndexer2 = BinaryPbIndexer.load(tmpdir.join('kvidx2.bin'))
        assert chunkIndexer2.size == 3
        d_id = list(chunkIndexer2.query_handler.header.keys())[0]

        query_doc = jina_pb2.Document()
        query_doc.ParseFromString(chunkIndexer2.query(d_id))
        assert query_doc.text == ' body: this is mode2 from doc1'
        assert query_doc.modality == 'mode2'
    finally:
        os.environ.pop('JINA_TEST_FLOW_MULTIMODE_WORKSPACE', None)
# Example #2
# 0
def test_flow_with_modalities():
    """Index three documents through a two-branch flow and check both indexes.

    Every document text holds a ``title: ...`` part and a ``body: ...`` part.
    After indexing, the test reads ``vec1.gz`` / ``vec2.gz`` and
    ``kvidx1.bin`` / ``kvidx2.bin`` via relative paths and checks their
    contents.
    """
    def input_fn():
        texts = (
            'title: this is mode1 from doc1, body: this is mode2 from doc1',
            'title: this is mode1 from doc2, body: this is mode2 from doc2',
            'title: this is mode1 from doc3, body: this is mode2 from doc3',
        )
        documents = []
        # Ids are the integers 1..3, assigned before the text.
        for doc_id, text in enumerate(texts, start=1):
            document = Document()
            document.id = doc_id
            document.text = text
            documents.append(document)
        return documents

    encoder1_yaml = os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml')
    indexer1_yaml = os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml')
    encoder2_yaml = os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml')
    indexer2_yaml = os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml')

    # encoder2 branches off the crafter; both branches are joined at the end.
    pipeline = (
        Flow()
        .add(name='crafter', uses='!MockSegmenter')
        .add(name='encoder1', uses=encoder1_yaml)
        .add(name='indexer1', uses=indexer1_yaml, needs=['encoder1'])
        .add(name='encoder2', uses=encoder2_yaml, needs=['crafter'])
        .add(name='indexer2', uses=indexer2_yaml)
        .join(['indexer1', 'indexer2'])
    )

    with pipeline:
        pipeline.index(input_fn=input_fn)

    # Each vector file must hold a 3x3 matrix filled with a single value.
    for vector_file, fill_value in (('vec1.gz', 0.0), ('vec2.gz', 1.0)):
        with open(vector_file, 'rb') as handle:
            stored = np.frombuffer(handle.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(stored, np.full((3, 3), fill_value))

    # Each key-value index must hold six entries tagged with its modality.
    for index_file, modality in (('kvidx1.bin', 'mode1'), ('kvidx2.bin', 'mode2')):
        chunk_indexer = BinaryPbIndexer.load(index_file)
        assert chunk_indexer.size == 6
        stored_keys = list(chunk_indexer.query_handler.header.keys())
        first_key = stored_keys[0]
        assert chunk_indexer.query(first_key).modality == modality
def test_binary_pb():
    """Round-trip 100 random documents through a ``BinaryPbIndexer``.

    The documents are added and saved with one indexer, then a second indexer
    is loaded from the saved path and every document is queried back by its
    position.
    """
    num_docs = 100
    docs = list(random_docs(num_docs, jitter=50))

    with BinaryPbIndexer('test-shelf') as writer:
        writer.add(docs)
        writer.save()

    # The save path is read from the first indexer after its context exits.
    with BinaryPbIndexer.load(writer.save_abspath) as reader:
        assert reader.size == num_docs
        for position in range(num_docs):
            assert reader.query(position) == docs[position]
def test_flow_with_modalities(tmpdir, restful):
    """Index three documents through a two-branch flow and check both indexes.

    Each input document carries a ``title: ...`` part and a ``body: ...``
    part in its text. The flow sends the segmenter output through two
    encoder/indexer branches (``encoder1 -> indexer1`` and
    ``encoder2 -> indexer2``) and joins them. After indexing, the test checks
    the vector file and the key-value index written by each branch under
    ``<tmpdir>/compound/``.

    :param tmpdir: pytest temporary directory; its path is exported through
        ``JINA_TEST_FLOW_MULTIMODE_WORKSPACE`` and the index files are read
        back from it.
    :param restful: forwarded unchanged to ``Flow(restful=...)``.
    """
    # NOTE(review): presumably the YAML configs read this variable to decide
    # where to write their index files -- confirm against the yaml/ directory.
    os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE'] = str(tmpdir)

    # The try/finally guarantees the variable is removed even when an
    # assertion fails, so a failure here cannot leak state into other tests.
    try:
        def input_function():
            doc1 = jina_pb2.DocumentProto()
            doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
            doc1.id = '1'

            doc2 = jina_pb2.DocumentProto()
            doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
            doc2.id = '2'

            doc3 = jina_pb2.DocumentProto()
            doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
            doc3.id = '3'

            return [doc1, doc2, doc3]

        # encoder2 explicitly branches off the segmenter, so the two
        # encoder/indexer pairs form parallel branches that are joined at the end.
        flow = (
            Flow(restful=restful)
            .add(name='segmenter', uses='!MockSegmenter')
            .add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml'))
            .add(
                name='indexer1',
                uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'),
                needs=['encoder1'],
            )
            .add(
                name='encoder2',
                uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'),
                needs=['segmenter'],
            )
            .add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml'))
            .join(['indexer1', 'indexer2'])
        )

        with flow:
            flow.index(inputs=input_function)

        # First branch: three 3-dimensional vectors, all zeros.
        with open(os.path.join(tmpdir, 'compound', 'vecidx1-0', 'vec1.gz'), 'rb') as fp:
            result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(
                result, np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
            )

        # Second branch: three 3-dimensional vectors, all ones.
        with open(os.path.join(tmpdir, 'compound', 'vecidx2-0', 'vec2.gz'), 'rb') as fp:
            result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(
                result, np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])
            )

        chunkIndexer1 = BinaryPbIndexer.load(
            os.path.join(tmpdir, 'compound', 'kvidx1-0', 'kvidx1.bin')
        )
        assert chunkIndexer1.size == 3
        d_id = list(chunkIndexer1.query_handler.header.keys())[0]

        # query() takes a list of keys; its first result is a serialized
        # document that is parsed back here.
        query_doc = jina_pb2.DocumentProto()
        query_doc.ParseFromString(chunkIndexer1.query([d_id])[0])
        assert query_doc.text == 'title: this is mode1 from doc1'
        assert query_doc.modality == 'mode1'

        chunkIndexer2 = BinaryPbIndexer.load(
            os.path.join(tmpdir, 'compound', 'kvidx2-0', 'kvidx2.bin')
        )
        assert chunkIndexer2.size == 3
        d_id = list(chunkIndexer2.query_handler.header.keys())[0]

        query_doc = jina_pb2.DocumentProto()
        query_doc.ParseFromString(chunkIndexer2.query([d_id])[0])
        assert query_doc.text == ' body: this is mode2 from doc1'
        assert query_doc.modality == 'mode2'
    finally:
        os.environ.pop('JINA_TEST_FLOW_MULTIMODE_WORKSPACE', None)