Beispiel #1
0
def test_flow_with_modalities(tmpdir):
    """Index three documents through a two-branch flow and check both branches.

    The output of the ``crafter`` pod (``!MockSegmenter``) is routed to two
    encoder/indexer pairs that are joined at the end.  After indexing, the
    files written under ``tmpdir`` are inspected: ``vec1.gz`` must contain
    three all-zero 3-d vectors, ``vec2.gz`` three all-one 3-d vectors, and
    each key-value index must hold three entries whose first entry carries
    the expected text and modality.

    :param tmpdir: pytest temporary directory, exported through the
        ``JINA_TEST_FLOW_MULTIMODE_WORKSPACE`` environment variable
        (presumably consumed by the YAML configs -- confirm there).
    """
    workspace_env = 'JINA_TEST_FLOW_MULTIMODE_WORKSPACE'

    def input_fn():
        doc1 = jina_pb2.Document()
        doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
        # The id is computed after the text has been set, as in the original order.
        doc1.id = uid.new_doc_id(doc1)

        doc2 = jina_pb2.Document()
        doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
        doc2.id = uid.new_doc_id(doc2)

        doc3 = jina_pb2.Document()
        doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
        doc3.id = uid.new_doc_id(doc3)

        return [doc1, doc2, doc3]

    os.environ[workspace_env] = str(tmpdir)
    try:
        flow = (
            Flow()
            .add(name='crafter', uses='!MockSegmenter')
            .add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml'))
            .add(name='indexer1', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'), needs=['encoder1'])
            .add(name='encoder2', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'), needs=['crafter'])
            .add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml'))
            .join(['indexer1', 'indexer2'])
        )

        with flow:
            flow.index(input_fn=input_fn, override_doc_id=False)

        # First branch: the stored vectors are expected to be all zeros.
        with open(tmpdir.join('vec1.gz'), 'rb') as fp:
            result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(
                result,
                np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]))

        # Second branch: the stored vectors are expected to be all ones.
        with open(tmpdir.join('vec2.gz'), 'rb') as fp:
            result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(
                result,
                np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]))

        chunkIndexer1 = BinaryPbIndexer.load(tmpdir.join('kvidx1.bin'))
        assert chunkIndexer1.size == 3
        d_id = list(chunkIndexer1.query_handler.header.keys())[0]

        query_doc = jina_pb2.Document()
        query_doc.ParseFromString(chunkIndexer1.query(d_id))
        assert query_doc.text == 'title: this is mode1 from doc1'
        assert query_doc.modality == 'mode1'

        chunkIndexer2 = BinaryPbIndexer.load(tmpdir.join('kvidx2.bin'))
        assert chunkIndexer2.size == 3
        d_id = list(chunkIndexer2.query_handler.header.keys())[0]

        query_doc = jina_pb2.Document()
        query_doc.ParseFromString(chunkIndexer2.query(d_id))
        assert query_doc.text == ' body: this is mode2 from doc1'
        assert query_doc.modality == 'mode2'
    finally:
        # Remove the variable even when an assertion above fails, so a
        # failing run cannot leak the workspace setting into other tests.
        os.environ.pop(workspace_env, None)
Beispiel #2
0
    def test_flow_with_modalitys(self):
        """Index three documents through a two-branch flow and verify its output.

        Checks the vectors written to ``vec1.gz`` / ``vec2.gz`` and the
        modality of every chunk stored in ``chunk1.gz`` / ``chunk2.gz``.
        """
        def input_fn():
            texts = (
                'title: this is mode1 from doc1, body: this is mode2 from doc1',
                'title: this is mode1 from doc2, body: this is mode2 from doc2',
                'title: this is mode1 from doc3, body: this is mode2 from doc3',
            )
            docs = []
            for doc_id, text in enumerate(texts, start=1):
                doc = Document()
                doc.id = doc_id
                doc.text = text
                docs.append(doc)
            return docs

        flow = (
            Flow()
            .add(name='crafter', uses='!MockSegmenter')
            .add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml'))
            .add(name='indexer1', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'), needs=['encoder1'])
            .add(name='encoder2', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'), needs=['crafter'])
            .add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml'))
            .join(['indexer1', 'indexer2'])
        )

        # Register every file the flow is expected to produce.
        for tmp_name in ('vec1.gz', 'vec2.gz',
                         'chunk1.gz', 'chunk2.gz',
                         'vecidx1.bin', 'vecidx2.bin',
                         'kvidx1.bin', 'kvidx2.bin'):
            self.add_tmpfile(tmp_name)

        with flow:
            flow.index(input_fn=input_fn)

        # vec1.gz is expected to hold all zeros, vec2.gz all ones.
        for vec_file, fill_value in (('vec1.gz', 0.0), ('vec2.gz', 1.0)):
            with gzip.open(vec_file, 'rb') as fp:
                vectors = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
                np.testing.assert_equal(vectors, np.full((3, 3), fill_value))

        mode1_indexer = BinaryPbIndexer(index_filename='chunk1.gz')
        self.assertEqual(len(mode1_indexer.query_handler.items()), 3)
        for _, stored_pb in mode1_indexer.query_handler.items():
            for chunk in stored_pb.chunks:
                self.assertEqual(chunk.modality, 'mode1')

        mode2_indexer = BinaryPbIndexer(index_filename='chunk2.gz')
        self.assertEqual(len(mode2_indexer.query_handler.items()), 3)
        for _, stored_pb in mode2_indexer.query_handler.items():
            for chunk in stored_pb.chunks:
                self.assertEqual(chunk.modality, 'mode2')
def test_binary_pb():
    """Add random documents to a ``BinaryPbIndexer``, reload it and query each one."""
    num_docs = 100
    docs = list(random_docs(num_docs, jitter=50))

    with BinaryPbIndexer('test-shelf') as indexer:
        indexer.add(docs)
        indexer.save()

    # Reload from the saved location and compare every entry by position.
    with BinaryPbIndexer.load(indexer.save_abspath) as indexer:
        assert indexer.size == num_docs
        for position in range(num_docs):
            expected_doc = docs[position]
            assert indexer.query(position) == expected_doc
def test_binarypb_add_and_update_not_working(test_metas, delete_on_dump):
    """Check that ``update`` fails right after ``add`` but works after a reload."""
    keys = ['11', '12', '13']
    values = [b'eleven', b'twelve', b'thirteen']

    with BinaryPbIndexer(
        metas=test_metas, delete_on_dump=delete_on_dump
    ) as indexer:
        indexer.add(keys, values)
        indexer.save()
        # FIXME: `add` and `update` cannot be used in the same context,
        # because `.save` calls `.flush` on a closed handler, while the
        # handler has to be closed before `.update` is allowed to query.
        with pytest.raises(AttributeError):
            indexer.update(['12'], [b'twelve-new'])
            indexer.save()
        assert indexer.size == 3
        save_abspath = indexer.save_abspath

    # In a freshly loaded indexer the update goes through.
    with BaseIndexer.load(save_abspath) as indexer:
        indexer.update(['12'], [b'twelve-new'])
        indexer.save()

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query(['11']) == [b'eleven']
        assert indexer.query(['12']) == [b'twelve-new']
        assert indexer.query(['12', '13']) == [b'twelve-new', b'thirteen']
        assert indexer.size == 3
        assert indexer.sample() in (b'eleven', b'twelve-new', b'thirteen')
def test_binarypb_update1(test_metas, delete_on_dump):
    """Track query results and index file size across successive updates.

    With ``delete_on_dump`` set, the index file is expected to keep its
    initial size after each update; otherwise it is expected to grow.
    """
    with BinaryPbIndexer(
        metas=test_metas, delete_on_dump=delete_on_dump
    ) as indexer:
        indexer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        indexer.save()
        assert indexer.size == 3

    initial_size = os.path.getsize(indexer.index_abspath)
    save_abspath = indexer.save_abspath

    # Two read-only sessions in a row must leave the file untouched.
    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query(['1']) == [b'oldvalue']

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query(['1']) == [b'oldvalue']

    size_after_reads = os.path.getsize(indexer.index_abspath)
    assert size_after_reads == initial_size

    with BaseIndexer.load(save_abspath) as indexer:
        # key '1' gets a new value, key '2' is rewritten with the same one
        indexer.update(['1', '2'], [b'newvalue', b'same'])
        indexer.save()

    size_after_update = os.path.getsize(indexer.index_abspath)
    if not delete_on_dump:
        assert size_after_update > initial_size
    else:
        assert size_after_update == initial_size
    assert indexer.size == 3

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query(['1']) == [b'newvalue']
        assert indexer.query(['2']) == [b'same']
        assert indexer.query(['3']) == [b'random']
        assert indexer.query(['99']) == [None]

    with BaseIndexer.load(save_abspath) as indexer:
        # the unknown key '99' is skipped, the known keys are still updated
        indexer.update(
            ['1', '2', '99'],
            [b'abcvalue', b'abcd', b'WILL_BE_IGNORED'],
        )
        indexer.save()
        assert indexer.size == 3

    size_after_partial_update = os.path.getsize(indexer.index_abspath)
    if not delete_on_dump:
        assert size_after_partial_update > initial_size
    else:
        assert size_after_partial_update == initial_size
    assert indexer.size == 3

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query(['1']) == [b'abcvalue']
        assert indexer.query(['2']) == [b'abcd']
        assert indexer.query(['3']) == [b'random']
        assert indexer.query(['99']) == [None]
        assert indexer.query(['1', '2']) == [b'abcvalue', b'abcd']
        assert indexer.query(['1', '2', '3']) == [b'abcvalue', b'abcd', b'random']
def test_kvindexer_iterate(test_metas):
    """Iterating a reloaded indexer yields the stored values in insertion order."""
    keys = ['1', '2', '3']
    values = [b'oldvalue', b'same', b'random']

    with BinaryPbIndexer(metas=test_metas) as indexer:
        indexer.add(keys, values)
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        iterated = list(indexer)
        assert iterated == [[b'oldvalue'], [b'same'], [b'random']]
Beispiel #7
0
def test_flow_with_modalities():
    """Index three documents through a two-branch flow and verify its output.

    Checks the vectors written to ``vec1.gz`` / ``vec2.gz`` and the modality
    of the first entry of each key-value index.
    """
    def input_fn():
        texts = (
            'title: this is mode1 from doc1, body: this is mode2 from doc1',
            'title: this is mode1 from doc2, body: this is mode2 from doc2',
            'title: this is mode1 from doc3, body: this is mode2 from doc3',
        )
        docs = []
        for doc_id, text in enumerate(texts, start=1):
            doc = Document()
            doc.id = doc_id
            doc.text = text
            docs.append(doc)
        return docs

    flow = (
        Flow()
        .add(name='crafter', uses='!MockSegmenter')
        .add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml'))
        .add(name='indexer1', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'), needs=['encoder1'])
        .add(name='encoder2', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'), needs=['crafter'])
        .add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml'))
        .join(['indexer1', 'indexer2'])
    )

    with flow:
        flow.index(input_fn=input_fn)

    # vec1.gz is expected to hold all zeros, vec2.gz all ones.
    for vec_file, fill_value in (('vec1.gz', 0.0), ('vec2.gz', 1.0)):
        with open(vec_file, 'rb') as fp:
            vectors = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(vectors, np.full((3, 3), fill_value))

    mode1_indexer = BinaryPbIndexer.load('kvidx1.bin')
    assert mode1_indexer.size == 6
    stored_keys = list(mode1_indexer.query_handler.header.keys())
    first_key = stored_keys[0]
    assert mode1_indexer.query(first_key).modality == 'mode1'

    mode2_indexer = BinaryPbIndexer.load('kvidx2.bin')
    assert mode2_indexer.size == 6
    stored_keys = list(mode2_indexer.query_handler.header.keys())
    first_key = stored_keys[0]
    assert mode2_indexer.query(first_key).modality == 'mode2'
Beispiel #8
0
def test_binarypb_update_twice(test_metas):
    """Two consecutive updates in one session are both persisted."""
    keys = ['1', '2', '3']
    values = [b'oldvalue', b'same', b'random']

    with BinaryPbIndexer(metas=test_metas) as indexer:
        indexer.add(keys, values)
        indexer.save()
        assert indexer.size == 3
        save_abspath = indexer.save_abspath

    # Both updates happen before a single save.
    with BaseIndexer.load(save_abspath) as indexer:
        indexer.update(['1'], [b'newvalue'])
        indexer.update(['2'], [b'othernewvalue'])
        indexer.save()

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query('1') == b'newvalue'
        assert indexer.query('2') == b'othernewvalue'
Beispiel #9
0
def test_binarypb_delete(test_metas):
    """Deleted string keys stop being queryable; the remaining key is kept."""
    keys = ['1', '2', '3']
    values = [b'oldvalue', b'same', b'random']

    with BinaryPbIndexer(metas=test_metas) as indexer:
        indexer.add(keys, values)
        indexer.save()
        assert indexer.size == 3
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query('1') == b'oldvalue'

    # The keys to delete are handed over as an iterator, not as a list.
    with BaseIndexer.load(save_abspath) as indexer:
        doomed_keys = iter(['1', '2'])
        indexer.delete(doomed_keys)
        indexer.save()
        assert indexer.size == 1

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query('1') is None
        assert indexer.query('2') is None
        assert indexer.query('3') == b'random'
Beispiel #10
0
def test_binarypb_delete(test_metas):
    """Deleted integer keys stop being queryable; the remaining key is kept.

    :param test_metas: metas passed to ``BinaryPbIndexer`` (fixture defined
        outside this block).
    """
    with BinaryPbIndexer(metas=test_metas) as idxer:
        idxer.add([1, 2, 3], [b'oldvalue', b'same', b'random'])
        idxer.save()
        assert idxer.size == 3
        save_abspath = idxer.save_abspath

    with BaseIndexer.load(save_abspath) as idxer:
        assert idxer.query(1) == b'oldvalue'

    with BaseIndexer.load(save_abspath) as idxer:
        # The keys to delete are handed over as an iterator, not as a list.
        idxer.delete(iter([1, 2]))
        idxer.save()
        assert idxer.size == 1

    with BaseIndexer.load(save_abspath) as idxer:
        # Identity comparison is the correct check for None; `== None`
        # could be satisfied by any object with a permissive `__eq__`.
        assert idxer.query(1) is None
        assert idxer.query(2) is None
        assert idxer.query(3) == b'random'
Beispiel #11
0
def test_binarypb_update1(test_metas):
    """Track query results and index body size across successive updates."""
    with BinaryPbIndexer(metas=test_metas) as indexer:
        indexer.add(['1', '2', '3'], [b'oldvalue', b'same', b'random'])
        indexer.save()
        assert indexer.size == 3
        body_fd = indexer.write_handler.body.fileno()
        initial_size = os.fstat(body_fd).st_size
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query('1') == b'oldvalue'

    # Read-only sessions must leave the body size unchanged.
    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query('1') == b'oldvalue'
        body_fd = indexer.query_handler._body.fileno()
        size_after_reads = os.fstat(body_fd).st_size
        assert size_after_reads == initial_size

    with BaseIndexer.load(save_abspath) as indexer:
        # key '1' gets a new value, key '2' is rewritten with the same one
        indexer.update(['1', '2'], [b'newvalue', b'same'])
        indexer.save()
        body_fd = indexer.write_handler.body.fileno()
        size_after_update = os.fstat(body_fd).st_size
        assert size_after_update > initial_size
        assert indexer.size == 3

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query('1') == b'newvalue'
        assert indexer.query('2') == b'same'
        assert indexer.query('3') == b'random'
        assert indexer.query('99') is None

    with BaseIndexer.load(save_abspath) as indexer:
        # the unknown key '99' is skipped, the known keys are still updated
        new_keys = ['1', '2', '99']
        new_values = [b'newvalue2', b'newvalue3', b'decoy']
        indexer.update(new_keys, new_values)
        indexer.save()
        assert indexer.size == 3

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query('1') == b'newvalue2'
        assert indexer.query('2') == b'newvalue3'
        assert indexer.query('3') == b'random'
        assert indexer.query('99') is None
Beispiel #12
0
def test_binarypb_add_and_update_not_working(test_metas):
    """Check that ``update`` fails right after ``add`` but works after a reload."""
    keys = [11, 12]
    values = [b'eleven', b'twelve']

    with BinaryPbIndexer(metas=test_metas) as indexer:
        indexer.add(keys, values)
        indexer.save()
        # FIXME: `add` and `update` cannot be used in the same context,
        # because `.save` calls `.flush` on a closed handler, while the
        # handler has to be closed before `.update` is allowed to query.
        with pytest.raises(AttributeError):
            indexer.update([12], [b'twelve-new'])
            indexer.save()
        assert indexer.size == 2
        save_abspath = indexer.save_abspath

    # In a freshly loaded indexer the update goes through.
    with BaseIndexer.load(save_abspath) as indexer:
        indexer.update([12], [b'twelve-new'])
        indexer.save()

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query(11) == b'eleven'
        assert indexer.query(12) == b'twelve-new'
        assert indexer.size == 2
Beispiel #13
0
def test_binarypb_benchmark(test_metas, delete_on_dump):
    """Report how long saving takes after updating part of a large index.

    Adds ``entries`` key/value pairs, reloads the index, updates the first
    ``nr_to_update`` keys and prints the time elapsed from just before
    ``save()`` until the indexer context has exited.

    :param test_metas: metas passed to ``BinaryPbIndexer`` (fixture defined
        outside this block).
    :param delete_on_dump: forwarded to ``BinaryPbIndexer`` and echoed in the
        printed report.
    """
    entries = 100000
    nr_to_update = 10000
    keys = np.arange(entries)
    values = np.random.randint(0, 10, size=entries).astype(bytes)

    with BinaryPbIndexer(metas=test_metas, delete_on_dump=delete_on_dump) as idxer:
        idxer.add(keys, values)
        idxer.save()
        assert idxer.size == entries
        save_abspath = idxer.save_abspath

    new_values = np.random.randint(0, 10, size=nr_to_update).astype(bytes)

    with BaseIndexer.load(save_abspath) as idxer:
        idxer.update(keys[:nr_to_update], new_values)
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        started_at = time.perf_counter()
        idxer.save()

    # Stop the clock only after the context exit, as the interval is meant
    # to include whatever work happens when the indexer is closed.
    elapsed = time.perf_counter() - started_at
    print(
        f'delete_on_dump = {delete_on_dump}, entries={entries}. took {elapsed} seconds'
    )
Beispiel #14
0
def test_binarypb_update1(test_metas):
    """Check that an update with a missing key is rejected as a whole.

    A valid update afterwards must change the stored value and enlarge the
    index body.
    """
    with BinaryPbIndexer(metas=test_metas) as indexer:
        indexer.add([1, 2, 3], [b'oldvalue', b'same', b'random'])
        indexer.save()
        assert indexer.size == 3
        body_fd = indexer.write_handler.body.fileno()
        initial_size = os.fstat(body_fd).st_size
        save_abspath = indexer.save_abspath

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query(1) == b'oldvalue'

    with BaseIndexer.load(save_abspath) as indexer:
        # A missing key (99) must abort the whole update: nothing is
        # written at all, the operation is atomic at the indexer level.
        with pytest.raises(KeyError):
            indexer.update([1, 2, 99], [b'newvalue', b'same', b'decoy'])

        indexer.save()

    # The rejected update must not have touched the value or the size.
    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query(1) == b'oldvalue'
        body_fd = indexer.query_handler._body.fileno()
        size_after_rejected_update = os.fstat(body_fd).st_size
        assert size_after_rejected_update == initial_size

    with BaseIndexer.load(save_abspath) as indexer:
        # key 1 gets a new value, key 2 is rewritten with the same one
        indexer.update([1, 2], [b'newvalue', b'same'])
        indexer.save()
        body_fd = indexer.write_handler.body.fileno()
        size_after_update = os.fstat(body_fd).st_size
        assert size_after_update > initial_size
        assert indexer.size == 3

    with BaseIndexer.load(save_abspath) as indexer:
        assert indexer.query(1) == b'newvalue'
        assert indexer.query(2) == b'same'
        assert indexer.query(3) == b'random'
        assert indexer.query(99) is None
Beispiel #15
0
def test_flow_with_modalities(tmpdir, restful):
    """Index three documents through a two-branch flow and check both branches.

    The output of the ``segmenter`` pod (``!MockSegmenter``) is routed to two
    encoder/indexer pairs that are joined at the end.  After indexing, the
    files written under ``tmpdir/compound`` are inspected: ``vec1.gz`` must
    contain three all-zero 3-d vectors, ``vec2.gz`` three all-one 3-d
    vectors, and each key-value index must hold three entries whose first
    entry carries the expected text and modality.

    :param tmpdir: pytest temporary directory, exported through the
        ``JINA_TEST_FLOW_MULTIMODE_WORKSPACE`` environment variable
        (presumably consumed by the YAML configs -- confirm there).
    :param restful: forwarded to ``Flow(restful=...)``.
    """
    workspace_env = 'JINA_TEST_FLOW_MULTIMODE_WORKSPACE'

    def input_function():
        doc1 = jina_pb2.DocumentProto()
        doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
        doc1.id = '1'

        doc2 = jina_pb2.DocumentProto()
        doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
        doc2.id = '2'

        doc3 = jina_pb2.DocumentProto()
        doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
        doc3.id = '3'

        return [doc1, doc2, doc3]

    os.environ[workspace_env] = str(tmpdir)
    try:
        flow = (
            Flow(restful=restful)
            .add(name='segmenter', uses='!MockSegmenter')
            .add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml'))
            .add(
                name='indexer1',
                uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'),
                needs=['encoder1'],
            )
            .add(
                name='encoder2',
                uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'),
                needs=['segmenter'],
            )
            .add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml'))
            .join(['indexer1', 'indexer2'])
        )

        with flow:
            flow.index(inputs=input_function)

        # First branch: the stored vectors are expected to be all zeros.
        with open(os.path.join(tmpdir, 'compound', 'vecidx1-0', 'vec1.gz'), 'rb') as fp:
            result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(
                result, np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
            )

        # Second branch: the stored vectors are expected to be all ones.
        with open(os.path.join(tmpdir, 'compound', 'vecidx2-0', 'vec2.gz'), 'rb') as fp:
            result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
            np.testing.assert_equal(
                result, np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])
            )

        chunkIndexer1 = BinaryPbIndexer.load(
            os.path.join(tmpdir, 'compound', 'kvidx1-0', 'kvidx1.bin')
        )
        assert chunkIndexer1.size == 3
        d_id = list(chunkIndexer1.query_handler.header.keys())[0]

        # query() takes a list of keys here; the first result is parsed.
        query_doc = jina_pb2.DocumentProto()
        query_doc.ParseFromString(chunkIndexer1.query([d_id])[0])
        assert query_doc.text == 'title: this is mode1 from doc1'
        assert query_doc.modality == 'mode1'

        chunkIndexer2 = BinaryPbIndexer.load(
            os.path.join(tmpdir, 'compound', 'kvidx2-0', 'kvidx2.bin')
        )
        assert chunkIndexer2.size == 3
        d_id = list(chunkIndexer2.query_handler.header.keys())[0]

        query_doc = jina_pb2.DocumentProto()
        query_doc.ParseFromString(chunkIndexer2.query([d_id])[0])
        assert query_doc.text == ' body: this is mode2 from doc1'
        assert query_doc.modality == 'mode2'
    finally:
        # Remove the variable even when an assertion above fails, so a
        # failing run cannot leak the workspace setting into other tests.
        os.environ.pop(workspace_env, None)