Ejemplo n.º 1
0
def test_segment_driver():
    """Apply ``SimpleSegmentDriver`` to one document and check its chunks.

    The driver is created with ``first_chunk_id=3`` and attached to a
    ``MockSegmenter``; the first three chunks of the document are then
    checked field by field.
    """
    source_doc = jina_pb2.Document()
    source_doc.id = 1
    source_doc.text = 'valid'
    source_doc.length = 2
    source_doc.mime_type = 'image/png'

    segment_driver = SimpleSegmentDriver(first_chunk_id=3)
    segment_driver.attach(executor=MockSegmenter(), pea=None)
    segment_driver._apply(source_doc)

    # the length set on the document before segmenting must still be there
    assert source_doc.length == 2

    # chunk at position i: id 3 + i, weight i, blob of three copies of float(i)
    expected_mime_types = ('text/plain', 'image/png', 'image/png')
    for position, expected_mime_type in enumerate(expected_mime_types):
        chunk = source_doc.chunks[position]
        assert chunk.id == 3 + position
        assert chunk.parent_id == source_doc.id
        assert chunk.blob == array2pb(np.array([float(position)] * 3))
        assert chunk.weight == position
        assert chunk.length == 3
        assert chunk.mime_type == expected_mime_type
Ejemplo n.º 2
0
 def index_documents():
     """Build the three Documents to index.

     For ``i`` in 0, 1, 2 the i-th Document carries:
         tags['id'] = str(i)
         tags['dummy_score'] = -i
         embedding = [i]

     :return: list with the three Documents, in order of ``i``
     """
     indexed = []
     for position in range(3):
         doc = jina_pb2.Document()
         doc.tags['id'] = str(position)
         doc.tags['dummy_score'] = -position
         doc.embedding.CopyFrom(array2pb(np.array([position])))
         indexed.append(doc)
     return indexed
Ejemplo n.º 3
0
    def test_segment_driver(self):
        """Segment the first fixture document and expect the second to fail.

        The first document from ``create_documents_to_segment`` must end up
        with the expected chunks; applying the driver to the second one must
        raise ``AttributeError`` with the exact message checked below.
        """
        docs = create_documents_to_segment()
        segment_driver = SimpleSegmentDriver(first_chunk_id=3)
        segment_driver.attach(executor=MockSegmenter(), pea=None)
        segmented = docs[0]
        segment_driver._apply(segmented)

        assert segmented.length == 2
        assert docs[1].length == 2

        # chunk at position i: id 3 + i, weight i, blob of three copies of float(i)
        for position in range(3):
            chunk = segmented.chunks[position]
            assert chunk.id == 3 + position
            assert chunk.parent_id == segmented.id
            assert chunk.blob == array2pb(np.array([float(position)] * 3))
            assert chunk.weight == position
            assert chunk.length == 3

        with self.assertRaises(AttributeError) as error:
            segment_driver._apply(docs[1])
        expected_message = '\'Document\' object has no attribute \'non_existing_key\''
        assert str(error.exception) == expected_message
Ejemplo n.º 4
0
def multimodal_all_types_documents():
    """Build ``NUM_DOCS`` Documents, each with one chunk per content type.

    Layout of the Document created for index ``idx``::

        doc - text str(idx)
            |
            | - chunk - embedding [idx, idx]     - modality1
            | - chunk - blob [idx, idx, idx]     - modality2
            | - chunk - text 'modality3'         - modality3
            | - chunk - buffer b'modality4'      - modality4

    Note kept from the original author (not verifiable from this function):
    the multimodal encoder is expected to encode the text chunk into [3, 3]
    and the buffer chunk into [4, 4], giving the doc embedding
    [idx, idx, idx, idx, idx, 3, 3, 4, 4].

    :return: list of the created Documents
    """
    documents = []
    for idx in range(NUM_DOCS):
        doc = jina_pb2.Document()
        doc.text = f'{idx}'

        embedding_chunk = doc.chunks.add()
        embedding_chunk.modality = 'modality1'
        embedding_chunk.embedding.CopyFrom(array2pb(np.array([idx, idx])))

        blob_chunk = doc.chunks.add()
        blob_chunk.modality = 'modality2'
        blob_chunk.blob.CopyFrom(array2pb(np.array([idx, idx, idx])))

        text_chunk = doc.chunks.add()
        text_chunk.modality = 'modality3'
        text_chunk.text = 'modality3'

        buffer_chunk = doc.chunks.add()
        buffer_chunk.modality = 'modality4'
        buffer_chunk.buffer = 'modality4'.encode()

        documents.append(doc)
    return documents
def ground_truth_pairs():
    """Create ten ``DocGroundtruthPair`` objects with fixed embeddings.

    In every pair the doc has embedding [1, 1] and the groundtruth has
    embedding [2, 2].

    :return: list of the ten pairs
    """

    def _document_with_embedding(values):
        # fresh Document whose embedding holds ``values``
        document = jina_pb2.Document()
        document.embedding.CopyFrom(array2pb(np.array(values)))
        return document

    return [
        DocGroundtruthPair(doc=_document_with_embedding([1, 1]),
                           groundtruth=_document_with_embedding([2, 2]))
        for _ in range(10)
    ]
Ejemplo n.º 6
0
def input_fn():
    """Yield two Documents, each with one chunk holding a random blob.

    The first Document is 'text/plain' with a 7-value blob, the second is
    'image/png' with a 5-value blob. Documents are built lazily, one per
    iteration step.
    """
    for mime_type, blob_size in (('text/plain', 7), ('image/png', 5)):
        document = Document()
        document.mime_type = mime_type
        chunk = document.chunks.add()
        chunk.blob.CopyFrom(array2pb(np.random.random(blob_size)))
        yield document
Ejemplo n.º 7
0
def input_fn():
    """Return two Documents, each with an embedding and one embedded chunk.

    The arrays ``e1`` .. ``e4`` are not defined here; they are read from the
    surrounding scope. Chunk ids are produced by ``uid.new_doc_id``.

    :return: list ``[first_doc, second_doc]``
    """
    first_doc = Document()
    first_doc.embedding.CopyFrom(array2pb(e1))
    first_chunk = first_doc.chunks.add()
    first_chunk.embedding.CopyFrom(array2pb(e2))
    # this chunk gets its id after its embedding has been set
    first_chunk.id = uid.new_doc_id(first_chunk)

    second_doc = Document()
    second_doc.embedding.CopyFrom(array2pb(e3))
    second_chunk = second_doc.chunks.add()
    # NOTE(review): unlike the first chunk, the id is generated before the
    # embedding is set - confirm whether new_doc_id depends on chunk content
    second_chunk.id = uid.new_doc_id(second_chunk)
    second_chunk.embedding.CopyFrom(array2pb(e4))

    return [first_doc, second_doc]
Ejemplo n.º 8
0
 def input_doc():
     """Return a Document with three matches of different content types.

     Match 10 has text and an embedding, match 20 has a blob, match 30 has a
     buffer. ``text``, ``random_np_array`` and ``buffer`` are read from the
     surrounding scope.
     """
     query = jina_pb2.Document()

     text_match = query.matches.add()
     text_match.id = 10
     text_match.text = text
     text_match.embedding.CopyFrom(array2pb(random_np_array))

     blob_match = query.matches.add()
     blob_match.id = 20
     blob_match.blob.CopyFrom(array2pb(random_np_array))

     buffer_match = query.matches.add()
     buffer_match.id = 30
     buffer_match.buffer = buffer

     return query
def eval_request():
    """Build an ``IndexRequest`` with ten doc/groundtruth pairs.

    Every doc and every groundtruth gets a single chunk with granularity 1;
    doc chunks carry embedding [1, 1], groundtruth chunks carry [2, 2].

    :return: the populated request
    """
    request = jina_pb2.Request.IndexRequest()
    for _ in range(10):
        doc_chunk = request.docs.add().chunks.add()
        groundtruth_chunk = request.groundtruths.add().chunks.add()
        doc_chunk.granularity = 1
        groundtruth_chunk.granularity = 1
        doc_chunk.embedding.CopyFrom(array2pb(np.array([1, 1])))
        groundtruth_chunk.embedding.CopyFrom(array2pb(np.array([2, 2])))
    return request
Ejemplo n.º 10
0
def input_fn():
    """Return two Documents with fixed ids, each holding one chunk.

    Document 1 (embedding ``e1``) has chunk 3 (embedding ``e2``); Document 2
    (embedding ``e3``) has chunk 4 (embedding ``e4``). The arrays are read
    from the surrounding scope.

    :return: list with the two Documents
    """
    # (doc id, doc embedding, chunk id, chunk embedding)
    specs = ((1, e1, 3, e2), (2, e3, 4, e4))
    documents = []
    for doc_id, doc_embedding, chunk_id, chunk_embedding in specs:
        document = Document()
        document.id = doc_id
        document.embedding.CopyFrom(array2pb(doc_embedding))
        chunk = document.chunks.add()
        chunk.id = chunk_id
        chunk.embedding.CopyFrom(array2pb(chunk_embedding))
        documents.append(document)
    return documents
Ejemplo n.º 11
0
 def input_doc():
     """Return Document 1 with three chunks of different content types.

     Chunk 10 has text and an embedding, chunk 20 has a blob, chunk 30 has a
     buffer. ``text``, ``random_np_array`` and ``buffer`` are read from the
     surrounding scope.
     """
     root = jina_pb2.Document()
     root.id = 1

     text_chunk = root.chunks.add()
     text_chunk.id = 10
     text_chunk.text = text
     text_chunk.embedding.CopyFrom(array2pb(random_np_array))

     blob_chunk = root.chunks.add()
     blob_chunk.id = 20
     blob_chunk.blob.CopyFrom(array2pb(random_np_array))

     buffer_chunk = root.chunks.add()
     buffer_chunk.id = 30
     buffer_chunk.buffer = buffer

     return root
Ejemplo n.º 12
0
def input_fn():
    """Return two Documents with random embeddings of differing sizes.

    Document 1 has a 7-value embedding and chunk 3 with 5 values; Document 2
    has a 3-value embedding and chunk 4 with 9 values.

    :return: list with the two Documents
    """
    # (doc id, doc embedding size, chunk id, chunk embedding size)
    specs = ((1, 7, 3, 5), (2, 3, 4, 9))
    documents = []
    for doc_id, doc_dim, chunk_id, chunk_dim in specs:
        document = Document()
        document.id = doc_id
        document.embedding.CopyFrom(array2pb(np.random.random([doc_dim])))
        chunk = document.chunks.add()
        chunk.id = chunk_id
        chunk.embedding.CopyFrom(array2pb(np.random.random([chunk_dim])))
        documents.append(document)
    return documents
Ejemplo n.º 13
0
 def __init__(self, *args, **kwargs):
     """Initialise the parent class and fill ``self.db`` with four Documents.

     The Documents have string ids '1' .. '4' and a one-element embedding
     holding the integer value of that id; ``self.db`` maps the integer
     keys 1 .. 4 to them.
     """
     super().__init__(*args, **kwargs)
     stored = {}
     for key in (1, 2, 3, 4):
         doc = jina_pb2.Document()
         doc.id = str(key)
         doc.embedding.CopyFrom(array2pb(np.array([int(doc.id)])))
         stored[key] = doc
     self.db = stored
Ejemplo n.º 14
0
def doc_with_multimodal_chunks_wrong(embeddings):
    """Build a Document with three chunks, two of which share a modality.

    The chunks get modalities 'visual', 'visual' and 'textual', ids from
    ``uid.new_doc_id`` and the embeddings ``embeddings[0]`` .. ``embeddings[2]``.

    :param embeddings: indexable holding at least three arrays
    :return: the populated Document
    """
    doc = jina_pb2.Document()
    chunks = [doc.chunks.add() for _ in range(3)]

    for chunk, modality in zip(chunks, ('visual', 'visual', 'textual')):
        chunk.modality = modality

    # ids are generated once modalities are set and before embeddings are added
    for chunk in chunks:
        chunk.id = uid.new_doc_id(chunk)

    for position, chunk in enumerate(chunks):
        chunk.embedding.CopyFrom(array2pb(embeddings[position]))

    return doc
Ejemplo n.º 15
0
 def __init__(self, *args, **kwargs):
     """Initialise the parent class and fill ``self.db`` with four Documents.

     The Documents have integer ids 1 .. 4 and a one-element embedding
     holding that id; ``self.db`` maps each id to its Document.
     """
     super().__init__(*args, **kwargs)
     stored = {}
     for doc_id in (1, 2, 3, 4):
         doc = jina_pb2.Document()
         doc.id = doc_id
         doc.embedding.CopyFrom(array2pb(np.array([doc.id])))
         stored[doc_id] = doc
     self.db = stored
Ejemplo n.º 16
0
def random_docs(num_docs, chunks_per_doc=5, embed_dim=10):
    """Lazily generate Documents with random embeddings and chunks.

    Document ids run from 0 to ``num_docs - 1``. Chunk ids are numbered
    consecutively across all Documents, starting at 0, and each chunk
    records the id of its Document as ``parent_id``.

    :param num_docs: number of Documents to yield
    :param chunks_per_doc: number of chunks added to every Document
    :param embed_dim: length of each random embedding
    """
    next_chunk_id = 0
    for doc_id in range(num_docs):
        document = jina_pb2.Document()
        document.id = doc_id
        # NOTE(review): a bytes literal is assigned to the text field - confirm intended
        document.text = b'hello world'
        document.embedding.CopyFrom(array2pb(np.random.random([embed_dim])))
        for _ in range(chunks_per_doc):
            chunk = document.chunks.add()
            chunk.text = 'i\'m chunk %d from doc %d' % (next_chunk_id, doc_id)
            chunk.embedding.CopyFrom(
                array2pb(np.random.random([embed_dim])))
            chunk.id = next_chunk_id
            chunk.parent_id = doc_id
            next_chunk_id += 1
        yield document
Ejemplo n.º 17
0
def index_generator(num_doc, target):
    """Yield the first ``num_doc`` index entries of ``target`` as Documents.

    Entry ``i`` of ``target['index']['data']`` becomes the blob, and the
    first value of row ``i`` in ``target['index-labels']['data']`` is mapped
    through ``get_mapped_label`` into the ``label`` tag.

    :param num_doc: number of Documents to yield
    :param target: mapping with ``'index'`` and ``'index-labels'`` entries,
        each holding a ``'data'`` sequence
    """
    for position in range(num_doc):
        label_int = target['index-labels']['data'][position][0]
        document = jina_pb2.Document()
        blob = array2pb(target['index']['data'][position])
        document.blob.CopyFrom(blob)
        document.tags.update({'label': get_mapped_label(label_int)})
        yield document
Ejemplo n.º 18
0
def test_queryset_with_struct(random_workspace):
    """Index labelled docs through a ``FilterQL`` flow, with and without a queryset.

    Four docs are created, alternating between 'label1' and 'label2'. The
    flow-level filter accepts both labels, so the first request must return
    all docs; the second request passes a queryset restricted to 'label2'
    and must return half of them.
    """
    total_docs = 4

    def _labelled_doc(doc_id):
        # even ids get 'label1', odd ids get 'label2'
        doc = jina_pb2.Document()
        doc.text = f'I am doc{doc_id}'
        doc.embedding.CopyFrom(array2pb(np.array([doc_id])))
        doc.tags['label'] = f'label{doc_id%2 + 1}'
        return doc

    docs = [_labelled_doc(doc_id) for doc_id in range(total_docs)]

    flow = Flow().add(
        uses='- !FilterQL | {lookups: {tags__label__in: [label1, label2]}, traversal_paths: [r]}'
    )

    def validate_all_docs(resp):
        assert len(resp.docs) == total_docs

    def validate_label2_docs(resp):
        assert len(resp.docs) == total_docs / 2

    with flow:
        # no queryset: the flow-level filter lets every doc through
        flow.index(docs, output_fn=validate_all_docs, callback_on_body=True)

        # request-level queryset that only keeps the docs tagged label2
        label2_only = jina_pb2.QueryLang(name='FilterQL', priority=1)
        label2_only.parameters['lookups'] = {'tags__label': 'label2'}
        label2_only.parameters['traversal_paths'] = ['r']
        flow.index(docs,
                   queryset=label2_only,
                   output_fn=validate_label2_docs,
                   callback_on_body=True)
Ejemplo n.º 19
0
def random_docs(num_docs):
    """Lazily generate ``num_docs`` Documents with random 2-value embeddings.

    All embeddings are drawn in one call when iteration starts; Document
    ``j`` gets id ``j`` and row ``j`` of that array.

    :param num_docs: number of Documents to yield
    """
    embeddings = np.random.random([num_docs, 2])
    for doc_id, embedding in enumerate(embeddings):
        document = jina_pb2.Document()
        document.id = doc_id
        document.embedding.CopyFrom(array2pb(embedding))
        yield document
Ejemplo n.º 20
0
def create_documents_to_encode(num_docs):
    """Create ``num_docs`` Documents whose blob is the one-element array ``[idx]``.

    :param num_docs: number of Documents to create
    :return: list of Documents, ordered by ``idx``
    """

    def _blob_document(value):
        # fresh Document whose blob holds the single given value
        document = jina_pb2.Document()
        document.blob.CopyFrom(array2pb(np.array([value])))
        return document

    return [_blob_document(idx) for idx in range(num_docs)]
Ejemplo n.º 21
0
 def docs():
     """Return three embedded Documents with text, blob and buffer content.

     The text Document has one text chunk, the blob Document has no chunks,
     and the buffer Document has a text chunk and a buffer chunk. ``text``,
     ``random_np_array`` and ``buffer`` are read from the surrounding scope.
     """
     text_doc = jina_pb2.Document()
     text_doc.text = text
     text_doc.embedding.CopyFrom(array2pb(random_np_array))
     text_doc_chunk = text_doc.chunks.add()
     text_doc_chunk.text = text

     blob_doc = jina_pb2.Document()
     blob_doc.blob.CopyFrom(array2pb(random_np_array))
     blob_doc.embedding.CopyFrom(array2pb(random_np_array))

     buffer_doc = jina_pb2.Document()
     buffer_doc.buffer = buffer
     buffer_doc.embedding.CopyFrom(array2pb(random_np_array))
     buffer_doc_text_chunk = buffer_doc.chunks.add()
     buffer_doc_text_chunk.text = text
     buffer_doc_buffer_chunk = buffer_doc.chunks.add()
     buffer_doc_buffer_chunk.buffer = buffer

     return [text_doc, blob_doc, buffer_doc]
Ejemplo n.º 22
0
def test_array2pb():
    """Check that ``pb2array(array2pb(e4))`` reproduces ``e4``.

    ``JINA_ARRAY_QUANT`` is removed from the environment first if present.
    """
    # Original author's note: it is unclear why this variable is set at all,
    # since an os env value should only be visible to the process context
    # that set it.
    quant = os.environ.pop('JINA_ARRAY_QUANT', None)
    if quant is not None:
        print(f'quant is on: {quant}')

    round_tripped = pb2array(array2pb(e4))
    np.testing.assert_almost_equal(round_tripped, e4)
Ejemplo n.º 23
0
def query_generator(num_doc, target):
    """Yield ``num_doc`` randomly sampled query Documents built from ``target``.

    For every Document a random index ``n`` is drawn; entry ``n`` of
    ``target['query']['data']`` becomes the blob, and the first value of row
    ``n`` in ``target['query-labels']['data']`` is mapped through
    ``get_mapped_label`` into the ``label`` tag.

    :param num_doc: number of Documents to yield
    :param target: mapping with ``'query'`` and ``'query-labels'`` entries,
        each holding a ``'data'`` sequence
    """
    for _ in range(num_doc):
        # randrange excludes its upper bound, so the index is always valid;
        # the former randint(0, 10000) could return 10000, one past the last
        # of the 10000 query examples.
        n = random.randrange(len(target['query']['data']))
        d = jina_pb2.Document()
        # read the labels from the ``target`` argument (previously the
        # differently named ``targets`` was referenced here)
        label_int = target['query-labels']['data'][n][0]
        d.blob.CopyFrom(array2pb(target['query']['data'][n]))
        d.tags.update({'label': get_mapped_label(label_int)})
        yield d
Ejemplo n.º 24
0
def random_docs(num_docs, embed_dim=10, jitter=1):
    """Lazily generate Documents whose embedding length may vary.

    Each Document gets ``tags['id']`` set to its index and a random
    embedding of length ``embed_dim + np.random.randint(0, jitter)``.

    :param num_docs: number of Documents to yield
    :param embed_dim: base length of the embedding
    :param jitter: exclusive upper bound of the random extra length
    """
    for doc_index in range(num_docs):
        document = jina_pb2.Document()
        document.tags['id'] = doc_index
        # NOTE(review): a bytes literal is assigned to the text field - confirm intended
        document.text = b'hello'
        dimension = embed_dim + np.random.randint(0, jitter)
        embedding = np.random.random([dimension])
        document.embedding.CopyFrom(array2pb(embedding))
        yield document
Ejemplo n.º 25
0
 def create(self):
     """Return a groundtruth Document filled according to ``field_type``.

     ``field_type`` is not a parameter; it is read from the surrounding
     scope. 'text', 'buffer' and 'blob' each fill the matching field; any
     other value returns the Document without content.
     """
     groundtruth = jina_pb2.Document()
     if field_type == 'text':
         groundtruth.text = 'aaaa'
         return groundtruth
     if field_type == 'buffer':
         groundtruth.buffer = b'\x01\x02\x03\04'
         return groundtruth
     if field_type == 'blob':
         groundtruth.blob.CopyFrom(array2pb(np.array([1, 1, 1, 1])))
     return groundtruth
Ejemplo n.º 26
0
    def doc_groundtruth_evaluation_pairs():
        """Return two ``(doc, groundtruth)`` tuples for evaluation.

        Each doc has a one-element embedding; each groundtruth has two
        matches identified by ``tags['id']``. The expected metric values in
        the comments below are the original author's notes and are not
        verifiable from this function alone.
        """

        def _query_doc(value):
            # Document whose embedding is the one-element array [value]
            doc = jina_pb2.Document()
            doc.embedding.CopyFrom(array2pb(np.array([value])))
            return doc

        def _groundtruth(match_ids):
            # Document with one match per id, the id stored in tags['id']
            groundtruth = jina_pb2.Document()
            for match_id in match_ids:
                groundtruth.matches.add().tags['id'] = match_id
            return groundtruth

        # Embedding [0]: it will match 0 and 1.
        # top_k is set to 2 for VectorSearchDriver.
        # Both as matches and as ranked: expects [0, 2] but given [0, 1]
        # Precision@1 = 100%
        # Precision@2 = 50%
        # Recall@1 = 100%
        # Recall@2 = 50%
        first_pair = (_query_doc(0), _groundtruth(('0', '2')))

        # Embedding [2]: it will match 2 and 1.
        # Both as matches and as ranked: expects [1, 2] but given [2, 1]
        # Precision@1 = 100%
        # Precision@2 = 100%
        # Recall@1 = 100%
        # Recall@2 = 100%
        second_pair = (_query_doc(2), _groundtruth(('1', '2')))

        return [first_pair, second_pair]
Ejemplo n.º 27
0
 def test_craft_driver(self):
     """Craft the first fixture document and expect the second to fail.

     After applying ``SimpleCraftDriver`` with a ``MockCrafter``, the first
     document must have the expected blob and weight; applying the driver to
     the second must raise ``AttributeError`` with the exact message below.
     """
     docs = create_documents_to_craft()
     craft_driver = SimpleCraftDriver()
     craft_driver.attach(executor=MockCrafter(), pea=None)
     crafted = docs[0]
     craft_driver._apply(crafted)

     assert crafted.blob == array2pb(np.array([0.0, 0.0, 0.0]))
     assert crafted.weight == 10

     with self.assertRaises(AttributeError) as error:
         craft_driver._apply(docs[1])
     expected_message = '\'Document\' object has no attribute \'non_existing_key\''
     assert str(error.exception) == expected_message
Ejemplo n.º 28
0
def get_duplicate_docs(num_docs=10):
    """Create Documents in which consecutive pairs share the same content.

    The Document at index ``idx`` uses ``int(idx / 2)`` for both its
    embedding and its text.

    :param num_docs: number of Documents to create
    :return: tuple ``(documents, number of distinct content values)``
    """
    documents = []
    distinct_contents = set()
    for idx in range(num_docs):
        content = int(idx / 2)
        distinct_contents.add(content)

        document = jina_pb2.Document()
        document.embedding.CopyFrom(array2pb(np.array([content])))
        document.text = f'I am doc{content}'
        documents.append(document)
    return documents, len(distinct_contents)
Ejemplo n.º 29
0
 def request(field_type):
     """Build an ``IndexRequest`` with ten doc/groundtruth chunk pairs.

     Every doc and groundtruth gets one chunk with granularity 1. The chunk
     content depends on ``field_type``; in each case the groundtruth chunk
     holds one more element than the doc chunk. Other values of
     ``field_type`` leave the chunks without content.

     :param field_type: one of 'text', 'buffer' or 'blob'
     :return: the populated request
     """
     index_request = jina_pb2.Request.IndexRequest()
     for _ in range(10):
         doc_chunk = index_request.docs.add().chunks.add()
         groundtruth_chunk = index_request.groundtruths.add().chunks.add()
         doc_chunk.granularity = 1
         groundtruth_chunk.granularity = 1

         if field_type == 'text':
             doc_chunk.text = 'aaa'
             groundtruth_chunk.text = 'aaaa'
         elif field_type == 'buffer':
             doc_chunk.buffer = b'\x01\x02\x03'
             groundtruth_chunk.buffer = b'\x01\x02\x03\x04'
         elif field_type == 'blob':
             doc_chunk.blob.CopyFrom(array2pb(np.array([1, 1, 1])))
             groundtruth_chunk.blob.CopyFrom(
                 array2pb(np.array([1, 1, 1, 1])))
     return index_request
Ejemplo n.º 30
0
def random_docs(num_docs, chunks_per_doc=5, embed_dim=10):
    """Lazily generate Documents made only of randomly embedded chunks.

    Every chunk records the index of its Document in ``doc_id`` and a
    ``chunk_id`` numbered consecutively across all Documents, starting at 0.

    :param num_docs: number of Documents to yield
    :param chunks_per_doc: number of chunks added to every Document
    :param embed_dim: length of each random chunk embedding
    """
    next_chunk_id = 0
    for doc_index in range(num_docs):
        document = jina_pb2.Document()
        for _ in range(chunks_per_doc):
            chunk = document.chunks.add()
            embedding = np.random.random([embed_dim])
            chunk.embedding.CopyFrom(array2pb(embedding))
            chunk.chunk_id = next_chunk_id
            chunk.doc_id = doc_index
            next_chunk_id += 1
        yield document