Example #1
 def index_documents():
     """Index Documents:
         doc: tag__id = 0
              tag__dummy_score = 0
              embedding = 0
         doc: tag__id = 1
              tag__dummy_score = -1
              embedding = 1
         doc: tag__id = 2
              tag__dummy_score = -2
              embedding = 2
     """
     doc0 = jina_pb2.Document()
     doc0.tags['id'] = '0'
     doc0.tags['dummy_score'] = 0
     doc0.embedding.CopyFrom(array2pb(np.array([0])))
     doc1 = jina_pb2.Document()
     doc1.tags['id'] = '1'
     doc1.tags['dummy_score'] = -1
     doc1.embedding.CopyFrom(array2pb(np.array([1])))
     doc2 = jina_pb2.Document()
     doc2.tags['id'] = '2'
     doc2.tags['dummy_score'] = -2
     doc2.embedding.CopyFrom(array2pb(np.array([2])))
     return [doc0, doc1, doc2]
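
These fixtures lean on two helpers: array2pb, which packs a numpy array into the protobuf message behind Document.embedding, and (in the same 0.x-era jina helper module) a pb2array counterpart for the reverse direction. A minimal round-trip sketch, assuming both are importable from jina.drivers.helper:

import numpy as np
from jina.drivers.helper import array2pb, pb2array  # assumed 0.x location

for i, doc in enumerate(index_documents()):
    assert doc.tags['id'] == str(i)       # tags live in a protobuf Struct
    assert doc.tags['dummy_score'] == -i  # numeric tag values stay numeric
    # pb2array is assumed to be the exact inverse of array2pb
    np.testing.assert_equal(pb2array(doc.embedding), np.array([i]))
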
Example #2
 def index_documents():
     """Index Documents:
         doc: tag__id = 0
              tag__dummy_score = 0
              embedding = 0
         doc: tag__id = 1
              tag__dummy_score = -1
              embedding = 1
         doc: tag__id = 2
              tag__dummy_score = -2
              embedding = 2
     """
     doc0 = jina_pb2.Document()
     doc0.tags['id'] = '0'
     doc0.tags['dummy_score'] = 0
     GenericNdArray(doc0.embedding).value = np.array([0])
     doc1 = jina_pb2.Document()
     doc1.tags['id'] = '1'
     doc1.tags['dummy_score'] = -1
     GenericNdArray(doc1.embedding).value = np.array([1])
     doc2 = jina_pb2.Document()
     doc2.tags['id'] = '2'
     doc2.tags['dummy_score'] = -2
     GenericNdArray(doc2.embedding).value = np.array([2])
     return [doc0, doc1, doc2]
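
Example #2 is the same fixture ported to the newer GenericNdArray wrapper, which replaces the explicit array2pb(...)/CopyFrom(...) pair: assigning to .value writes the array into the wrapped protobuf in place. Assuming the property is symmetric, reading the embeddings back looks like:

import numpy as np

for i, doc in enumerate(index_documents()):
    # .value is assumed to deserialize the proto back into the assigned array
    np.testing.assert_equal(GenericNdArray(doc.embedding).value, np.array([i]))
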
Example #3
def input_index_data(num_docs=None, batch_size=8, dataset='f30k'):
    from dataset import get_data_loader
    captions_file = 'dataset_flickr30k.json' if dataset == 'f30k' else 'captions.txt'
    data_loader = get_data_loader(root=os.path.join(cur_dir,
                                                    f'data/{dataset}/images'),
                                  captions=os.path.join(
                                      cur_dir, f'data/{dataset}/{captions_file}'),
                                  split='test',
                                  batch_size=batch_size,
                                  dataset_type=dataset)
    for i, (images, captions) in enumerate(data_loader):
        for image in images:
            document = jina_pb2.Document()
            document.buffer = image
            document.modality = 'image'
            document.mime_type = 'image/jpeg'
            yield document

        for caption in captions:
            document = jina_pb2.Document()
            document.text = caption
            document.modality = 'text'
            document.mime_type = 'text/plain'
            yield document

        if num_docs and (i + 1) * batch_size >= num_docs:
            break
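
input_index_data yields one Document per image and one per caption, batch by batch, so a caller can cap the stream without materializing the whole split. A small consumption sketch using only the standard library:

import itertools

# peek at the first ten documents only; the generator is never exhausted
for doc in itertools.islice(input_index_data(num_docs=100, batch_size=8), 10):
    print(doc.modality, doc.mime_type)
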
Example #4
def test_docgroundtruth_pair():
    def add_matches(doc: jina_pb2.Document, num_matches):
        for idx in range(num_matches):
            match = doc.matches.add()
            match.adjacency = doc.adjacency + 1

    def add_chunks(doc: jina_pb2.Document, num_chunks):
        for idx in range(num_chunks):
            chunk = doc.chunks.add()
            chunk.granularity = doc.granularity + 1

    doc = jina_pb2.Document()
    gt = jina_pb2.Document()
    add_matches(doc, 3)
    add_matches(gt, 3)
    add_chunks(doc, 3)
    add_chunks(gt, 3)

    pair = DocGroundtruthPair(doc, gt)

    j = 0
    for chunk_pair in pair.chunks:
        assert chunk_pair.doc.granularity == 1
        assert chunk_pair.groundtruth.granularity == 1
        j += 1

    k = 0
    for match_pair in pair.matches:
        assert match_pair.doc.adjacency == 1
        assert match_pair.groundtruth.adjacency == 1
        k += 1

    assert j == 3
    assert k == 3
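
The test only exercises traversal, so it is worth spelling out the contract it assumes of DocGroundtruthPair: a thin wrapper that walks a document and its ground truth in lockstep and re-wraps each aligned chunk and match. A minimal sketch of such a wrapper (not jina's actual implementation):

class DocGroundtruthPair:
    def __init__(self, doc, groundtruth):
        self.doc = doc
        self.groundtruth = groundtruth

    @property
    def chunks(self):
        # assumes both sides carry the same number of chunks
        for c, gt_c in zip(self.doc.chunks, self.groundtruth.chunks):
            yield DocGroundtruthPair(c, gt_c)

    @property
    def matches(self):
        for m, gt_m in zip(self.doc.matches, self.groundtruth.matches):
            yield DocGroundtruthPair(m, gt_m)
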
Example #5
def test_flow_with_modalities(tmpdir):
    os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE'] = str(tmpdir)

    def input_fn():
        doc1 = jina_pb2.Document()
        doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
        doc1.id = uid.new_doc_id(doc1)

        doc2 = jina_pb2.Document()
        doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
        doc2.id = uid.new_doc_id(doc2)

        doc3 = jina_pb2.Document()
        doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
        doc3.id = uid.new_doc_id(doc3)

        return [doc1, doc2, doc3]

    flow = Flow().add(name='crafter', uses='!MockSegmenter'). \
        add(name='encoder1', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode1.yml')). \
        add(name='indexer1', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-1.yml'), needs=['encoder1']). \
        add(name='encoder2', uses=os.path.join(cur_dir, 'yaml/mockencoder-mode2.yml'), needs=['crafter']). \
        add(name='indexer2', uses=os.path.join(cur_dir, 'yaml/numpy-indexer-2.yml')). \
        join(['indexer1', 'indexer2'])

    with flow:
        flow.index(input_fn=input_fn, override_doc_id=False)

    with open(tmpdir.join('vec1.gz'), 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(
            result,
            np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]))

    with open(tmpdir.join('vec2.gz'), 'rb') as fp:
        result = np.frombuffer(fp.read(), dtype='float').reshape([-1, 3])
        np.testing.assert_equal(
            result,
            np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]))

    chunkIndexer1 = BinaryPbIndexer.load(tmpdir.join('kvidx1.bin'))
    assert chunkIndexer1.size == 3
    d_id = list(chunkIndexer1.query_handler.header.keys())[0]

    query_doc = jina_pb2.Document()
    query_doc.ParseFromString(chunkIndexer1.query(d_id))
    assert query_doc.text == 'title: this is mode1 from doc1'
    assert query_doc.modality == 'mode1'

    chunkIndexer2 = BinaryPbIndexer.load(tmpdir.join('kvidx2.bin'))
    assert chunkIndexer2.size == 3
    d_id = list(chunkIndexer2.query_handler.header.keys())[0]

    query_doc = jina_pb2.Document()
    query_doc.ParseFromString(chunkIndexer2.query(d_id))
    assert query_doc.text == ' body: this is mode2 from doc1'
    assert query_doc.modality == 'mode2'

    del os.environ['JINA_TEST_FLOW_MULTIMODE_WORKSPACE']
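
One pattern worth noting: the manual os.environ[...] = ... / del os.environ[...] bracket leaks the variable if any assertion in between fails. pytest's built-in monkeypatch fixture restores the environment automatically; the same test could open as follows, with no cleanup line at the end:

def test_flow_with_modalities(tmpdir, monkeypatch):
    # monkeypatch undoes the setenv when the test ends, pass or fail
    monkeypatch.setenv('JINA_TEST_FLOW_MULTIMODE_WORKSPACE', str(tmpdir))
    # ... rest of the test body unchanged ...
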
Example #6
def create_documents_to_craft():
    doc1 = jina_pb2.Document()
    doc1.id = 1
    doc1.text = 'valid'
    doc2 = jina_pb2.Document()
    doc2.id = 2
    doc2.text = 'invalid'
    return [doc1, doc2]
Example #7
def random_docs_to_chunk():
    d1 = jina_pb2.Document()
    d1.id = 1
    d1.text = 'chunk1 chunk2'
    yield d1
    d2 = jina_pb2.Document()
    d2.id = 1
    d2.text = 'chunk3'
    yield d2
Example #8
def random_docs_with_chunks(num_docs):
    d1 = jina_pb2.Document()
    d1.id = 1
    d1.text = 'chunk1 chunk2'
    yield d1
    d2 = jina_pb2.Document()
    d2.id = 1
    d2.text = 'chunk3'
    yield d2
Example #9
def ground_truth_pairs():
    num_docs = 10
    pairs = []
    for idx in range(num_docs):
        doc = jina_pb2.Document()
        gt = jina_pb2.Document()
        doc.embedding.CopyFrom(array2pb(np.array([1, 1])))
        gt.embedding.CopyFrom(array2pb(np.array([2, 2])))
        pairs.append(DocGroundtruthPair(doc=doc, groundtruth=gt))
    return pairs
Example #10
def create_documents_to_segment():
    doc1 = jina_pb2.Document()
    doc1.id = 1
    doc1.text = 'valid'
    doc1.length = 2
    doc2 = jina_pb2.Document()
    doc2.id = 2
    doc2.text = 'invalid'
    doc2.length = 2
    return [doc1, doc2]
Example #11
def ground_truth_pairs():
    num_docs = 10
    pairs = []
    for idx in range(num_docs):
        doc = jina_pb2.Document()
        gt = jina_pb2.Document()
        GenericNdArray(doc.embedding).value = np.array([1, 1])
        GenericNdArray(gt.embedding).value = np.array([2, 2])
        pairs.append(DocGroundtruthPair(doc=doc, groundtruth=gt))
    return pairs
Example #12
def random_docs_with_tags():
    d1 = jina_pb2.Document()
    d1.id = 1
    d1.text = 'a'
    d1.tags.update({'id': 1})
    yield d1
    d2 = jina_pb2.Document()
    d2.id = 2
    d2.tags.update({'id': 2})
    d2.text = 'b'
    yield d2
Example #13
    def create_document_ground_truth_pairs(num_docs):
        def add_matches(doc: jina_pb2.Document, num_matches):
            for idx in range(num_matches):
                match = doc.matches.add()
                match.tags['id'] = idx

        pairs = []
        for idx in range(num_docs):
            doc = jina_pb2.Document()
            gt = jina_pb2.Document()
            add_matches(doc, num_docs)
            add_matches(gt, num_docs)
            pairs.append(DocGroundtruthPair(doc=doc, groundtruth=gt))
        return pairs
Example #14
    def input_fn():
        doc1 = jina_pb2.Document()
        doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
        doc1.id = uid.new_doc_id(doc1)

        doc2 = jina_pb2.Document()
        doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
        doc2.id = uid.new_doc_id(doc2)

        doc3 = jina_pb2.Document()
        doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
        doc3.id = uid.new_doc_id(doc3)

        return [doc1, doc2, doc3]
Example #15
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     doc1 = jina_pb2.Document()
     doc1.id = '1'
     doc1.embedding.CopyFrom(array2pb(np.array([int(doc1.id)])))
     doc2 = jina_pb2.Document()
     doc2.id = '2'
     doc2.embedding.CopyFrom(array2pb(np.array([int(doc2.id)])))
     doc3 = jina_pb2.Document()
     doc3.id = '3'
     doc3.embedding.CopyFrom(array2pb(np.array([int(doc3.id)])))
     doc4 = jina_pb2.Document()
     doc4.id = '4'
     doc4.embedding.CopyFrom(array2pb(np.array([int(doc4.id)])))
     self.db = {1: doc1, 2: doc2, 3: doc3, 4: doc4}
Example #16
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     doc1 = jina_pb2.Document()
     doc1.id = 1
     doc1.embedding.CopyFrom(array2pb(np.array([doc1.id])))
     doc2 = jina_pb2.Document()
     doc2.id = 2
     doc2.embedding.CopyFrom(array2pb(np.array([doc2.id])))
     doc3 = jina_pb2.Document()
     doc3.id = 3
     doc3.embedding.CopyFrom(array2pb(np.array([doc3.id])))
     doc4 = jina_pb2.Document()
     doc4.id = 4
     doc4.embedding.CopyFrom(array2pb(np.array([doc4.id])))
     self.db = {1: doc1, 2: doc2, 3: doc3, 4: doc4}
Example #17
def test_segment_driver():
    valid_doc = jina_pb2.Document()
    valid_doc.id = 1
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    driver = SimpleSegmentDriver(first_chunk_id=3)
    executor = MockSegmenter()
    driver.attach(executor=executor, pea=None)
    driver._apply(valid_doc)

    assert valid_doc.length == 2

    assert valid_doc.chunks[0].id == 3
    assert valid_doc.chunks[0].parent_id == valid_doc.id
    assert valid_doc.chunks[0].blob == array2pb(np.array([0.0, 0.0, 0.0]))
    assert valid_doc.chunks[0].weight == 0
    assert valid_doc.chunks[0].length == 3
    assert valid_doc.chunks[0].mime_type == 'text/plain'

    assert valid_doc.chunks[1].id == 4
    assert valid_doc.chunks[1].parent_id == valid_doc.id
    assert valid_doc.chunks[1].blob == array2pb(np.array([1.0, 1.0, 1.0]))
    assert valid_doc.chunks[1].weight == 1
    assert valid_doc.chunks[1].length == 3
    assert valid_doc.chunks[1].mime_type == 'image/png'

    assert valid_doc.chunks[2].id == 5
    assert valid_doc.chunks[2].parent_id == valid_doc.id
    assert valid_doc.chunks[2].blob == array2pb(np.array([2.0, 2.0, 2.0]))
    assert valid_doc.chunks[2].weight == 2
    assert valid_doc.chunks[2].length == 3
    assert valid_doc.chunks[2].mime_type == 'image/png'
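
The assertions pin down what MockSegmenter must return: three sub-documents with increasing blobs and weights, where the first overrides the parent's mime type and the other two inherit image/png. A sketch of a segmenter that would satisfy them, assuming the 0.x convention that a segmenter's craft returns a list of dicts which the driver turns into chunks:

import numpy as np
from jina.executors.crafters import BaseSegmenter  # assumed 0.x base class

class MockSegmenter(BaseSegmenter):
    def craft(self, text, *args, **kwargs):
        return [
            {'blob': np.array([0.0, 0.0, 0.0]), 'weight': 0, 'mime_type': 'text/plain'},
            {'blob': np.array([1.0, 1.0, 1.0]), 'weight': 1},  # mime inherited from parent
            {'blob': np.array([2.0, 2.0, 2.0]), 'weight': 2},
        ]
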
Example #18
def random_docs_with_matches(num_docs):
    docs = []
    # matches always sit at the same level_depth as the document they match
    for j in range(num_docs):
        d = jina_pb2.Document()
        d.level_depth = 0
        d.id = j
        d.text = 'hello world'
        d.uri = 'doc://'
        for m in range(10):
            dm = d.matches.add()
            dm.text = 'match to hello world'
            dm.level_depth = 0
            dm.uri = 'doc://match'
            dm.id = m
            dm.score.ref_id = d.id
            for mm in range(10):
                dmm = dm.matches.add()
                dmm.text = 'nested match to match'
                dmm.uri = 'doc://match/match'
                dmm.id = mm
                dmm.score.ref_id = m
                dmm.level_depth = 0
        docs.append(d)
    return docs
Example #19
def multimodal_all_types_documents():
    docs = []
    for idx in range(0, NUM_DOCS):
        """
        doc - idx
            |
            | - chunk - embedding [idx, idx] - modality1
            | - chunk - blob [idx, idx, idx] - modality2
            | - chunk - text 'modality3' - modality3 -> Inside multimodal encoder will be encoded into [3, 3]
            | - chunk - buffer b'modality4' - modality4 -> Inside multimodal encoder will be encoded into [4, 4]
        Result:
            doc - idx - embedding [idx, idx, idx, idx, idx, 3, 3, 4, 4]
        """
        doc = jina_pb2.Document()
        doc.text = f'{idx}'

        for modality in ['modality1', 'modality2', 'modality3', 'modality4']:
            chunk = doc.chunks.add()
            chunk.modality = modality
            if modality == 'modality1':
                GenericNdArray(chunk.embedding).value = np.array([idx, idx])
            elif modality == 'modality2':
                GenericNdArray(chunk.blob).value = np.array([idx, idx, idx])
            elif modality == 'modality3':
                chunk.text = 'modality3'
            elif modality == 'modality4':
                chunk.buffer = 'modality4'.encode()
        docs.append(doc)
    return docs
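
The expected embedding in the docstring is simply the concatenation of the four per-modality encodings; a quick numpy check of that arithmetic:

import numpy as np

idx = 5  # any document index
expected = np.concatenate([
    np.array([idx, idx]),       # modality1: the chunk embedding as-is
    np.array([idx, idx, idx]),  # modality2: encoded from the blob
    np.array([3, 3]),           # modality3: what the encoder emits for the text chunk
    np.array([4, 4]),           # modality4: what the encoder emits for the buffer chunk
])
assert expected.tolist() == [idx, idx, idx, idx, idx, 3, 3, 4, 4]
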
Example #20
def random_queries(num_docs, chunks_per_doc=5, embed_dim=10):
    for j in range(num_docs):
        d = jina_pb2.Document()
        for k in range(chunks_per_doc):
            dd = d.topk_results.add()
            dd.match_doc.doc_id = k
        yield d
Example #21
def index_generator(num_doc, target):
    for j in range(num_doc):
        label_int = target['index-labels']['data'][j][0]
        d = jina_pb2.Document()
        d.blob.CopyFrom(array2pb((target['index']['data'][j])))
        d.tags.update({'label': get_mapped_label(label_int)})
        yield d
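
index_generator assumes a target mapping shaped like the fashion-MNIST data used in jina's hello-world: raw arrays under 'index' and one integer label per row under 'index-labels'. An illustrative stub of that shape (values are hypothetical):

import numpy as np

num_doc = 8
target = {
    'index': {'data': np.random.random((num_doc, 28, 28))},            # e.g. images
    'index-labels': {'data': np.random.randint(0, 10, (num_doc, 1))},  # one label per doc
}
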
Example #22
def test_queryset_with_struct(random_workspace):
    total_docs = 4
    docs = []
    for doc_id in range(total_docs):
        doc = jina_pb2.Document()
        doc.text = f'I am doc{doc_id}'
        doc.embedding.CopyFrom(array2pb(np.array([doc_id])))
        doc.tags['label'] = f'label{doc_id%2 + 1}'
        docs.append(doc)

    f = Flow().add(
        uses='- !FilterQL | {lookups: {tags__label__in: [label1, label2]}, traversal_paths: [r]}'
    )

    def validate_all_docs(resp):
        assert len(resp.docs) == total_docs

    def validate_label2_docs(resp):
        assert len(resp.docs) == total_docs / 2

    with f:
        # keep all the docs
        f.index(docs, output_fn=validate_all_docs, callback_on_body=True)

        # keep only the docs with label2
        qs = jina_pb2.QueryLang(name='FilterQL', priority=1)
        qs.parameters['lookups'] = {'tags__label': 'label2'}
        qs.parameters['traversal_paths'] = ['r']
        f.index(docs,
                queryset=qs,
                output_fn=validate_label2_docs,
                callback_on_body=True)
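
The two lookups differ only in their operator: tags__label__in keeps any document whose label appears in the given list, while the bare tags__label key is an exact match. A plain-python sketch of that semantics (mimicking FilterQL, not reusing it, and supporting only the two keys used above):

def keep(doc, lookups):
    label = doc.tags['label']
    if 'tags__label__in' in lookups and label not in lookups['tags__label__in']:
        return False
    if 'tags__label' in lookups and label != lookups['tags__label']:
        return False
    return True

# half of the docs carry label2, matching validate_label2_docs
assert sum(keep(d, {'tags__label': 'label2'}) for d in docs) == total_docs / 2
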
Example #23
def create_chunk_matches_to_score():
    # doc: (id: 100, level_depth=0)
    # |- chunks: (id: 10)
    # |  |- matches: (id: 11, parent_id: 1, score.value: 2, level_depth=1),
    # |  |- matches: (id: 12, parent_id: 1, score.value: 3, level_depth=1),
    # |- chunks: (id: 20)
    #    |- matches: (id: 21, parent_id: 2, score.value: 4, level_depth=1),
    #    |- matches: (id: 22, parent_id: 2, score.value: 5, level_depth=1)
    doc = jina_pb2.Document()
    doc.id = 100
    doc.level_depth = 0
    num_matches = 2
    for parent_id in range(1, 3):
        chunk = doc.chunks.add()
        chunk.id = parent_id * 10
        chunk.level_depth = doc.level_depth + 1
        for score_value in range(parent_id * 2, parent_id * 2 + num_matches):
            match = chunk.matches.add()
            match.level_depth = chunk.level_depth
            match.parent_id = parent_id
            match.score.value = score_value
            match.score.ref_id = chunk.id
            match.id = 10 * parent_id + score_value
            match.length = 4
    return doc
Example #24
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     doc1 = jina_pb2.Document()
     doc1.id = '01'
     doc1.tags['groundtruth'] = True
     doc2 = jina_pb2.Document()
     doc2.id = '02'
     doc2.tags['groundtruth'] = True
     doc4 = jina_pb2.Document()
     doc4.id = '04'
     doc4.tags['groundtruth'] = True
     self.db = {
         uid.id2hash(doc1.id): doc1.SerializeToString(),
         uid.id2hash(doc2.id): doc2.SerializeToString(),
         uid.id2hash(doc4.id): doc4.SerializeToString()
     }
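
Because the db maps hashed ids to serialized protobufs, a lookup has to hash the string id and parse the bytes back into a Document:

doc = jina_pb2.Document()
doc.ParseFromString(self.db[uid.id2hash('01')])
assert doc.tags['groundtruth']
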
Example #25
def create_chunk_chunk_matches_to_score():
    # doc: (id: 100, granularity=0)
    # |- chunk: (id: 101, granularity=1)
    #       |- chunks: (id: 10)
    #       |   |- matches: (id: 11, parent_id: 1, score.value: 2),
    #       |   |- matches: (id: 12, parent_id: 1, score.value: 3),
    #       |- chunks: (id: 20)
    #           |- matches: (id: 21, parent_id: 2, score.value: 4),
    #           |- matches: (id: 22, parent_id: 2, score.value: 5)
    doc = jina_pb2.Document()
    doc.id = 100
    doc.granularity = 0
    chunk = doc.chunks.add()
    chunk.id = 101
    chunk.granularity = doc.granularity + 1
    num_matches = 2
    for parent_id in range(1, 3):
        chunk_chunk = chunk.chunks.add()
        chunk_chunk.id = parent_id * 10
        chunk_chunk.granularity = chunk.granularity + 1
        for score_value in range(parent_id * 2, parent_id * 2 + num_matches):
            match = chunk_chunk.matches.add()
            match.parent_id = parent_id
            match.score.value = score_value
            match.score.ref_id = chunk_chunk.id
            match.id = 10 * parent_id + score_value
            match.length = 4
    return doc
Example #26
def create_documents_to_encode(num_docs):
    docs = []
    for idx in range(num_docs):
        doc = jina_pb2.Document()
        doc.blob.CopyFrom(array2pb(np.array([idx])))
        docs.append(doc)
    return docs
Example #27
def random_docs(num_docs):
    vecs = np.random.random([num_docs, 2])
    for j in range(num_docs):
        d = jina_pb2.Document()
        d.id = j
        d.embedding.CopyFrom(array2pb(vecs[j]))
        yield d
Example #28
def random_queries(num_docs, chunks_per_doc=5):
    for j in range(num_docs):
        d = jina_pb2.Document()
        for k in range(chunks_per_doc):
            dd = d.chunks.add()  # Document has no .add(); chunks is the repeated field
            dd.id = k + 1  # 1-indexed
        yield d
Example #29
def search_generator(path: str, buffer: bytes):
    d = jina_pb2.Document()
    if buffer:
        d.buffer = buffer
    if path:
        d.uri = path
    yield d
Example #30
 def _create_Document(self, doc_id, text, weight, length):
     d = jina_pb2.Document()
     d.id = doc_id
     d.buffer = text.encode('utf8')
     d.weight = weight
     d.length = length
     return d