Example #1
0
def create_chunk_matches_to_score():
    """Build a two-level document tree for ranker tests.

    doc (id: 100, granularity=0)
    |- chunk 10 -> matches 11, 12 (parent_id 1, scores 2, 3)
    |- chunk 20 -> matches 21, 22 (parent_id 2, scores 4, 5)
    """
    doc = Document()
    doc.id = '1'
    doc.granularity = 0
    num_matches = 2
    for parent_id in (1, 2):
        chunk = Document()
        chunk.id = str(parent_id * 10)
        chunk.granularity = doc.granularity + 1
        first_score = parent_id * 2
        for value in range(first_score, first_score + num_matches):
            match = Document()
            match.granularity = chunk.granularity
            match.parent_id = str(parent_id)
            match.score = NamedScore(value=value, ref_id=chunk.id)
            match.id = str(10 * parent_id + value)
            match.length = 4
            chunk.matches.append(match)
        doc.chunks.append(chunk)
    return doc
Example #2
0
def create_document_to_score():
    """Build doc 1 with chunks 2 and 3; each chunk carries two scored matches.

    chunk 2 -> matches (id 4, parent 40, score 4), (id 5, parent 50, score 5)
    chunk 3 -> matches (id 6, parent 60, score 6), (id 7, parent 70, score 7)
    """
    doc = Document()
    doc.id = '1'
    for chunk_idx in range(2):
        chunk = Document()
        chunk.id = str(chunk_idx + 2)
        for offset in range(2):
            match_id = 2 * int(chunk.id) + offset
            match = Document()
            match.id = str(match_id)
            match.parent_id = str(10 * match_id)
            match.length = match_id
            # score mirrors the match id; consumed by MaxRanker and MinRanker
            match.score = NamedScore(value=match_id, ref_id=chunk.id)
            match.tags['price'] = match.score.value
            match.tags['discount'] = DISCOUNT_VAL
            chunk.matches.append(match)
        doc.chunks.append(chunk)
    return doc
Example #3
0
def random_queries(num_docs, chunks_per_doc=5):
    """Yield ``num_docs`` query documents, each with ``chunks_per_doc`` chunks.

    Chunk ids continue sequentially after the doc id range.
    """
    for doc_idx in range(num_docs):
        query = Document()
        query.id = doc_idx
        for chunk_idx in range(chunks_per_doc):
            chunk = Document()
            chunk.id = num_docs + doc_idx * chunks_per_doc + chunk_idx
            query.chunks.add(chunk)
        yield query
Example #4
0
def get_docs_to_delete(doc_id_to_chunk_ids):
    """Yield one Document per mapping entry, carrying that entry's chunks.

    The generated id is the entry index repeated 16 times (e.g. '0' * 16);
    the mapping's keys are never used, only its values (chunk lists).
    """
    # iterate values only: the dict key was bound but unused in the original
    for i, chunks in enumerate(doc_id_to_chunk_ids.values()):
        document = Document()
        document.id = f'{i}' * 16  # f-string already yields str; str() was redundant
        for chunk in chunks:
            document.chunks.append(chunk)
        yield document
Example #5
0
def create_document(doc_id, text, weight, length):
    """Create a Document with a utf-8 encoded buffer, weight and length."""
    doc = Document()
    doc.id = str(doc_id)
    doc.weight = weight
    doc.length = length
    doc.buffer = text.encode('utf8')
    return doc
Example #6
0
def random_docs(start, end, embed_dim=10):
    """Yield docs for ids in [start, end): random text, random embedding."""
    for idx in range(start, end):
        doc = Document()
        doc.id = f'{idx:0>16}'
        doc.tags['id'] = idx
        letters = [random.choice(string.ascii_lowercase) for _ in range(10)]
        doc.text = ''.join(letters).encode('utf8')
        doc.embedding = np.random.random([embed_dim])
        yield doc
Example #7
0
def index_docs():
    """Return 100 docs with zero-padded 16-char ids and embedding [idx, idx]."""
    docs = []
    for idx in range(100):
        doc = Document()
        doc.id = f'{idx:0>16}'
        # fix: original had a duplicated `doc.embedding = doc.embedding = ...`
        doc.embedding = np.array([idx, idx])
        docs.append(doc)
    return docs
Example #8
0
def test_broken_document(segment_driver, text_segmenter_executor):
    """Segmenting a document whose id is an int must raise AttributeError."""
    segment_driver.attach(executor=text_segmenter_executor, runtime=None)

    broken = Document()
    broken.id = 1
    broken.text = 'invalid'

    with pytest.raises(AttributeError):
        segment_driver._apply_all([DocumentArray([broken])])
Example #9
0
def random_docs(start, end, embed_dim=10, jitter=1, has_content=True):
    """Yield docs with 16-char repeated-digit ids; optionally add content.

    With ``has_content`` the doc gets a random lowercase text and an
    embedding of size ``embed_dim`` plus a random jitter.
    """
    for idx in range(start, end):
        doc = Document()
        doc.id = f'{idx}' * 16
        if has_content:
            doc.tags['id'] = idx
            chars = (random.choice(string.ascii_lowercase) for _ in range(10))
            doc.text = ''.join(chars).encode('utf8')
            dim = embed_dim + np.random.randint(0, jitter)
            doc.embedding = np.random.random([dim])
        yield doc
Example #10
0
 def __init__(self, *args, **kwargs):
     """Pre-populate ``self.db`` with four serialized single-value documents.

     Document i (for i in 1..4) gets id ``str(i) * 16`` and embedding
     ``[int(id)]``; the db maps ``id2hash(doc.id)`` to the serialized doc.
     """
     super().__init__(*args, **kwargs)
     self.db = {}
     # fix: replace four copy-pasted doc-construction blocks with one loop
     for i in range(1, 5):
         doc = Document()
         doc.id = str(i) * 16
         doc.embedding = np.array([int(doc.id)])
         self.db[id2hash(doc.id)] = doc.SerializeToString()
Example #11
0
def test_broken_document():
    """The segment driver must raise AttributeError on an int-typed doc id."""
    driver = SimpleSegmentDriver()
    executor = MockSegmenter()
    driver.attach(executor=executor, runtime=None)

    broken = Document()
    broken.id = 1
    broken.text = 'invalid'

    with pytest.raises(AttributeError):
        driver._apply_all([DocumentSet([broken])])
Example #12
0
def random_docs(start, end):
    """Return a list of docs with random lowercase text and random embeddings."""
    documents = []
    for idx in range(start, end):
        doc = Document()
        doc.id = idx
        doc.tags['id'] = idx
        letters = (random.choice(string.ascii_lowercase) for _ in range(10))
        doc.text = ''.join(letters).encode('utf8')
        doc.embedding = np.random.random([10 + np.random.randint(0, 1)])
        documents.append(doc)
    return documents
Example #13
0
def test_broken_document():
    """Driver must reject a document whose id is an int instead of a string."""
    driver = SimpleSegmentDriver()
    executor = MockSegmenter()
    driver.attach(executor=executor, pea=None)

    bad_doc = Document()
    bad_doc.id = 1
    bad_doc.text = 'invalid'
    bad_doc.length = 2

    # sanity check: length assignment itself works fine
    assert bad_doc.length == 2

    with pytest.raises(AttributeError):
        driver._apply_all([bad_doc])
Example #14
0
def create_document_to_search():
    """Build doc '0'*16 with five chunks whose ids are '1'*16 .. '5'*16.

    Chunk 5 is expected to be missing from the KV indexer.
    """
    doc = Document()
    doc.id = '0' * 16
    for chunk_num in range(1, 6):
        chunk = doc.add_chunk()
        chunk.id = str(chunk_num) * 16
    return doc
def create_document_to_score_same_depth_level():
    """Build doc 1 with four same-depth matches.

    (id, parent_id, score, weight):
    (2, 20, 30, 3), (3, 20, 40, 4), (4, 30, 20, 2), (5, 30, 10, 1)
    """
    doc = Document()
    doc.id = 1

    specs = [
        (2, 20, 30, 3),
        (3, 20, 40, 4),
        (4, 30, 20, 2),
        (5, 30, 10, 1),
    ]
    for match_id, parent_id, score_value, weight in specs:
        match = Document()
        match.id = match_id
        match.parent_id = parent_id
        match.weight = weight
        match.score = NamedScore(value=score_value, ref_id=doc.id)
        doc.matches.append(match)
    return doc
Example #16
0
def create_document_to_search():
    """Build a doc with ten chunks; chunk c has id str(c)*16 and embedding [c]."""
    doc = Document()
    for idx in range(10):
        chunk = Document()
        chunk.id = str(idx) * 16
        chunk.embedding = np.array([idx])
        doc.chunks.append(chunk)
    return doc
Example #17
0
def document():
    """Build doc '0'*16 with chunks '1'*16 .. '5'*16.

    Chunk 5 is expected to be missing from the KV indexer.
    """
    doc = Document()
    doc.id = '0' * 16
    for num in range(1, 6):
        with Document() as chunk:
            chunk.id = str(num) * 16
        doc.chunks.add(chunk)
    return doc
Example #18
0
def create_document_to_search_with_matches_on_chunks():
    """Build doc '0'*16 with one chunk ('1'*16) carrying five matches.

    Match ids are '2'*16 .. '6'*16, each with score 1.0; matches 5 and 6
    are expected to be missing from the KV indexer.
    """
    doc = Document()
    doc.id = '0' * 16
    chunk = doc.add_chunk()
    chunk.id = '1' * 16
    for m in range(5):
        # fix: drop the unused `match =` binding; add_match mutates the chunk
        chunk.add_match(doc_id=str(m + 2) * 16, score_value=1.)
    return doc
Example #19
0
def create_document_to_score():
    """Build doc '1'*16 with matches ids 2..5 scoring 3, 6, 1, 8."""
    doc = Document()
    doc.id = '1' * 16
    doc.length = 5
    specs = [(2, 3), (3, 6), (4, 1), (5, 8)]
    for match_id, value in specs:
        with Document() as match:
            match.id = str(match_id) * 16
            match.length = value
            match.score.value = value
            doc.matches.append(match)
    return doc
Example #20
0
def create_document_to_search_with_matches_on_chunks():
    """Build doc '0'*16 with one chunk ('1'*16) holding five scored matches.

    Match ids are '2'*16 .. '6'*16 with score 1.0; matches 5 and 6 are
    expected to be missing from the KV indexer.
    """
    doc = Document()
    doc.id = '0' * 16
    chunk = doc.chunks.append()
    chunk.id = '1' * 16
    for match_num in range(2, 7):
        match = Document(id=str(match_num) * 16)
        match.score.value = 1.
        chunk.matches.append(match)
    return doc
def create_document_to_score():
    """Build doc '1'*20 with four matches carrying scores and weights.

    (id, score, weight): (2, 3, 16), (3, 6, 24), (4, 1, 8), (5, 8, 16)
    """
    doc = Document()
    doc.id = '1' * 20
    specs = [
        (2, 3, 16),
        (3, 6, 24),
        (4, 1, 8),
        (5, 8, 16),
    ]
    for match_id, score_value, weight in specs:
        with Document() as match:
            match.id = match_id
            match.score = NamedScore(value=score_value, ref_id=doc.id)
            match.weight = weight
            doc.matches.append(match)
    return doc
Example #22
0
def evaluate_docs():
    """Build 100 evaluation docs.

    Each has a zero-padded 16-char id, tag ``groundtruth`` = False and
    text ``'aaa'``.
    """
    docs = []
    for idx in range(100):
        doc = Document()
        doc.id = f'{idx:0>16}'
        doc.tags['groundtruth'] = False
        doc.text = 'aaa'
        docs.append(doc)
    return docs
Example #23
0
def index_groundtruth():
    """Build groundtruth docs (tag ``groundtruth`` = True, text ``'aa'``).

    Covers ids 0..99 except 5, 10 and 50, for which no groundtruth exists.
    """
    skipped = {5, 10, 50}
    docs = []
    for idx in range(100):
        if idx in skipped:
            continue
        doc = Document()
        doc.id = f'{idx:0>16}'
        doc.tags['groundtruth'] = True
        doc.text = 'aa'
        docs.append(doc)
    return docs
Example #24
0
 def documents(embedding_cls_type):
     """Build a doc with ten chunks whose embeddings use the requested backend.

     ``embedding_cls_type`` selects one of: 'dense' (numpy), 'scipy_csr',
     'scipy_coo', 'torch' (sparse COO tensor) or 'tf' (SparseTensor).
     """
     doc = Document()
     for idx in range(10):
         chunk = Document()
         chunk.id = str(idx) * 16
         dense = np.random.random([10])
         if embedding_cls_type == 'dense':
             chunk.embedding = dense
         elif embedding_cls_type == 'scipy_csr':
             chunk.embedding = scipy.sparse.csr_matrix(dense)
         elif embedding_cls_type == 'scipy_coo':
             chunk.embedding = scipy.sparse.coo_matrix(dense)
         elif embedding_cls_type == 'torch':
             coo = scipy.sparse.coo_matrix(dense)
             stacked = np.vstack((coo.row, coo.col))
             chunk.embedding = torch.sparse_coo_tensor(
                 stacked,
                 coo.data,
                 coo.shape,
             )
         elif embedding_cls_type == 'tf':
             coo = scipy.sparse.coo_matrix(dense)
             pairs = [(r, c) for r, c in zip(coo.row, coo.col)]
             chunk.embedding = tf.SparseTensor(
                 indices=pairs,
                 values=coo.data,
                 dense_shape=[1, 10],
             )
         doc.chunks.append(chunk)
     return doc
Example #25
0
def create_document(doc_id, text, weight):
    """Create a Document with a utf-8 encoded buffer and the given weight."""
    doc = Document()
    doc.id = str(doc_id)
    doc.weight = weight
    doc.buffer = text.encode('utf8')
    return doc
Example #26
0
def get_docs_to_index(contents):
    """Yield a Document per content string.

    The id of document i is the digit i repeated 16 times (e.g. '0' * 16).
    """
    for i, content in enumerate(contents):
        document = Document()
        document.id = f'{i}' * 16  # f-string already yields str; str() was redundant
        document.text = content
        yield document