def create_document(doc_id, text, weight, length):
    """Build a ``Document`` holding UTF-8 encoded text and a 16-character id.

    The id is produced by repeating ``str(doc_id)`` 16 times and keeping the
    first 16 characters. It is assigned on the document's ``_document``
    attribute rather than through ``Document.id``.

    :param doc_id: value whose string form seeds the id
    :param text: text stored, UTF-8 encoded, as the document buffer
    :param weight: value assigned to ``weight``
    :param length: value assigned to ``length``
    :return: the populated ``Document``
    """
    document = Document()
    repeated_id = str(doc_id) * 16
    document._document.id = repeated_id[:16]
    document.buffer = text.encode('utf8')
    document.weight = weight
    document.length = length
    return document
def test_segment_driver():
    """Check the chunks ``SimpleSegmentDriver`` attaches to a document.

    After the driver runs with ``MockSegmenter``, the document keeps its own
    ``length`` of 2 and exposes three chunks whose tag id, blob, weight,
    length, parent id and mime type are verified below.
    """
    valid_doc = Document()
    valid_doc.text = 'valid'
    valid_doc.length = 2
    valid_doc.mime_type = 'image/png'

    driver = SimpleSegmentDriver()
    executor = MockSegmenter()
    driver.attach(executor=executor, runtime=None)
    driver._apply_all(DocumentSet([valid_doc]))

    # The driver must not overwrite the parent document's own length.
    assert valid_doc.length == 2

    # One row per chunk: (tags['id'], blob fill value, weight, mime_type).
    # NOTE(review): only the first chunk reports 'text/plain'; the other two
    # presumably fall back to the parent's mime type - confirm against
    # MockSegmenter.
    expected_chunks = (
        (3, 0.0, 0.0, 'text/plain'),
        (4, 1.0, 1.0, 'image/png'),
        (5, 2.0, 2.0, 'image/png'),
    )
    for position, (tag_id, fill, weight, mime_type) in enumerate(expected_chunks):
        chunk = valid_doc.chunks[position]
        assert chunk.tags['id'] == tag_id
        assert chunk.parent_id == valid_doc.id
        np.testing.assert_equal(chunk.blob, np.array([fill, fill, fill]))
        assert chunk.weight == weight
        assert chunk.length == 3
        assert chunk.mime_type == mime_type
def create_chunk_matches_to_score():
    """Build a document with two chunks, each holding two scored matches.

    Layout produced by the code (ids are strings)::

        doc: (id: '1', granularity: 0)
        |- chunk: (id: '10', granularity: doc granularity + 1)
        |  |- match: (id: '12', parent_id: '1', score.value: 2)
        |  |- match: (id: '13', parent_id: '1', score.value: 3)
        |- chunk: (id: '20', granularity: doc granularity + 1)
           |- match: (id: '24', parent_id: '2', score.value: 4)
           |- match: (id: '25', parent_id: '2', score.value: 5)

    Every match copies its chunk's granularity, has ``length`` 4, and carries
    a ``NamedScore`` whose ``ref_id`` is the owning chunk's id.

    :return: the root ``Document``
    """
    matches_per_chunk = 2

    root = Document()
    root.id = '1'
    root.granularity = 0

    for parent_number in (1, 2):
        chunk = Document()
        chunk.id = str(parent_number * 10)
        chunk.granularity = root.granularity + 1

        # Scores run from 2 * parent_number upwards, one per match.
        first_score = parent_number * 2
        for score_value in range(first_score, first_score + matches_per_chunk):
            match = Document()
            match.granularity = chunk.granularity
            match.parent_id = str(parent_number)
            match.score = NamedScore(value=score_value, ref_id=chunk.id)
            match.id = str(10 * parent_number + score_value)
            match.length = 4
            chunk.matches.append(match)

        root.chunks.append(chunk)

    return root
def create_document_to_score():
    """Build a document with two chunks, each holding two scored matches.

    Layout produced by the code (ids are strings)::

        doc: '1'
        |- chunk: '2'
        |  |- match: (id: '4', parent_id: '40', length: 4, score.value: 4)
        |  |- match: (id: '5', parent_id: '50', length: 5, score.value: 5)
        |- chunk: '3'
           |- match: (id: '6', parent_id: '60', length: 6, score.value: 6)
           |- match: (id: '7', parent_id: '70', length: 7, score.value: 7)

    Each match's score references its chunk's id. Each match is also tagged
    with ``price`` (read back from its score value) and ``discount``
    (``DISCOUNT_VAL``).

    :return: the root ``Document``
    """
    root = Document()
    root.id = '1'

    for chunk_number in (2, 3):
        chunk = Document()
        chunk.id = str(chunk_number)

        for offset in (0, 1):
            match_number = 2 * chunk_number + offset

            match = Document()
            match.id = str(match_number)
            match.parent_id = str(10 * match_number)
            # to be used by MaxRanker and MinRanker
            match.length = match_number
            match.score = NamedScore(value=match_number, ref_id=chunk.id)
            match.tags['price'] = match.score.value
            match.tags['discount'] = DISCOUNT_VAL
            chunk.matches.append(match)

        root.chunks.append(chunk)

    return root
def create_document(doc_id, text, weight, length):
    """Build a ``Document`` from an id, text, weight and length.

    :param doc_id: value assigned to ``id`` as given
    :param text: text stored, UTF-8 encoded, as the document buffer
    :param weight: value assigned to ``weight``
    :param length: value assigned to ``length``
    :return: the populated ``Document``
    """
    document = Document()
    document.id = doc_id
    encoded_text = text.encode('utf8')
    document.buffer = encoded_text
    document.weight = weight
    document.length = length
    return document
def test_broken_document():
    """Expect ``AttributeError`` when the driver is applied to a plain list.

    NOTE(review): the failure presumably comes from passing a Python list
    instead of a document-set type to ``_apply_all`` - confirm against the
    driver implementation.
    """
    segment_driver = SimpleSegmentDriver()
    segment_driver.attach(executor=MockSegmenter(), pea=None)

    invalid_doc = Document()
    invalid_doc.id = 1
    invalid_doc.text = 'invalid'
    invalid_doc.length = 2
    # Sanity check that the document itself accepted the assigned length.
    assert invalid_doc.length == 2

    docs = [invalid_doc]
    with pytest.raises(AttributeError):
        segment_driver._apply_all(docs)
def create_document_to_score():
    """Build a document with four directly attached, scored matches.

    Layout produced by the code (each id is one digit repeated 16 times)::

        doc: (id: '1' * 16, length: 5)
        |- match: (id: '2' * 16, length: 3, score.value: 3)
        |- match: (id: '3' * 16, length: 6, score.value: 6)
        |- match: (id: '4' * 16, length: 1, score.value: 1)
        |- match: (id: '5' * 16, length: 8, score.value: 8)

    :return: the root ``Document``
    """
    id_width = 16

    root = Document()
    root.id = '1' * id_width
    root.length = 5

    # (id digit, value used for both length and score)
    match_specs = ((2, 3), (3, 6), (4, 1), (5, 8))
    for id_digit, score_value in match_specs:
        with Document() as match:
            match.id = str(id_digit) * id_width
            match.length = score_value
            match.score.value = score_value
            # Appended while the context is still open; what Document's
            # context exit does is not visible here, so the order is kept.
            root.matches.append(match)

    return root
def create_document_to_score():
    """Build a document with four scored matches whose ids vary in width.

    Layout produced by the code (each id is one digit repeated N times)::

        doc: (id: '1' * 20, length: 5)
        |- match: (id: '2' * 16, length: 3, score.value: 3)
        |- match: (id: '3' * 24, length: 6, score.value: 6)
        |- match: (id: '4' * 8,  length: 1, score.value: 1)
        |- match: (id: '5' * 16, length: 8, score.value: 8)

    Every match's ``NamedScore`` uses the root document's id as ``ref_id``.

    :return: the root ``Document``
    """
    root = Document()
    root.id = '1' * 20
    root.length = 5

    # (id digit, value used for both length and score, id repetition count)
    match_specs = (
        (2, 3, 16),
        (3, 6, 24),
        (4, 1, 8),
        (5, 8, 16),
    )
    for id_digit, score_value, id_width in match_specs:
        with Document() as match:
            match.id = str(id_digit) * id_width
            match.length = score_value
            match.score = NamedScore(value=score_value, ref_id=root.id)
            # Appended while the context is still open; what Document's
            # context exit does is not visible here, so the order is kept.
            root.matches.append(match)

    return root
def create_document_to_score_same_depth_level():
    """Build a document whose four matches are attached directly to it.

    Layout produced by the code::

        doc: 1
        | match: (id: 2, parent_id: 20, score.value: 30, length: 3)
        | match: (id: 3, parent_id: 20, score.value: 40, length: 4)
        | match: (id: 4, parent_id: 30, score.value: 20, length: 2)
        | match: (id: 5, parent_id: 30, score.value: 10, length: 1)

    Every match's ``NamedScore`` uses the root document's id as ``ref_id``.

    :return: the root ``Document``
    """
    root = Document()
    root.id = 1

    # (match id, parent id, score value, length)
    match_specs = (
        (2, 20, 30, 3),
        (3, 20, 40, 4),
        (4, 30, 20, 2),
        (5, 30, 10, 1),
    )
    for spec in match_specs:
        match_id, parent_id, score_value, length = spec

        match = Document()
        match.id = match_id
        match.parent_id = parent_id
        match.length = length
        match.score = NamedScore(value=score_value, ref_id=root.id)
        root.matches.append(match)

    return root