def test_mini_greek_search_corpus_freqs(minipop, mini_greek_metadata):
    """Compare a corpus-basis lemmata search on the mini Greek texts with v3.

    The first two texts found for the titles in ``mini_greek_metadata`` are
    matched at the ``'phrase'`` unit with ``stopwords=10`` and both the
    stopword and frequency bases set to ``'corpus'``.  The stored matches are
    read back, ordered by descending score, and handed to
    ``_check_search_results`` together with the rows loaded from
    ``mini_greek_corpus_results.tab``.
    """
    wanted_titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)

    results_id = uuid.uuid4()
    # see tesserae.utils.search for how to actually set up Search
    search_result = Search(results_id=results_id, status=Search.INIT, msg='')
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    search_id = search_result.id
    source_unit = TextOptions(texts[0], 'phrase')
    target_unit = TextOptions(texts[1], 'phrase')
    match_settings = {
        'stopwords': 10,
        'stopword_basis': 'corpus',
        'score_basis': 'stem',
        'frequency_basis': 'corpus',
        'max_distance': 10,
        'distance_metric': 'span',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_id, source_unit, target_unit, 'lemmata',
                               **match_settings)
    minipop.insert_nocheck(v5_matches)

    # Mark the search as finished before reading its results back.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v5_results = list(get_results(minipop, results_id))
    v5_results.sort(key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_corpus_results.tab')
    _check_search_results(v5_results, v3_results)
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata):
    """Compare a texts-basis lemmata search on the mini Latin texts with v3.

    The first two texts found for the titles in ``mini_latin_metadata`` are
    matched at the ``'line'`` unit with an explicit stopword list and the
    ``'frequency'`` distance metric.  The score-sorted stored results are
    passed to ``_check_search_results`` alongside the rows loaded from
    ``mini_latin_results.tab``.
    """
    # NOTE(review): another test with this exact name appears later in this
    # view; if both live in one module the later definition replaces this
    # one and this body is never collected -- confirm.
    wanted_titles = [meta['title'] for meta in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    search_id = search_result.id
    source_unit = TextOptions(texts[0], 'line')
    target_unit = TextOptions(texts[1], 'line')
    stop_list = ['et', 'neque', 'qui']
    v5_matches = matcher.match(
        search_id,
        source_unit,
        target_unit,
        'lemmata',
        stopwords=stop_list,
        stopword_basis='texts',
        score_basis='stem',
        frequency_basis='texts',
        max_distance=10,
        distance_metric='frequency',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # Mark the search as finished before reading its results back.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v5_results = list(get_results(minipop, results_id))
    v5_results.sort(key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_latin_results.tab')
    _check_search_results(v5_results, v3_results)
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata):
    """Compare a texts-basis lemmata search on the mini Greek texts with v3.

    The first two texts found for the titles in ``mini_greek_metadata`` are
    matched at the ``'phrase'`` unit with an explicit stopword list and the
    ``'span'`` distance metric.  Both result counts are printed, then the
    score-sorted stored results are passed to ``_check_search_results``
    alongside the rows loaded from ``mini_greek_results.tab``.
    """
    # NOTE(review): another test with this exact name appears later in this
    # view; if both live in one module the later definition replaces this
    # one and this body is never collected -- confirm.
    wanted_titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    search_id = search_result.id
    source_unit = TextOptions(texts[0], 'phrase')
    target_unit = TextOptions(texts[1], 'phrase')
    stop_list = ['ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός']
    v5_matches = matcher.match(
        search_id,
        source_unit,
        target_unit,
        'lemmata',
        stopwords=stop_list,
        stopword_basis='texts',
        score_basis='stem',
        frequency_basis='texts',
        max_distance=10,
        distance_metric='span',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # Mark the search as finished before reading its results back.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v5_results = list(get_results(minipop, results_id))
    v5_results.sort(key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results.tab')
    # Diagnostic output: number of v5 results, then number of v3 results.
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
def test_greek_semantic(minipop, mini_greek_metadata):
    """Compare a ``'semantic'`` feature search on the mini Greek texts with v3.

    The first two texts found for the titles in ``mini_greek_metadata`` are
    matched at the ``'phrase'`` unit with an explicit stopword list,
    ``max_distance=999`` and the ``'span'`` distance basis.  Both result
    counts are printed, then the score-sorted stored results are passed to
    ``_check_search_results`` alongside the rows loaded from
    ``mini_greek_results_syn.tab``.
    """
    wanted_titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    source_unit = TextOptions(texts[0], 'phrase')
    target_unit = TextOptions(texts[1], 'phrase')
    match_settings = {
        'stopwords': [
            'τις', 'οὗτος', 'καί', 'αβγ', 'ἐγώ', 'τηνόθι', 'τηνικαῦτα',
            'τέκνον',
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result, source_unit, target_unit,
                               'semantic', **match_settings)
    minipop.insert_nocheck(v5_matches)

    # Mark the search as finished before reading its results back.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v5_results = list(get_results(minipop, search_result.id, PageOptions()))
    v5_results.sort(key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_results_syn.tab')
    # Diagnostic output: number of v5 results, then number of v3 results.
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
def test_lucverg(lucvergpop, lucverg_metadata):
    """Compare a lemmata search over the ``lucverg`` texts with v3 output.

    The first two texts found for the titles in ``lucverg_metadata`` are
    matched at the ``'line'`` unit with a ten-entry stopword list and the
    ``'frequency'`` distance basis.  The score-sorted stored results are
    passed to ``_check_search_results`` alongside the rows loaded from
    ``lucverg_time.tab``.
    """
    wanted_titles = [meta['title'] for meta in lucverg_metadata]
    texts = lucvergpop.find(Text.collection, title=wanted_titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    lucvergpop.insert(search_result)

    matcher = SparseMatrixSearch(lucvergpop)
    source_unit = TextOptions(texts[0], 'line')
    target_unit = TextOptions(texts[1], 'line')
    stop_list = [
        "et", "qui", "quis", "in", "sum",
        "hic", "non", "tu", "neque", "ego",
    ]
    v5_matches = matcher.match(
        search_result,
        source_unit,
        target_unit,
        'lemmata',
        stopwords=stop_list,
        stopword_basis='texts',
        score_basis='stem',
        freq_basis='texts',
        max_distance=10,
        distance_basis='frequency',
        min_score=0,
    )
    lucvergpop.insert_nocheck(v5_matches)

    # Mark the search as finished before reading its results back.
    search_result.status = Search.DONE
    lucvergpop.update(search_result)

    v5_results = list(
        get_results(lucvergpop, search_result.id, PageOptions()))
    v5_results.sort(key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(texts[0].path, 'lucverg_time.tab')
    _check_search_results(v5_results, v3_results)
def test_greek_sound(minipop, mini_greek_metadata):
    """Compare a ``'sound'`` feature search on the mini Greek texts with v3.

    The first two texts found for the titles in ``mini_greek_metadata`` are
    matched at the ``'phrase'`` unit with ``score_basis='3gr'``,
    ``max_distance=999`` and the ``'span'`` distance basis.  The matched
    features of every v3 and v5 result and both result counts are printed,
    then the score-sorted stored results are passed to
    ``_check_search_results`` alongside the rows loaded from
    ``mini_greek_results_3gr.tab``.
    """
    # NOTE(review): another test with this exact name appears later in this
    # view; if both live in one module the later definition replaces this
    # one and this body is never collected -- confirm.
    wanted_titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    source_unit = TextOptions(texts[0], 'phrase')
    target_unit = TextOptions(texts[1], 'phrase')
    match_settings = {
        'stopwords': ['και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'],
        'stopword_basis': 'texts',
        'score_basis': '3gr',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result, source_unit, target_unit,
                               'sound', **match_settings)
    minipop.insert_nocheck(v5_matches)

    # Mark the search as finished before reading its results back.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v5_results = list(get_results(minipop, search_result.id, PageOptions()))
    v5_results.sort(key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_results_3gr.tab')

    # Diagnostic output: v3 features first, then v5 features, then counts.
    for v3_row in v3_results:
        print('v3 trigrams:', v3_row['matched_features'])
    for v5_row in v5_results:
        print('v5 trigrams:', v5_row['matched_features'])
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))

    _check_search_results(v5_results, v3_results)
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata,
                                      v3checker):
    """Check a texts-basis lemmata search on the mini Greek texts.

    The first two texts found for the titles in ``mini_greek_metadata`` are
    matched at the ``'phrase'`` unit with an explicit stopword list,
    ``score_basis='lemmata'`` and the ``'span'`` distance basis.  After the
    matches are stored and the search is marked done, verification is
    delegated to ``v3checker.check_search_results`` with
    ``mini_greek_results.tab``.
    """
    wanted_titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    source_unit = TextOptions(texts[0], 'phrase')
    target_unit = TextOptions(texts[1], 'phrase')
    stop_list = ['ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός']
    v5_matches = matcher.match(
        search_result,
        source_unit,
        target_unit,
        'lemmata',
        stopwords=stop_list,
        stopword_basis='texts',
        score_basis='lemmata',
        freq_basis='texts',
        max_distance=10,
        distance_basis='span',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # Mark the search as finished before the checker reads its results.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(
        minipop, search_result.id, texts[0].path, 'mini_greek_results.tab')
def test_english(engpop, eng_metadata, v3checker):
    """Check a ``'form'`` feature search over the English texts.

    The first two texts found for the titles in ``eng_metadata`` are matched
    at the ``'line'`` unit with a ten-entry stopword list, the
    ``'frequency'`` distance basis and ``min_score=6.0``.  After the matches
    are stored and the search is marked done, verification is delegated to
    ``v3checker.check_search_results`` with ``eng_time.tab``.
    """
    wanted_titles = [meta['title'] for meta in eng_metadata]
    texts = engpop.find(Text.collection, title=wanted_titles)

    search_result = Search(results_id=uuid.uuid4())
    engpop.insert(search_result)

    matcher = SparseMatrixSearch(engpop)
    source_unit = TextOptions(texts[0], 'line')
    target_unit = TextOptions(texts[1], 'line')
    match_settings = {
        'stopwords': [
            "the", "and", "of", "a", "to",
            "in", "that", "with", "i", "by",
        ],
        'stopword_basis': 'texts',
        'score_basis': 'form',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 6.0,
    }
    v5_matches = matcher.match(search_result, source_unit, target_unit,
                               'form', **match_settings)
    engpop.insert_nocheck(v5_matches)

    # Mark the search as finished before the checker reads its results.
    search_result.status = Search.DONE
    engpop.update(search_result)

    v3checker.check_search_results(
        engpop, search_result.id, texts[0].path, 'eng_time.tab')
def test_greek_to_latin_corpus_basis(g2lpop, mini_g2l_metadata, v3checker):
    """Check a ``GreekToLatinSearch`` run that uses the corpus frequency basis.

    The first two texts found for the titles in ``mini_g2l_metadata`` are
    matched at the ``'line'`` unit with no Greek stopwords, three Latin
    stopwords, ``max_distance=999`` and the ``'frequency'`` distance basis.
    After the matches are stored and the search is marked done, verification
    is delegated to ``v3checker.check_search_results`` with
    ``mini_g2l_corpus.tab``.
    """
    wanted_titles = [meta['title'] for meta in mini_g2l_metadata]
    texts = g2lpop.find(Text.collection, title=wanted_titles)

    search_result = Search(results_id=uuid.uuid4())
    g2lpop.insert(search_result)

    matcher = GreekToLatinSearch(g2lpop)
    source_unit = TextOptions(texts[0], 'line')
    target_unit = TextOptions(texts[1], 'line')
    match_settings = {
        'greek_stopwords': [],
        'latin_stopwords': ['et', 'non', 'iam'],
        'freq_basis': 'corpus',
        'max_distance': 999,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result, source_unit, target_unit,
                               **match_settings)
    g2lpop.insert_nocheck(v5_matches)

    # Mark the search as finished before the checker reads its results.
    search_result.status = Search.DONE
    g2lpop.update(search_result)

    v3checker.check_search_results(
        g2lpop, search_result.id, texts[0].path, 'mini_g2l_corpus.tab')
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata,
                                      v3checker):
    """Check a texts-basis lemmata search on the mini Latin texts.

    The first two texts found for the titles in ``mini_latin_metadata`` are
    matched at the ``'line'`` unit with an explicit stopword list,
    ``score_basis='lemmata'`` and the ``'frequency'`` distance basis.  After
    the matches are stored and the search is marked done, verification is
    delegated to ``v3checker.check_search_results`` with
    ``mini_latin_results.tab``.
    """
    wanted_titles = [meta['title'] for meta in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    source_unit = TextOptions(texts[0], 'line')
    target_unit = TextOptions(texts[1], 'line')
    stop_list = ['et', 'neque', 'qui']
    v5_matches = matcher.match(
        search_result,
        source_unit,
        target_unit,
        'lemmata',
        stopwords=stop_list,
        stopword_basis='texts',
        score_basis='lemmata',
        freq_basis='texts',
        max_distance=10,
        distance_basis='frequency',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # Mark the search as finished before the checker reads its results.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(
        minipop, search_result.id, texts[0].path, 'mini_latin_results.tab')
def test_greek_sound(minipop, mini_greek_metadata, v3checker):
    """Check a ``'sound'`` feature search on the mini Greek texts.

    The first two texts found for the titles in ``mini_greek_metadata`` are
    matched at the ``'phrase'`` unit with an explicit stopword list,
    ``score_basis='sound'``, ``max_distance=999`` and the ``'span'`` distance
    basis.  After the matches are stored and the search is marked done,
    verification is delegated to ``v3checker.check_search_results`` with
    ``mini_greek_results_3gr.tab``.
    """
    wanted_titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    source_unit = TextOptions(texts[0], 'phrase')
    target_unit = TextOptions(texts[1], 'phrase')
    match_settings = {
        'stopwords': ['και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'],
        'stopword_basis': 'texts',
        'score_basis': 'sound',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result, source_unit, target_unit,
                               'sound', **match_settings)
    minipop.insert_nocheck(v5_matches)

    # Mark the search as finished before the checker reads its results.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(
        minipop, search_result.id, texts[0].path,
        'mini_greek_results_3gr.tab')