def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata):
    """Phrase-level lemmata search on the mini Greek texts vs. v3 output.

    Uses an explicit stopword list with text-based stopword and frequency
    bases, stores the matches, marks the search as done, then compares the
    score-sorted v5 results with the v3 results loaded from
    'mini_greek_results.tab'.
    """
    # NOTE(review): this test name is defined more than once in this file;
    # if both definitions share one module, only the last one is collected.
    titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    stoplist = ['ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός']
    v5_matches = matcher.match(
        search_result.id,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'lemmata',
        stopwords=stoplist,
        stopword_basis='texts',
        score_basis='stem',
        frequency_basis='texts',
        max_distance=10,
        distance_metric='span',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # Results are only fetched after the search has been flagged as done.
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = list(get_results(minipop, results_id))
    v5_results.sort(key=lambda hit: -hit['score'])

    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
def test_mini_greek_search_corpus_freqs(minipop, mini_greek_metadata):
    """Phrase-level lemmata search with corpus-based bases vs. v3 output.

    Requests 10 stopwords with corpus-based stopword and frequency bases,
    stores the matches, marks the search as done, then compares the
    score-sorted v5 results with the v3 results loaded from
    'mini_greek_corpus_results.tab'.
    """
    titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(
        results_id=results_id,
        status=Search.INIT,
        msg='',
        # see tesserae.utils.search for how to actually set up Search
    )
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(
        search_result.id,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'lemmata',
        stopwords=10,
        stopword_basis='corpus',
        score_basis='stem',
        frequency_basis='corpus',
        max_distance=10,
        distance_metric='span',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # Results are only fetched after the search has been flagged as done.
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = list(get_results(minipop, results_id))
    v5_results.sort(key=lambda hit: -hit['score'])

    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_corpus_results.tab')
    _check_search_results(v5_results, v3_results)
def test_lucverg(lucvergpop, lucverg_metadata):
    """Line-level lemmata search on the lucverg texts vs. v3 output.

    Uses an explicit Latin stopword list with text-based bases and the
    'frequency' distance basis, stores the matches, marks the search as
    done, then compares the score-sorted v5 results with the v3 results
    loaded from 'lucverg_time.tab'.
    """
    titles = [entry['title'] for entry in lucverg_metadata]
    texts = lucvergpop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    lucvergpop.insert(search_result)

    matcher = SparseMatrixSearch(lucvergpop)
    stoplist = [
        "et", "qui", "quis", "in", "sum", "hic", "non", "tu", "neque", "ego"
    ]
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'lemmata',
        stopwords=stoplist,
        stopword_basis='texts',
        score_basis='stem',
        freq_basis='texts',
        max_distance=10,
        distance_basis='frequency',
        min_score=0,
    )
    lucvergpop.insert_nocheck(v5_matches)

    # Results are only fetched after the search has been flagged as done.
    search_result.status = Search.DONE
    lucvergpop.update(search_result)

    # Highest score first.
    v5_results = list(
        get_results(lucvergpop, search_result.id, PageOptions()))
    v5_results.sort(key=lambda hit: -hit['score'])

    v3_results = _load_v3_results(texts[0].path, 'lucverg_time.tab')
    _check_search_results(v5_results, v3_results)
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata):
    """Line-level lemmata search on the mini Latin texts vs. v3 output.

    Uses a three-word stopword list with text-based bases and the
    'frequency' distance metric, stores the matches, marks the search as
    done, then compares the score-sorted v5 results with the v3 results
    loaded from 'mini_latin_results.tab'.
    """
    # NOTE(review): this test name is defined more than once in this file;
    # if both definitions share one module, only the last one is collected.
    titles = [entry['title'] for entry in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    stoplist = ['et', 'neque', 'qui']
    v5_matches = matcher.match(
        search_result.id,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'lemmata',
        stopwords=stoplist,
        stopword_basis='texts',
        score_basis='stem',
        frequency_basis='texts',
        max_distance=10,
        distance_metric='frequency',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # Results are only fetched after the search has been flagged as done.
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = list(get_results(minipop, results_id))
    v5_results.sort(key=lambda hit: -hit['score'])

    v3_results = _load_v3_results(texts[0].path, 'mini_latin_results.tab')
    _check_search_results(v5_results, v3_results)
def test_greek_sound(minipop, mini_greek_metadata):
    """Phrase-level 'sound' search on the mini Greek texts vs. v3 output.

    Uses an explicit trigram stopword list with text-based bases and a
    '3gr' score basis, stores the matches, marks the search as done, prints
    the matched features of both result sets for inspection, then compares
    the score-sorted v5 results with the v3 results loaded from
    'mini_greek_results_3gr.tab'.
    """
    # NOTE(review): this test name is defined more than once in this file;
    # if both definitions share one module, only the last one is collected.
    titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    stoplist = ['και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους']
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'sound',
        stopwords=stoplist,
        stopword_basis='texts',
        score_basis='3gr',
        freq_basis='texts',
        max_distance=999,
        distance_basis='span',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # Results are only fetched after the search has been flagged as done.
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = list(get_results(minipop, search_result.id, PageOptions()))
    v5_results.sort(key=lambda hit: -hit['score'])

    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_results_3gr.tab')

    # Diagnostic output, visible when the test fails or runs with -s.
    for hit in v3_results:
        print('v3 trigrams:', hit['matched_features'])
    for hit in v5_results:
        print('v5 trigrams:', hit['matched_features'])
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))

    _check_search_results(v5_results, v3_results)
def test_greek_semantic(minipop, mini_greek_metadata):
    """Phrase-level 'semantic' search on the mini Greek texts vs. v3 output.

    Uses an explicit stopword list with text-based bases, stores the
    matches, marks the search as done, then compares the score-sorted v5
    results with the v3 results loaded from 'mini_greek_results_syn.tab'.
    """
    titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    stoplist = [
        'τις', 'οὗτος', 'καί', 'αβγ', 'ἐγώ', 'τηνόθι', 'τηνικαῦτα', 'τέκνον'
    ]
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'semantic',
        stopwords=stoplist,
        stopword_basis='texts',
        score_basis='stem',
        freq_basis='texts',
        max_distance=999,
        distance_basis='span',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # Results are only fetched after the search has been flagged as done.
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = list(get_results(minipop, search_result.id, PageOptions()))
    v5_results.sort(key=lambda hit: -hit['score'])

    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_results_syn.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata, v3checker):
    """Phrase-level lemmata search on the mini Greek texts, checked by v3checker.

    Uses an explicit stopword list with text-based bases, stores the
    matches, marks the search as done, then hands the comparison against
    'mini_greek_results.tab' to ``v3checker.check_search_results``.
    """
    # NOTE(review): an earlier definition with this same name exists in this
    # file; if both share one module, this later one replaces it.
    titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    stoplist = ['ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός']
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'lemmata',
        stopwords=stoplist,
        stopword_basis='texts',
        score_basis='lemmata',
        freq_basis='texts',
        max_distance=10,
        distance_basis='span',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # The checker is only invoked after the search has been flagged as done.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(
        minipop,
        search_result.id,
        texts[0].path,
        'mini_greek_results.tab',
    )
def test_english(engpop, eng_metadata, v3checker):
    """Line-level 'form' search on the English texts, checked by v3checker.

    Uses an explicit English stopword list with text-based bases, the
    'frequency' distance basis and a minimum score of 6.0, stores the
    matches, marks the search as done, then hands the comparison against
    'eng_time.tab' to ``v3checker.check_search_results``.
    """
    titles = [entry['title'] for entry in eng_metadata]
    texts = engpop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    engpop.insert(search_result)

    matcher = SparseMatrixSearch(engpop)
    stoplist = [
        "the",
        "and",
        "of",
        "a",
        "to",
        "in",
        "that",
        "with",
        "i",
        "by",
    ]
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'form',
        stopwords=stoplist,
        stopword_basis='texts',
        score_basis='form',
        freq_basis='texts',
        max_distance=10,
        distance_basis='frequency',
        min_score=6.0,
    )
    engpop.insert_nocheck(v5_matches)

    # The checker is only invoked after the search has been flagged as done.
    search_result.status = Search.DONE
    engpop.update(search_result)

    v3checker.check_search_results(
        engpop,
        search_result.id,
        texts[0].path,
        'eng_time.tab',
    )
def test_match(search_connection, search_tessfiles, correct_results):
    """Check every match found by the engine against the expected results.

    For each entry of ``correct_results`` the source and target files are
    picked out of ``search_tessfiles`` by base file name (extension
    stripped), the engine is run, and each returned match must correspond to
    an expected match with the same source and target locus whose 'shared'
    entry contains every matched token. The elapsed time of each engine call
    is printed.
    """
    engine = SparseMatrixSearch(search_connection)

    def _file_named(stem):
        # First file whose base name without extension equals ``stem``;
        # raises IndexError when no file qualifies.
        return [
            f for f in search_tessfiles
            if os.path.splitext(os.path.basename(f.path))[0] == stem
        ][0]

    for result in correct_results:
        source = _file_named(result['source'])
        target = _file_named(result['target'])

        began = time.time()
        matches, _ = engine.match(
            [source, target],
            result['unit'],
            result['feature'],
            stopwords=10,
            stopword_basis='corpus',
            score_basis='word',
            distance_metric=result['dibasis'],
            max_distance=50,
            min_score=6,
        )
        print(time.time() - began)

        matches = sorted(
            (lookup_entities(search_connection, found) for found in matches),
            key=lambda entity: entity.score,
            reverse=True,
        )

        for predicted in matches:
            src = predicted.units[0].tags[0]
            tar = predicted.units[1].tags[0]

            # First expected match sharing both loci, or None if absent.
            correct = next(
                (expected for expected in result['matches']
                 if expected['source_locus'] == src
                 and expected['target_locus'] == tar),
                None,
            )

            assert correct is not None, "No matching v3 result found."
            assert src == correct['source_locus']
            assert all(
                tok.token in correct['shared'] for tok in predicted.tokens)
def test_greek_multitext_search(minipop):
    """Check multitext results against bigrams of each match's features.

    Runs a line-level lemmata search between the first two Greek texts,
    feeds the matches to ``multitext_search`` over all Greek texts, and
    asserts that every match has a result entry containing exactly as many
    items as there are 2-combinations of its sorted matched features, with
    each such combination present.
    """
    feature = 'lemmata'
    language = 'greek'
    texts = minipop.find(Text.collection, language=language)

    results_id = uuid.uuid4()
    search_result = Search(
        results_id=results_id,
        status=Search.INIT,
        msg='',
        # see tesserae.utils.search for how to actually set up Search
    )
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    stoplist = ['ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός']
    matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'lemmata',
        stopwords=stoplist,
        stopword_basis='corpus',
        score_basis='lemmata',
        freq_basis='corpus',
        max_distance=10,
        distance_basis='span',
        min_score=0,
    )

    results = multitext_search(search_result, minipop, matches, feature,
                               'line', texts)
    assert len(results) == len(matches)

    for found, match in zip(results, matches):
        expected_pairs = list(
            itertools.combinations(sorted(match.matched_features), 2))
        assert len(expected_pairs) == len(found)
        assert all(pair in found for pair in expected_pairs)
def test_mini_punctuation(punctpop, mini_punctuation_metadata):
    """Smoke-test a phrase-level lemmata search on the punctuation texts.

    The return value of ``match`` is discarded and nothing is asserted, so
    the test passes as long as the call does not raise.
    """
    titles = [entry['title'] for entry in mini_punctuation_metadata]
    texts = punctpop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(
        results_id=results_id,
        status=Search.INIT,
        msg='',
        # see tesserae.utils.search for how to actually set up Search
    )
    punctpop.insert(search_result)

    matcher = SparseMatrixSearch(punctpop)
    matcher.match(
        search_result.id,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'lemmata',
        stopwords=10,
        stopword_basis='corpus',
        score_basis='stem',
        frequency_basis='corpus',
        max_distance=10,
        distance_metric='span',
        min_score=0,
    )
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata, v3checker):
    """Line-level lemmata search on the mini Latin texts, checked by v3checker.

    Uses a three-word stopword list with text-based bases and the
    'frequency' distance basis, stores the matches, marks the search as
    done, then hands the comparison against 'mini_latin_results.tab' to
    ``v3checker.check_search_results``.
    """
    # NOTE(review): an earlier definition with this same name exists in this
    # file; if both share one module, this later one replaces it.
    titles = [entry['title'] for entry in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    stoplist = ['et', 'neque', 'qui']
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'lemmata',
        stopwords=stoplist,
        stopword_basis='texts',
        score_basis='lemmata',
        freq_basis='texts',
        max_distance=10,
        distance_basis='frequency',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # The checker is only invoked after the search has been flagged as done.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(
        minipop,
        search_result.id,
        texts[0].path,
        'mini_latin_results.tab',
    )
def test_greek_sound(minipop, mini_greek_metadata, v3checker):
    """Phrase-level 'sound' search on the mini Greek texts, checked by v3checker.

    Uses an explicit trigram stopword list with text-based bases and a
    'sound' score basis, stores the matches, marks the search as done, then
    hands the comparison against 'mini_greek_results_3gr.tab' to
    ``v3checker.check_search_results``.
    """
    # NOTE(review): an earlier definition with this same name exists in this
    # file; if both share one module, this later one replaces it.
    titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    stoplist = ['και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους']
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'sound',
        stopwords=stoplist,
        stopword_basis='texts',
        score_basis='sound',
        freq_basis='texts',
        max_distance=999,
        distance_basis='span',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # The checker is only invoked after the search has been flagged as done.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(
        minipop,
        search_result.id,
        texts[0].path,
        'mini_greek_results_3gr.tab',
    )
def test_get_frequencies(search_connection):
    """Placeholder: only builds a SparseMatrixSearch; nothing is asserted."""
    matcher = SparseMatrixSearch(search_connection)
def test_create_stoplist(search_connection):
    """Placeholder: only builds a SparseMatrixSearch; nothing is asserted."""
    matcher = SparseMatrixSearch(search_connection)
def test_init(search_connection):
    """The engine must keep the very connection object it was built with."""
    matcher = SparseMatrixSearch(search_connection)
    assert matcher.connection is search_connection