def test_mini_greek_search_corpus_freqs(minipop, mini_greek_metadata):
    """Phrase-level 'lemmata' search using corpus-based stopwords/frequencies.

    Looks up the texts titled in ``mini_greek_metadata``, matches the first
    two against each other, stores the matches, and compares the results
    (sorted by descending score) with the reference file
    ``mini_greek_corpus_results.tab``.

    NOTE(review): other tests in this file hand ``match`` the Search object
    itself and spell the keywords ``freq_basis`` / ``distance_basis`` --
    confirm which signature ``SparseMatrixSearch.match`` currently accepts.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    # see tesserae.utils.search for how to actually set up Search
    search_result = Search(results_id=results_id, status=Search.INIT, msg='')
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    first_opts = TextOptions(texts[0], 'phrase')
    second_opts = TextOptions(texts[1], 'phrase')
    v5_matches = matcher.match(
        search_result.id,
        first_opts,
        second_opts,
        'lemmata',
        stopwords=10,
        stopword_basis='corpus',
        score_basis='stem',
        frequency_basis='corpus',
        max_distance=10,
        distance_metric='span',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # Mark the search finished before reading its results back.
    search_result.status = Search.DONE
    minipop.update(search_result)

    def negated_score(match):
        return -match['score']

    v5_results = sorted(get_results(minipop, results_id), key=negated_score)
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_corpus_results.tab')
    _check_search_results(v5_results, v3_results)
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata):
    """Line-level 'lemmata' search with text-based stopwords/frequencies.

    Results are sorted by descending score and compared against the
    reference file ``mini_latin_results.tab``.

    NOTE(review): a function with this same name is defined again later in
    this file; if both live in one module, the later definition replaces
    this one and this version never runs.
    """
    titles = [m['title'] for m in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    match_options = dict(
        stopwords=['et', 'neque', 'qui'],
        stopword_basis='texts',
        score_basis='stem',
        frequency_basis='texts',
        max_distance=10,
        distance_metric='frequency',
        min_score=0,
    )
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result.id,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'lemmata',
                               **match_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as complete before fetching what was stored.
    search_result.status = Search.DONE
    minipop.update(search_result)

    fetched = get_results(minipop, results_id)
    v5_results = sorted(fetched, key=lambda match: -match['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_latin_results.tab')
    _check_search_results(v5_results, v3_results)
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata):
    """Phrase-level 'lemmata' search with text-based stopwords/frequencies.

    Prints both result counts, then compares the score-sorted results with
    the reference file ``mini_greek_results.tab``.

    NOTE(review): a function with this same name is defined again later in
    this file; if both live in one module, the later definition replaces
    this one and this version never runs.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    greek_stopwords = [
        'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός'
    ]
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(
        search_result.id,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'lemmata',
        stopwords=greek_stopwords,
        stopword_basis='texts',
        score_basis='stem',
        frequency_basis='texts',
        max_distance=10,
        distance_metric='span',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first (sort key is the negated score).
    v5_results = sorted(get_results(minipop, results_id),
                        key=lambda match: -match['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
def test_greek_semantic(minipop, mini_greek_metadata):
    """Phrase-level 'semantic' search over the mini Greek texts.

    Prints both result counts, then compares the score-sorted results with
    the reference file ``mini_greek_results_syn.tab``.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    stoplist = [
        'τις', 'οὗτος', 'καί', 'αβγ', 'ἐγώ', 'τηνόθι', 'τηνικαῦτα', 'τέκνον'
    ]
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'semantic',
        stopwords=stoplist,
        stopword_basis='texts',
        score_basis='stem',
        freq_basis='texts',
        max_distance=999,
        distance_basis='span',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    # Mark the search finished before reading its results back.
    search_result.status = Search.DONE
    minipop.update(search_result)

    fetched = get_results(minipop, search_result.id, PageOptions())
    v5_results = sorted(fetched, key=lambda match: -match['score'])
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_results_syn.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
def test_lucverg(lucvergpop, lucverg_metadata):
    """Line-level 'lemmata' search over the texts named in lucverg_metadata.

    The score-sorted results are compared with the reference file
    ``lucverg_time.tab``.
    """
    titles = [m['title'] for m in lucverg_metadata]
    texts = lucvergpop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    lucvergpop.insert(search_result)

    match_options = dict(
        stopwords=[
            "et", "qui", "quis", "in", "sum", "hic", "non", "tu", "neque",
            "ego"
        ],
        stopword_basis='texts',
        score_basis='stem',
        freq_basis='texts',
        max_distance=10,
        distance_basis='frequency',
        min_score=0,
    )
    matcher = SparseMatrixSearch(lucvergpop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'lemmata',
                               **match_options)
    lucvergpop.insert_nocheck(v5_matches)

    search_result.status = Search.DONE
    lucvergpop.update(search_result)

    def negated_score(match):
        return -match['score']

    fetched = get_results(lucvergpop, search_result.id, PageOptions())
    v5_results = sorted(fetched, key=negated_score)
    v3_results = _load_v3_results(texts[0].path, 'lucverg_time.tab')
    _check_search_results(v5_results, v3_results)
def test_latin_trigrams(minipop, mini_latin_metadata):
    """Print the stored 'sound' features beside the reference matches.

    Collects the ``features`` of every line unit of the first Latin text,
    converts each one with ``get_stoplist_tokens`` and prints it, then prints
    the ``matched_features`` loaded from ``mini_latin_results_3gr.tab``.

    The test ends in an unconditional ``assert False``, so it always fails.
    NOTE(review): presumably that is deliberate, to make pytest show the
    captured output -- confirm before relying on this test in CI.
    """
    titles = [m['title'] for m in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    raw_v5_results = [unit['features'] for unit in target_units]

    raw_v3_results = _load_v3_results(texts[0].path,
                                      'mini_latin_results_3gr.tab')
    v3_results = [row['matched_features'] for row in raw_v3_results]

    print('v5 results:')
    v5_results = []
    for unit_features in raw_v5_results:
        print(unit_features)
        for feature in unit_features:
            print(feature)
            feature = np.asarray(feature)
            print('array', feature)
            print(np.shape(feature))
            tokens = get_stoplist_tokens(minipop, feature, 'sound', 'latin')
            v5_results.append(tokens)
    print(v5_results)

    print('v3 results:')
    for matched in v3_results:
        print(matched)

    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    assert False
def test_greek_sound(minipop, mini_greek_metadata):
    """Phrase-level 'sound' search over the mini Greek texts.

    Prints the ``matched_features`` of the reference and computed results
    and their counts, then compares the score-sorted results with the
    reference file ``mini_greek_results_3gr.tab``.

    NOTE(review): a function with this same name is defined again later in
    this file; if both live in one module, the later definition replaces
    this one and this version never runs.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    stoplist = ['και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους']
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'sound',
        stopwords=stoplist,
        stopword_basis='texts',
        score_basis='3gr',
        freq_basis='texts',
        max_distance=999,
        distance_basis='span',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    search_result.status = Search.DONE
    minipop.update(search_result)

    fetched = get_results(minipop, search_result.id, PageOptions())
    v5_results = sorted(fetched, key=lambda match: -match['score'])
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_results_3gr.tab')

    for reference in v3_results:
        print('v3 trigrams:', reference['matched_features'])
    for computed in v5_results:
        print('v5 trigrams:', computed['matched_features'])
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))

    _check_search_results(v5_results, v3_results)
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata, v3checker):
    """Phrase-level 'lemmata' search with text-based stopwords/frequencies.

    Stores the matches, marks the search done, and hands verification to
    ``v3checker.check_search_results`` with ``mini_greek_results.tab``.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    match_options = dict(
        stopwords=[
            'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός'
        ],
        stopword_basis='texts',
        score_basis='lemmata',
        freq_basis='texts',
        max_distance=10,
        distance_basis='span',
        min_score=0,
    )
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'lemmata',
                               **match_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as complete before the checker looks it up by id.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(minipop, search_result.id, texts[0].path,
                                   'mini_greek_results.tab')
def test_english(engpop, eng_metadata, v3checker):
    """Line-level 'form' search over the texts named in ``eng_metadata``.

    Uses a minimum score of 6.0 and verifies the stored results through
    ``v3checker.check_search_results`` with ``eng_time.tab``.
    """
    titles = [m['title'] for m in eng_metadata]
    texts = engpop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    engpop.insert(search_result)

    english_stopwords = [
        "the",
        "and",
        "of",
        "a",
        "to",
        "in",
        "that",
        "with",
        "i",
        "by",
    ]
    matcher = SparseMatrixSearch(engpop)
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'form',
        stopwords=english_stopwords,
        stopword_basis='texts',
        score_basis='form',
        freq_basis='texts',
        max_distance=10,
        distance_basis='frequency',
        min_score=6.0,
    )
    engpop.insert_nocheck(v5_matches)

    search_result.status = Search.DONE
    engpop.update(search_result)

    v3checker.check_search_results(engpop, search_result.id, texts[0].path,
                                   'eng_time.tab')
def test_greek_to_latin_corpus_basis(g2lpop, mini_g2l_metadata, v3checker):
    """Line-level ``GreekToLatinSearch`` with corpus-based frequencies.

    Runs with no Greek stopwords and three Latin ones, then verifies the
    stored results through ``v3checker.check_search_results`` with
    ``mini_g2l_corpus.tab``.
    """
    titles = [m['title'] for m in mini_g2l_metadata]
    texts = g2lpop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    g2lpop.insert(search_result)

    match_options = dict(
        greek_stopwords=[],
        latin_stopwords=['et', 'non', 'iam'],
        freq_basis='corpus',
        max_distance=999,
        distance_basis='frequency',
        min_score=0,
    )
    matcher = GreekToLatinSearch(g2lpop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               **match_options)
    g2lpop.insert_nocheck(v5_matches)

    # Flag the search as complete before the checker looks it up by id.
    search_result.status = Search.DONE
    g2lpop.update(search_result)

    v3checker.check_search_results(g2lpop, search_result.id, texts[0].path,
                                   'mini_g2l_corpus.tab')
def test_greek_trigrams(minipop, mini_greek_metadata):
    """Visualization aid: print stored trigram features for inspection.

    Use it to confirm that trigrams are being stored in the database
    correctly. Note that the v5 results printed here do not have stopwords
    filtered out, while the v3 results probably do.

    The test ends in an unconditional ``assert False``, so it always fails.
    NOTE(review): presumably that is deliberate, to make pytest show the
    captured output -- confirm before relying on this test in CI.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    raw_v5_results = [unit['features'] for unit in target_units]

    raw_v3_results = _load_v3_results(texts[0].path,
                                      'mini_greek_results_3gr.tab')
    v3_results = [row['matched_features'] for row in raw_v3_results]

    print('v5 results:')
    v5_results = []
    for unit_features in raw_v5_results:
        print(unit_features)
        for feature in unit_features:
            feature = np.asarray(feature)
            tokens = get_stoplist_tokens(minipop, feature, 'sound', 'greek')
            v5_results.append(tokens)
    print(v5_results)

    print('v3 results:')
    for matched in v3_results:
        print(matched)

    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    assert False
def test_mini_punctuation(punctpop, mini_punctuation_metadata):
    """Run a phrase-level 'lemmata' search over the punctuation texts.

    The return value of ``match`` is discarded and nothing is asserted, so
    the test passes as long as the search raises no exception.

    NOTE(review): other tests in this file hand ``match`` the Search object
    itself and spell the keywords ``freq_basis`` / ``distance_basis`` --
    confirm which signature ``SparseMatrixSearch.match`` currently accepts.
    """
    titles = [m['title'] for m in mini_punctuation_metadata]
    texts = punctpop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    # see tesserae.utils.search for how to actually set up Search
    search_result = Search(results_id=results_id, status=Search.INIT, msg='')
    punctpop.insert(search_result)

    match_options = dict(
        stopwords=10,
        stopword_basis='corpus',
        score_basis='stem',
        frequency_basis='corpus',
        max_distance=10,
        distance_metric='span',
        min_score=0,
    )
    matcher = SparseMatrixSearch(punctpop)
    matcher.match(search_result.id,
                  TextOptions(texts[0], 'phrase'),
                  TextOptions(texts[1], 'phrase'),
                  'lemmata',
                  **match_options)
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata, v3checker):
    """Line-level 'lemmata' search with text-based stopwords/frequencies.

    Stores the matches, marks the search done, and hands verification to
    ``v3checker.check_search_results`` with ``mini_latin_results.tab``.
    """
    titles = [m['title'] for m in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'lemmata',
        stopwords=['et', 'neque', 'qui'],
        stopword_basis='texts',
        score_basis='lemmata',
        freq_basis='texts',
        max_distance=10,
        distance_basis='frequency',
        min_score=0,
    )
    minipop.insert_nocheck(v5_matches)

    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(minipop, search_result.id, texts[0].path,
                                   'mini_latin_results.tab')
def test_greek_sound(minipop, mini_greek_metadata, v3checker):
    """Phrase-level 'sound' search over the mini Greek texts.

    Stores the matches, marks the search done, and hands verification to
    ``v3checker.check_search_results`` with ``mini_greek_results_3gr.tab``.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    match_options = dict(
        stopwords=['και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'],
        stopword_basis='texts',
        score_basis='sound',
        freq_basis='texts',
        max_distance=999,
        distance_basis='span',
        min_score=0,
    )
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'sound',
                               **match_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as complete before the checker looks it up by id.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(minipop, search_result.id, texts[0].path,
                                   'mini_greek_results_3gr.tab')