def test_mini_greek_search_corpus_freqs(minipop, mini_greek_metadata):
    """Check a corpus-basis lemmata search on the mini Greek texts.

    Runs a 'phrase'-unit match between the first two texts returned for the
    metadata titles, with ``stopwords=10`` and both the stopword and
    frequency bases set to 'corpus'.  The stored matches are read back,
    ordered by descending score and compared with the reference results
    loaded from 'mini_greek_corpus_results.tab'.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    # see tesserae.utils.search for how to actually set up Search
    search_result = Search(results_id=results_id, status=Search.INIT, msg='')
    minipop.insert(search_result)

    match_params = {
        'stopwords': 10,
        'stopword_basis': 'corpus',
        'score_basis': 'stem',
        'frequency_basis': 'corpus',
        'max_distance': 10,
        'distance_metric': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result.id,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'lemmata',
                               **match_params)

    # Persist the matches and flag the search as finished before reading
    # the results back through get_results.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    v5_results = sorted(get_results(minipop, results_id),
                        key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_corpus_results.tab')
    _check_search_results(v5_results, v3_results)
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata):
    """Check a texts-basis lemmata search on the mini Latin texts.

    Runs a 'line'-unit match between the first two texts returned for the
    metadata titles using an explicit three-word stoplist, the 'texts'
    stopword/frequency bases and the 'frequency' distance metric.  The
    stored matches are read back, ordered by descending score and compared
    with the reference results loaded from 'mini_latin_results.tab'.
    """
    titles = [m['title'] for m in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    match_params = {
        'stopwords': ['et', 'neque', 'qui'],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'frequency_basis': 'texts',
        'max_distance': 10,
        'distance_metric': 'frequency',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result.id,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'lemmata',
                               **match_params)

    # Persist the matches and flag the search as finished before reading
    # the results back through get_results.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    v5_results = sorted(get_results(minipop, results_id),
                        key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_latin_results.tab')
    _check_search_results(v5_results, v3_results)
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata):
    """Check a texts-basis lemmata search on the mini Greek texts.

    Runs a 'phrase'-unit match between the first two texts returned for the
    metadata titles using an explicit Greek stoplist, the 'texts'
    stopword/frequency bases and the 'span' distance metric.  The stored
    matches are read back, ordered by descending score and compared with
    the reference results loaded from 'mini_greek_results.tab'; both result
    counts are printed beforehand as a debugging aid.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    match_params = {
        'stopwords': [
            'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός'
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'frequency_basis': 'texts',
        'max_distance': 10,
        'distance_metric': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result.id,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'lemmata',
                               **match_params)

    # Persist the matches and flag the search as finished before reading
    # the results back through get_results.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    v5_results = sorted(get_results(minipop, results_id),
                        key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
Exemple #4
0
def test_greek_semantic(minipop, mini_greek_metadata):
    """Check a 'semantic' feature search on the mini Greek texts.

    Runs a 'phrase'-unit match between the first two texts returned for the
    metadata titles with an explicit Greek stoplist, 'texts' bases, a
    maximum distance of 999 and the 'span' distance basis.  The stored
    matches are read back with default ``PageOptions()``, ordered by
    descending score and compared with the reference results loaded from
    'mini_greek_results_syn.tab'; both result counts are printed
    beforehand as a debugging aid.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    match_params = {
        'stopwords': [
            'τις', 'οὗτος', 'καί', 'αβγ', 'ἐγώ',
            'τηνόθι', 'τηνικαῦτα', 'τέκνον'
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'semantic',
                               **match_params)

    # Persist the matches and flag the search as finished before reading
    # the results back through get_results.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    v5_results = sorted(get_results(minipop, search_result.id, PageOptions()),
                        key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results_syn.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
Exemple #5
0
def test_lucverg(lucvergpop, lucverg_metadata):
    """Check a texts-basis lemmata search on the ``lucverg`` fixture texts.

    Runs a 'line'-unit match between the first two texts returned for the
    metadata titles with an explicit ten-word Latin stoplist, 'texts' bases
    and the 'frequency' distance basis.  The stored matches are read back
    with default ``PageOptions()``, ordered by descending score and
    compared with the reference results loaded from 'lucverg_time.tab'.
    """
    titles = [m['title'] for m in lucverg_metadata]
    texts = lucvergpop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    lucvergpop.insert(search_result)

    match_params = {
        'stopwords': [
            "et", "qui", "quis", "in", "sum", "hic",
            "non", "tu", "neque", "ego"
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(lucvergpop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'lemmata',
                               **match_params)

    # Persist the matches and flag the search as finished before reading
    # the results back through get_results.
    lucvergpop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    lucvergpop.update(search_result)

    v5_results = sorted(
        get_results(lucvergpop, search_result.id, PageOptions()),
        key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(texts[0].path, 'lucverg_time.tab')
    _check_search_results(v5_results, v3_results)
Exemple #6
0
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata, v3checker):
    """Check a texts-basis lemmata search on the mini Greek texts.

    Runs a 'phrase'-unit match between the first two texts returned for the
    metadata titles with an explicit Greek stoplist, 'texts' bases, the
    'lemmata' score basis and the 'span' distance basis.  After the matches
    are stored, ``v3checker.check_search_results`` is asked to verify them
    against 'mini_greek_results.tab'.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    match_params = {
        'stopwords': [
            'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός'
        ],
        'stopword_basis': 'texts',
        'score_basis': 'lemmata',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'lemmata',
                               **match_params)

    # Persist the matches and flag the search as finished before handing
    # the comparison over to the checker fixture.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(
        minipop, search_result.id, texts[0].path, 'mini_greek_results.tab')
Exemple #7
0
def test_greek_sound(minipop, mini_greek_metadata):
    """Check a 'sound' feature search on the mini Greek texts.

    Runs a 'phrase'-unit match between the first two texts returned for the
    metadata titles with an explicit trigram stoplist, 'texts' bases, the
    '3gr' score basis, a maximum distance of 999 and the 'span' distance
    basis.  The stored matches are read back with default
    ``PageOptions()``, ordered by descending score and compared with the
    reference results loaded from 'mini_greek_results_3gr.tab'.  The
    matched features of both result sets and their sizes are printed first
    as a debugging aid.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    match_params = {
        'stopwords': [
            'και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'
        ],
        'stopword_basis': 'texts',
        'score_basis': '3gr',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'sound',
                               **match_params)

    # Persist the matches and flag the search as finished before reading
    # the results back through get_results.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    v5_results = sorted(get_results(minipop, search_result.id, PageOptions()),
                        key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results_3gr.tab')

    # Dump the reference trigrams first, then the v5 ones.
    for label, rows in (('v3 trigrams:', v3_results),
                        ('v5 trigrams:', v5_results)):
        for row in rows:
            print(label, row['matched_features'])
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    _check_search_results(v5_results, v3_results)
Exemple #8
0
def test_english(engpop, eng_metadata, v3checker):
    """Check a 'form' feature search on the English fixture texts.

    Runs a 'line'-unit match between the first two texts returned for the
    metadata titles with an explicit ten-word English stoplist, 'texts'
    bases, the 'frequency' distance basis and a minimum score of 6.0.
    After the matches are stored, ``v3checker.check_search_results`` is
    asked to verify them against 'eng_time.tab'.
    """
    titles = [m['title'] for m in eng_metadata]
    texts = engpop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    engpop.insert(search_result)

    match_params = {
        'stopwords': [
            "the", "and", "of", "a", "to",
            "in", "that", "with", "i", "by",
        ],
        'stopword_basis': 'texts',
        'score_basis': 'form',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 6.0,
    }
    matcher = SparseMatrixSearch(engpop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'form',
                               **match_params)

    # Persist the matches and flag the search as finished before handing
    # the comparison over to the checker fixture.
    engpop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    engpop.update(search_result)

    v3checker.check_search_results(
        engpop, search_result.id, texts[0].path, 'eng_time.tab')
Exemple #9
0
def test_latin_trigrams(minipop, mini_latin_metadata):
    """Print the stored 'sound' features of the first mini Latin text.

    Collects the ``features`` of every 'line' unit of the first text,
    converts each feature entry to a NumPy array, resolves it through
    ``get_stoplist_tokens`` for 'latin' and prints the outcome next to the
    ``matched_features`` loaded from 'mini_latin_results_3gr.tab'.

    NOTE(review): the test ends with an unconditional ``assert False``,
    presumably so that pytest shows the captured output -- confirm before
    enabling it in an automated run.
    """
    titles = [m['title'] for m in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    unit_features = [unit['features'] for unit in units]
    reference_rows = _load_v3_results(texts[0].path,
                                      'mini_latin_results_3gr.tab')
    v3_results = [row['matched_features'] for row in reference_rows]

    v5_results = []
    print('v5 results:')
    for features in unit_features:
        print(features)
        for feature in features:
            print(feature)
            feature_array = np.asarray(feature)
            print('array', feature_array)
            print(np.shape(feature_array))
            v5_results.append(
                get_stoplist_tokens(minipop, feature_array, 'sound', 'latin'))
    print(v5_results)

    print('v3 results:')
    for matched in v3_results:
        print(matched)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    assert False
Exemple #10
0
def test_greek_to_latin_corpus_basis(g2lpop, mini_g2l_metadata, v3checker):
    """Check a Greek-to-Latin search that uses corpus frequencies.

    Runs a 'line'-unit ``GreekToLatinSearch`` match between the first two
    texts returned for the metadata titles with no Greek stopwords, three
    Latin stopwords, the 'corpus' frequency basis, a maximum distance of
    999 and the 'frequency' distance basis.  After the matches are stored,
    ``v3checker.check_search_results`` is asked to verify them against
    'mini_g2l_corpus.tab'.
    """
    titles = [m['title'] for m in mini_g2l_metadata]
    texts = g2lpop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    g2lpop.insert(search_result)

    match_params = {
        'greek_stopwords': [],
        'latin_stopwords': ['et', 'non', 'iam'],
        'freq_basis': 'corpus',
        'max_distance': 999,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    matcher = GreekToLatinSearch(g2lpop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               **match_params)

    # Persist the matches and flag the search as finished before handing
    # the comparison over to the checker fixture.
    g2lpop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    g2lpop.update(search_result)

    v3checker.check_search_results(
        g2lpop, search_result.id, texts[0].path, 'mini_g2l_corpus.tab')
Exemple #11
0
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata, v3checker):
    """Check a texts-basis lemmata search on the mini Latin texts.

    Runs a 'line'-unit match between the first two texts returned for the
    metadata titles with an explicit three-word stoplist, 'texts' bases,
    the 'lemmata' score basis and the 'frequency' distance basis.  After
    the matches are stored, ``v3checker.check_search_results`` is asked to
    verify them against 'mini_latin_results.tab'.
    """
    titles = [m['title'] for m in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    match_params = {
        'stopwords': ['et', 'neque', 'qui'],
        'stopword_basis': 'texts',
        'score_basis': 'lemmata',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'lemmata',
                               **match_params)

    # Persist the matches and flag the search as finished before handing
    # the comparison over to the checker fixture.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(
        minipop, search_result.id, texts[0].path, 'mini_latin_results.tab')
Exemple #12
0
def test_greek_sound(minipop, mini_greek_metadata, v3checker):
    """Check a 'sound' feature search on the mini Greek texts.

    Runs a 'phrase'-unit match between the first two texts returned for the
    metadata titles with an explicit trigram stoplist, 'texts' bases, the
    'sound' score basis, a maximum distance of 999 and the 'span' distance
    basis.  After the matches are stored,
    ``v3checker.check_search_results`` is asked to verify them against
    'mini_greek_results_3gr.tab'.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    match_params = {
        'stopwords': [
            'και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'
        ],
        'stopword_basis': 'texts',
        'score_basis': 'sound',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'sound',
                               **match_params)

    # Persist the matches and flag the search as finished before handing
    # the comparison over to the checker fixture.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(
        minipop, search_result.id, texts[0].path,
        'mini_greek_results_3gr.tab')
Exemple #13
0
def test_greek_trigrams(minipop, mini_greek_metadata):
    """Print the stored 'sound' features of the first mini Greek text.

    Visualization aid for confirming that trigrams are stored in the
    database correctly.  The ``features`` of every 'line' unit of the first
    text are converted to NumPy arrays, resolved through
    ``get_stoplist_tokens`` for 'greek' and printed next to the
    ``matched_features`` loaded from 'mini_greek_results_3gr.tab'.

    Note that the v5 results do not have stopwords filtered out, while the
    v3 results probably do.

    NOTE(review): the test ends with an unconditional ``assert False``,
    presumably so that pytest shows the captured output -- confirm before
    enabling it in an automated run.
    """
    titles = [m['title'] for m in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    unit_features = [unit['features'] for unit in units]
    reference_rows = _load_v3_results(texts[0].path,
                                      'mini_greek_results_3gr.tab')
    v3_results = [row['matched_features'] for row in reference_rows]

    v5_results = []
    print('v5 results:')
    for features in unit_features:
        print(features)
        for feature in features:
            feature_array = np.asarray(feature)
            v5_results.append(
                get_stoplist_tokens(minipop, feature_array, 'sound', 'greek'))
    print(v5_results)

    print('v3 results:')
    for matched in v3_results:
        print(matched)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    assert False
def test_mini_punctuation(punctpop, mini_punctuation_metadata):
    """Run a corpus-basis lemmata search over the punctuation fixture texts.

    Performs a 'phrase'-unit match between the first two texts returned for
    the metadata titles with ``stopwords=10`` and both the stopword and
    frequency bases set to 'corpus'.

    NOTE(review): in the visible code the return value of ``match`` is not
    captured and nothing is asserted, so this appears to check only that
    the call completes -- confirm.
    """
    titles = [m['title'] for m in mini_punctuation_metadata]
    texts = punctpop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    # see tesserae.utils.search for how to actually set up Search
    search_result = Search(results_id=results_id, status=Search.INIT, msg='')
    punctpop.insert(search_result)

    match_params = {
        'stopwords': 10,
        'stopword_basis': 'corpus',
        'score_basis': 'stem',
        'frequency_basis': 'corpus',
        'max_distance': 10,
        'distance_metric': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(punctpop)
    matcher.match(search_result.id,
                  TextOptions(texts[0], 'phrase'),
                  TextOptions(texts[1], 'phrase'),
                  'lemmata',
                  **match_params)