def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata):
    """Phrase-level lemmata search over the mini Greek texts (text freqs).

    Runs the search with an explicit stopword list and text-based
    frequencies, stores the matches, then passes the score-sorted results
    and the rows loaded from ``mini_greek_results.tab`` to
    ``_check_search_results``.
    """
    titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': [
            'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός'
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'frequency_basis': 'texts',
        'max_distance': 10,
        'distance_metric': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result.id,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'lemmata',
                               **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done before fetching its results.
    search_result.status = Search.DONE
    minipop.update(search_result)

    def by_descending_score(row):
        return -row['score']

    v5_results = sorted(get_results(minipop, results_id),
                        key=by_descending_score)
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
def test_mini_greek_search_corpus_freqs(minipop, mini_greek_metadata):
    """Phrase-level lemmata search over the mini Greek texts (corpus freqs).

    Requests a 10-item corpus-based stoplist and corpus-based frequencies,
    stores the matches, then passes the score-sorted results and the rows
    loaded from ``mini_greek_corpus_results.tab`` to
    ``_check_search_results``.
    """
    titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    # see tesserae.utils.search for how to actually set up Search
    search_result = Search(results_id=results_id, status=Search.INIT, msg='')
    minipop.insert(search_result)

    search_options = {
        'stopwords': 10,
        'stopword_basis': 'corpus',
        'score_basis': 'stem',
        'frequency_basis': 'corpus',
        'max_distance': 10,
        'distance_metric': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result.id,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'lemmata',
                               **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done before fetching its results.
    search_result.status = Search.DONE
    minipop.update(search_result)

    def by_descending_score(row):
        return -row['score']

    v5_results = sorted(get_results(minipop, results_id),
                        key=by_descending_score)
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_corpus_results.tab')
    _check_search_results(v5_results, v3_results)
Beispiel #3
0
def test_lucverg(lucvergpop, lucverg_metadata):
    """Line-level lemmata search over the texts named in ``lucverg_metadata``.

    Uses an explicit Latin stopword list, text-based frequencies and the
    'frequency' distance basis; the score-sorted results are checked
    against the rows loaded from ``lucverg_time.tab``.
    """
    titles = [entry['title'] for entry in lucverg_metadata]
    texts = lucvergpop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    lucvergpop.insert(search_result)

    search_options = {
        'stopwords': [
            "et", "qui", "quis", "in", "sum", "hic", "non", "tu", "neque",
            "ego"
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(lucvergpop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'lemmata',
                               **search_options)
    lucvergpop.insert_nocheck(v5_matches)

    # Flag the search as done before fetching its results.
    search_result.status = Search.DONE
    lucvergpop.update(search_result)

    def by_descending_score(row):
        return -row['score']

    v5_results = sorted(
        get_results(lucvergpop, search_result.id, PageOptions()),
        key=by_descending_score)
    v3_results = _load_v3_results(texts[0].path, 'lucverg_time.tab')
    _check_search_results(v5_results, v3_results)
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata):
    """Line-level lemmata search over the mini Latin texts (text freqs).

    Uses a three-word stopword list, text-based frequencies and the
    'frequency' distance metric; the score-sorted results are checked
    against the rows loaded from ``mini_latin_results.tab``.
    """
    titles = [entry['title'] for entry in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': ['et', 'neque', 'qui'],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'frequency_basis': 'texts',
        'max_distance': 10,
        'distance_metric': 'frequency',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result.id,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'lemmata',
                               **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done before fetching its results.
    search_result.status = Search.DONE
    minipop.update(search_result)

    def by_descending_score(row):
        return -row['score']

    v5_results = sorted(get_results(minipop, results_id),
                        key=by_descending_score)
    v3_results = _load_v3_results(texts[0].path, 'mini_latin_results.tab')
    _check_search_results(v5_results, v3_results)
Beispiel #5
0
def test_greek_sound(minipop, mini_greek_metadata):
    """Phrase-level 'sound' feature search over the mini Greek texts.

    Scores on the '3gr' basis with text-based frequencies, prints the
    matched features of both result sets for inspection, and checks the
    score-sorted results against ``mini_greek_results_3gr.tab``.
    """
    titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': [
            'και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'
        ],
        'stopword_basis': 'texts',
        'score_basis': '3gr',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'sound',
                               **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done before fetching its results.
    search_result.status = Search.DONE
    minipop.update(search_result)

    def by_descending_score(row):
        return -row['score']

    v5_results = sorted(
        get_results(minipop, search_result.id, PageOptions()),
        key=by_descending_score)
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results_3gr.tab')

    # Dump both feature lists to ease debugging when the comparison fails.
    for label, rows in (('v3 trigrams:', v3_results),
                        ('v5 trigrams:', v5_results)):
        for row in rows:
            print(label, row['matched_features'])
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    _check_search_results(v5_results, v3_results)
Beispiel #6
0
def test_greek_semantic(minipop, mini_greek_metadata):
    """Phrase-level 'semantic' feature search over the mini Greek texts.

    Uses an explicit stopword list and text-based frequencies; the
    score-sorted results are checked against the rows loaded from
    ``mini_greek_results_syn.tab``.
    """
    titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': [
            'τις', 'οὗτος', 'καί', 'αβγ', 'ἐγώ', 'τηνόθι', 'τηνικαῦτα',
            'τέκνον'
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'semantic',
                               **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done before fetching its results.
    search_result.status = Search.DONE
    minipop.update(search_result)

    def by_descending_score(row):
        return -row['score']

    v5_results = sorted(
        get_results(minipop, search_result.id, PageOptions()),
        key=by_descending_score)
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results_syn.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
Beispiel #7
0
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata, v3checker):
    """Phrase-level lemmata search over the mini Greek texts (text freqs).

    Stores the matches, marks the search as done, and delegates the
    comparison with ``mini_greek_results.tab`` to
    ``v3checker.check_search_results``.
    """
    titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': [
            'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός'
        ],
        'stopword_basis': 'texts',
        'score_basis': 'lemmata',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'lemmata',
                               **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done before the checker looks at it.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(
        minipop, search_result.id, texts[0].path, 'mini_greek_results.tab')
Beispiel #8
0
def test_english(engpop, eng_metadata, v3checker):
    """Line-level 'form' feature search over the texts in ``eng_metadata``.

    Uses a ten-word English stopword list, text-based frequencies, the
    'frequency' distance basis and a minimum score of 6.0; the comparison
    with ``eng_time.tab`` is delegated to
    ``v3checker.check_search_results``.
    """
    titles = [entry['title'] for entry in eng_metadata]
    texts = engpop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    engpop.insert(search_result)

    search_options = {
        'stopwords': [
            "the", "and", "of", "a", "to", "in", "that", "with", "i", "by"
        ],
        'stopword_basis': 'texts',
        'score_basis': 'form',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 6.0,
    }
    matcher = SparseMatrixSearch(engpop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'form',
                               **search_options)
    engpop.insert_nocheck(v5_matches)

    # Flag the search as done before the checker looks at it.
    search_result.status = Search.DONE
    engpop.update(search_result)

    v3checker.check_search_results(
        engpop, search_result.id, texts[0].path, 'eng_time.tab')
def test_match(search_connection, search_tessfiles, correct_results):
    """Compare ``SparseMatrixSearch.match`` output with stored results.

    For every entry of ``correct_results`` the source and target texts are
    picked from ``search_tessfiles`` by file stem, a search with a 10-item
    corpus stoplist is run and timed, and each returned match must line up
    with a stored match that has the same source/target loci and contains
    every matched token among its shared words.
    """
    engine = SparseMatrixSearch(search_connection)

    def text_for(entry, role):
        # First text whose file name (extension stripped) equals
        # ``entry[role]``; raises IndexError when no text qualifies.
        return [
            t for t in search_tessfiles
            if os.path.splitext(os.path.basename(t.path))[0] == entry[role]
        ][0]

    for result in correct_results:
        source = text_for(result, 'source')
        target = text_for(result, 'target')

        started = time.time()
        matches, _ms = engine.match(
            [source, target],
            result['unit'],
            result['feature'],
            stopwords=10,
            stopword_basis='corpus',
            score_basis='word',
            distance_metric=result['dibasis'],
            max_distance=50,
            min_score=6,
        )
        print(time.time() - started)

        # Resolve the entities of each match, then order best score first.
        matches = sorted(
            (lookup_entities(search_connection, m) for m in matches),
            key=lambda found: found.score,
            reverse=True)

        for predicted in matches:
            src = predicted.units[0].tags[0]
            tar = predicted.units[1].tags[0]

            # Stored match with identical loci, or None when absent.
            correct = next(
                (m for m in result['matches']
                 if m['source_locus'] == src and m['target_locus'] == tar),
                None)
            assert correct is not None, "No matching v3 result found."
            assert src == correct['source_locus']

            assert all(
                tok.token in correct['shared'] for tok in predicted.tokens)
Beispiel #10
0
def test_greek_multitext_search(minipop):
    """Multitext search must cover every bigram of each match's features.

    Runs a line-level lemmata search over the first two Greek texts, feeds
    the matches to ``multitext_search``, and verifies that each result
    holds exactly the pairwise combinations of the corresponding match's
    sorted ``matched_features``.
    """
    feature = 'lemmata'
    language = 'greek'
    texts = minipop.find(Text.collection, language=language)

    results_id = uuid.uuid4()
    # see tesserae.utils.search for how to actually set up Search
    search_result = Search(results_id=results_id, status=Search.INIT, msg='')
    minipop.insert(search_result)

    search_options = {
        'stopwords': [
            'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός'
        ],
        'stopword_basis': 'corpus',
        'score_basis': 'lemmata',
        'freq_basis': 'corpus',
        'max_distance': 10,
        'distance_basis': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    matches = matcher.match(search_result,
                            TextOptions(texts[0], 'line'),
                            TextOptions(texts[1], 'line'),
                            'lemmata',
                            **search_options)

    results = multitext_search(search_result, minipop, matches, feature,
                               'line', texts)
    assert len(results) == len(matches)
    for found, match in zip(results, matches):
        expected_bigrams = list(
            itertools.combinations(sorted(match.matched_features), 2))
        assert len(expected_bigrams) == len(found)
        assert all(bigram in found for bigram in expected_bigrams)
def test_mini_punctuation(punctpop, mini_punctuation_metadata):
    """Smoke test: a search over the punctuation texts must not raise.

    The return value of ``match`` is discarded and nothing is asserted;
    the test only passes if the call completes.
    """
    titles = [entry['title'] for entry in mini_punctuation_metadata]
    texts = punctpop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    # see tesserae.utils.search for how to actually set up Search
    search_result = Search(results_id=results_id, status=Search.INIT, msg='')
    punctpop.insert(search_result)

    search_options = {
        'stopwords': 10,
        'stopword_basis': 'corpus',
        'score_basis': 'stem',
        'frequency_basis': 'corpus',
        'max_distance': 10,
        'distance_metric': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(punctpop)
    matcher.match(search_result.id,
                  TextOptions(texts[0], 'phrase'),
                  TextOptions(texts[1], 'phrase'),
                  'lemmata',
                  **search_options)
Beispiel #12
0
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata, v3checker):
    """Line-level lemmata search over the mini Latin texts (text freqs).

    Stores the matches, marks the search as done, and delegates the
    comparison with ``mini_latin_results.tab`` to
    ``v3checker.check_search_results``.
    """
    titles = [entry['title'] for entry in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': ['et', 'neque', 'qui'],
        'stopword_basis': 'texts',
        'score_basis': 'lemmata',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'lemmata',
                               **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done before the checker looks at it.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(
        minipop, search_result.id, texts[0].path, 'mini_latin_results.tab')
Beispiel #13
0
def test_greek_sound(minipop, mini_greek_metadata, v3checker):
    """Phrase-level 'sound' feature search over the mini Greek texts.

    Scores on the 'sound' basis with text-based frequencies, stores the
    matches, and delegates the comparison with
    ``mini_greek_results_3gr.tab`` to ``v3checker.check_search_results``.
    """
    titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': [
            'και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'
        ],
        'stopword_basis': 'texts',
        'score_basis': 'sound',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'sound',
                               **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done before the checker looks at it.
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(
        minipop, search_result.id, texts[0].path,
        'mini_greek_results_3gr.tab')
def test_get_frequencies(search_connection):
    """Placeholder: only builds a ``SparseMatrixSearch``; asserts nothing."""
    searcher = SparseMatrixSearch(search_connection)
def test_create_stoplist(search_connection):
    """Placeholder: only builds a ``SparseMatrixSearch``; asserts nothing."""
    searcher = SparseMatrixSearch(search_connection)
def test_init(search_connection):
    """A new ``SparseMatrixSearch`` exposes the connection it was given."""
    searcher = SparseMatrixSearch(search_connection)
    stored_connection = searcher.connection
    assert stored_connection is search_connection