def populate_database(search_connection, test_data):
    """Set up the database to conduct searches on the test texts.

    Fixtures
    --------
    search_connection
        TessMongoConnection for search unit tests.
    test_data
        Example data for unit testing.
    """
    for text in test_data['texts']:
        tessfile = TessFile(text['path'], metadata=Text(**text))
        search_connection.insert(tessfile.metadata)
        # Pick a tokenizer matching the text's language; without an else
        # branch, `tok` would be undefined for non-Latin texts.
        if text['language'] == 'latin':
            tok = LatinTokenizer(search_connection)
        else:
            tok = GreekTokenizer(search_connection)
        unitizer = Unitizer()
        tokens, tags, features = tok.tokenize(tessfile.read(),
                                              text=tessfile.metadata)
        search_connection.update(features)
        lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
        search_connection.insert(lines + phrases)
        search_connection.insert(tokens)

    yield

    search_connection.connection['texts'].delete_many({})
    search_connection.connection['tokens'].delete_many({})
    search_connection.connection['features'].delete_many({})
    search_connection.connection['units'].delete_many({})
    search_connection.connection['matches'].delete_many({})
    search_connection.connection['searches'].delete_many({})
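The docstring above describes populate_database as a setup/teardown fixture for search unit tests. Below is a minimal sketch of how such a fixture might be registered and consumed with pytest; the @pytest.fixture wrapper, the fixture name populated_db, and the test body are illustrative assumptions, not part of the original suite.

import pytest

# Illustrative only: wrap the generator above as a named pytest fixture.
@pytest.fixture(name='populated_db')
def _populated_db(search_connection, test_data):
    yield from populate_database(search_connection, test_data)


def test_search_sees_inserted_texts(search_connection, populated_db):
    # The same collections cleaned up in the fixture's teardown should now
    # contain the inserted test texts.
    assert search_connection.connection['texts'].count_documents({}) > 0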
Example No. 2
def test_unitize_elision_file(unit_connection, tessfiles_greek_path):
    tokenizer = GreekTokenizer(unit_connection)
    t = Text(path=str(tessfiles_greek_path.joinpath('test.elision.tess')),
             language='greek')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
Example No. 3
def test_unitize_notag_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.notag.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
Example No. 4
def test_unitize_linebreak_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.linebreak.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
    first_tag = phrases[0].tags[0]
    for phrase in phrases[1:]:
        assert phrase.tags[0] == first_tag
Example No. 5
def unitizer_inputs(unit_tessfiles, unit_connection):
    inputs = []
    tokenizer_selector = {
        'latin': LatinTokenizer(unit_connection),
        'greek': GreekTokenizer(unit_connection)
    }
    for t in unit_tessfiles:
        tessfile = TessFile(t.path, metadata=t)
        tokens, tags, features = tokenizer_selector[t.language].tokenize(
            tessfile.read(), text=t)
        features.sort(key=lambda x: x.index)
        inputs.append((tokens, tags, features))
    yield inputs
Example No. 6
    def test_normalize(self, latin_files, latin_tokens):
        la = self.__test_class__()

        for i in range(len(latin_files)):
            fname = latin_files[i]
            ref_tokens = [t for t in latin_tokens[i] if 'FORM' in t]

            t = TessFile(fname)

            tokens = la.normalize(t.read())

            correct = map(
                lambda x:
                ('FORM' in x[1] and x[0] == x[1]['FORM']) or x[0] == '',
                zip(tokens, ref_tokens))

            assert all(correct)
Example No. 7
def test_unitize_diacrit_in_latin(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.diacrit_in_latin.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    forms = {f.index: f.token for f in features if f.feature == 'form'}
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    for phrase in phrases:
        for t in phrase.tokens:
            cur_form = t['features']['form'][0]
            if cur_form != -1:
                normalized = tokenizer.normalize(t['display'])[0][0]
                assert normalized == forms[cur_form], phrase.snippet
Example No. 8
def test_unitize_linebreak_end(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.linebreak_end.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    print('# lines')
    for line in lines:
        print(line.snippet)
    print('# phrases')
    for phrase in phrases:
        print(phrase.snippet)
    assert len(lines) == 2
Example No. 9
def test_unitize_nopunctuation_file(unit_connection, tessfiles_latin_path):
    # Covers the case where a poem ends without closing punctuation and the
    # next poem starts after a blank line; a sketch of such input follows
    # this test.
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.nopunctuation.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 68
    for prev_phrase, cur_phrase in zip(phrases[:-1], phrases[1:]):
        if '2.13' in prev_phrase.tags[0] and '2.14' in cur_phrase.tags[0]:
            assert prev_phrase.snippet == 'quin et Prometheus et Pelopis parens / dulci laborem decipitur sono / nec curat Orion leones / aut timidos agitare lyncas / Eheu fugaces, Postume, Postume, / labuntur anni nec pietas moram / rugis et instanti senectae / adferet indomitaeque morti, / non, si trecenis quotquot eunt dies, / amice, places inlacrimabilem / Plutona tauris, qui ter amplum / Geryonen Tityonque tristi / conpescit unda, scilicet omnibus / quicumque terrae munere vescimur / enaviganda, sive reges / sive inopes erimus coloni. / '
            assert cur_phrase.snippet == 'frustra cruento Marte carebimus / fractisque rauci fluctibus Hadriae, / frustra per autumnos nocentem / corporibus metuemus Austrum: / '
            break
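For reference, the comment at the top of this test describes the awkward input shape: a poem ending without closing punctuation, then a blank line, then a new poem. In the .tess format each verse line is prefixed with a locus tag in angle brackets, so the transition in test.nopunctuation.tess presumably looks roughly like the sketch below; the tag strings and line numbers are illustrative guesses, and only the verse text is taken from the assertions above.

<hor. carm. 2.13.39> nec curat Orion leones
<hor. carm. 2.13.40> aut timidos agitare lyncas

<hor. carm. 2.14.1> Eheu fugaces, Postume, Postume,
<hor. carm. 2.14.2> labuntur anni nec pietas moram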
Example No. 10
    def test_tokenize(self, greek_files, greek_tokens, greek_word_frequencies):
        grc = self.__test_class__()

        for k in range(len(greek_files)):
            fname = greek_files[k]
            ref_tokens = [t for t in greek_tokens[k] if 'FORM' in t]
            ref_freqs = greek_word_frequencies[k]

            t = TessFile(fname)

            tokens, frequencies = grc.tokenize(t.read())
            tokens = [
                t for t in tokens
                if re.search(r'[\w]', t.display, flags=re.UNICODE)
            ]

            # Materialize as a list so the diagnostic check below does not
            # exhaust the iterator before the assertion re-checks it.
            correct = list(map(lambda x: x[0].display == x[1]['DISPLAY'],
                               zip(tokens, ref_tokens)))

            if not all(correct):
                print(fname)
                for j in range(len(tokens)):
                    if tokens[j].display != ref_tokens[j]['DISPLAY']:
                        print(ref_tokens[j])
                        print('{}->{}'.format(tokens[j].display,
                                              ref_tokens[j]['DISPLAY']))
                        print('{}->{}'.format(tokens[j].form,
                                              ref_tokens[j]['FORM']))

            assert all(correct)

            # Again a list, so the check below does not consume it.
            correct = list(map(lambda x: x[0].form == x[1]['FORM'],
                               zip(tokens, ref_tokens)))

            if not all(correct):
                print(fname)
                for j in range(len(tokens)):
                    if tokens[j].form != ref_tokens[j]['FORM']:
                        print(ref_tokens[j])
                        print('{}->{}'.format(tokens[j].form,
                                              ref_tokens[j]['FORM']))

            assert all(correct)
Example No. 11
    def test_tokenize(self, latin_files, latin_tokens, latin_word_frequencies):
        la = self.__test_class__()

        for k in range(len(latin_files)):
            fname = latin_files[k]
            ref_tokens = [t for t in latin_tokens[k] if 'FORM' in t]
            ref_freqs = latin_word_frequencies[k]

            t = TessFile(fname)

            tokens, frequencies = la.tokenize(t.read(), text=t.metadata)
            tokens = [
                t for t in tokens
                if re.search(r'^[a-zA-Z]+$', t.display, flags=re.UNICODE)
            ]

            # Use a list so the print diagnostics below do not exhaust the
            # iterator before the assertion.
            correct = list(map(lambda x: x[0].display == x[1]['DISPLAY'],
                               zip(tokens, ref_tokens)))

            if not all(correct):
                print(fname)
                for j in range(len(tokens)):
                    if tokens[j].display != ref_tokens[j]['DISPLAY']:
                        print('{}->{}'.format(tokens[j].display,
                                              ref_tokens[j]['DISPLAY']))

            assert all(correct)

            correct = list(map(
                lambda x: ('FORM' in x[1] and x[0].form == x[1]['FORM']) or
                not x[0].form, zip(tokens, ref_tokens)))

            if not all(correct):
                print(fname)
                # for j in range(len(tokens)):
                #     if tokens[j].form != ref_tokens[j]['FORM']:
                #         print('{}->{}'.format(tokens[j].form, ref_tokens[j]['FORM']))

            assert all(correct)

            for key in ref_freqs:
                assert key in la.frequencies
                assert la.frequencies[key] == ref_freqs[key]

            diff = []
            for word in frequencies:
                if word.form not in ref_freqs and re.search(
                        r'[a-zA-Z]', word.form, flags=re.UNICODE):
                    diff.append(word.form)
            print(diff)
            assert len(diff) == 0

            keys = sorted(list(ref_freqs.keys()))
            frequencies.sort(key=lambda x: x.form)
            correct = map(
                lambda x: x[0].form == x[1] and x[0].frequency == ref_freqs[x[1]],
                zip(frequencies, keys))

            assert all(correct)

            la.clear()