def test_unitize_elision_file(unit_connection, tessfiles_greek_path):
    tokenizer = GreekTokenizer(unit_connection)
    t = Text(path=str(tessfiles_greek_path.joinpath('test.elision.tess')),
             language='greek')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
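Stripped of the pytest fixtures, the same flow can be run directly. The sketch below is a hedged illustration only: `connection` stands in for the `unit_connection` database fixture and the .tess path is a placeholder.

# Hedged sketch: `connection` and the file path are placeholders for the
# pytest fixtures used above, not part of the test suite itself.
text = Text(path='path/to/text.tess', language='greek')
tessfile = TessFile(text.path, metadata=text)

tokenizer = GreekTokenizer(connection)
tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=text)

# The Unitizer splits the token stream into line and phrase units.
lines, phrases = Unitizer().unitize(tokens, tags, tokens[0].text)
print(len(lines), len(phrases))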
Example #2
def test_normalize(token_connection, greek_tessfiles, greek_tokens):
    grc = GreekTokenizer(token_connection)

    for i, tessfile in enumerate(greek_tessfiles):
        correct_tokens = [t for t in greek_tokens[i] if t['form']]
        tokens, tags = grc.normalize(tessfile.read())
        tokens = [t for t in tokens if re.search(r'[\w]+', t)]
        correct = map(lambda x: x[0] == x[1]['form'],
                      zip(tokens, correct_tokens))
        for j, c in enumerate(correct):
            if not c:
                print(j, tokens[j], correct_tokens[j])
                break
        assert all(correct)

        for j, line in enumerate(tessfile.readlines()):
            correct_tag = line[:line.find('>') + 1]
            assert tags[j] == correct_tag
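As a reference point for the comparison above, `normalize` returns two parallel sequences: the normalized token forms and the per-line locus tags (the leading `<...>` markers). A hedged sketch, with `connection` standing in for the `token_connection` fixture:

grc = GreekTokenizer(connection)
tokens, tags = grc.normalize(tessfile.read())
# Keep word-like tokens only, mirroring the re.search filter in the test.
words = [t for t in tokens if re.search(r'[\w]+', t)]
print(words[:5])   # normalized forms
print(tags[0])     # the '<...>' tag that opens the first line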
Example #3
def unitizer_inputs(unit_tessfiles, unit_connection):
    inputs = []
    tokenizer_selector = {
        'latin': LatinTokenizer(unit_connection),
        'greek': GreekTokenizer(unit_connection)
    }
    for t in unit_tessfiles:
        tessfile = TessFile(t.path, metadata=t)
        tokens, tags, features = tokenizer_selector[t.language].tokenize(
            tessfile.read(), text=t)
        features.sort(key=lambda x: x.index)
        inputs.append((tokens, tags, features))
    yield inputs
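Assuming this generator is registered as a pytest fixture in the source module (a @pytest.fixture decorator is implied but not shown here), a consuming test can unpack the pre-tokenized tuples directly, as in this hedged sketch:

def test_unitize_batches(unitizer_inputs):
    # Hedged sketch: run each pre-tokenized text through a fresh Unitizer.
    for tokens, tags, features in unitizer_inputs:
        lines, phrases = Unitizer().unitize(tokens, tags, tokens[0].text)
        assert len(lines) > 0 and len(phrases) > 0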
Example #4
def test_tokenize(token_connection, greek_tessfiles, greek_tokens):
    grc = GreekTokenizer(token_connection)

    for i, tessfile in enumerate(greek_tessfiles):
        print(tessfile.metadata.title)
        tokens, tags, features = grc.tokenize(tessfile.read(),
                                              text=tessfile.metadata)
        tokens = [t for t in tokens if re.search(r'[\w]+', t.display)]

        for j, token in enumerate(tokens):
            # The display form, normalized form, and lemmata should all
            # match the reference token data.
            assert token.display == greek_tokens[i][j]['display']
            # if tessfile.metadata.title == 'gorgias':
            #     print(token.display, greek_tokens[i][j])
            # print(token.display, token.features['form'].token, [t.token for t in token.features['lemmata']])
            # print(greek_tokens[i][j])
            assert token.features['form'].token == greek_tokens[i][j]['form']
            assert all([
                any(
                    map(lambda x: lemma.token == x,
                        greek_tokens[i][j]['lemmata']))
                for lemma in token.features['lemmata']
            ])
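The attribute access pattern exercised by these assertions can be summarized as follows (a hedged sketch, where `token` is any element of the list returned by `tokenize`):

surface = token.display                                     # text as it appears in the file
form = token.features['form'].token                         # normalized form feature
lemmata = [lem.token for lem in token.features['lemmata']]  # candidate lemmata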
Example #5
def greek_word_frequencies(greek_files):
    freqs = []
    grc = GreekTokenizer()
    for fname in greek_files:
        freq = {}
        fname = os.path.splitext(fname)[0] + '.freq_score_word'
        with open(fname, 'r') as f:
            for line in f.readlines():
                if '#' not in line:
                    word, n = re.split(r'[^\w' + grc.diacriticals + ']+',
                                       line,
                                       flags=re.UNICODE)[:-1]
                    freq[word] = int(n)
        freqs.append(freq)
    return freqs
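The `.freq_score_word` files read here are plain text: lines containing '#' are skipped, and every other line holds a word and its count separated by characters outside the word/diacritical set. A hedged usage sketch with a placeholder file name:

freqs = greek_word_frequencies(['texts/example.tess'])
# Each entry maps a surface word to its corpus count.
top = sorted(freqs[0].items(), key=lambda kv: kv[1], reverse=True)[:10]
for word, count in top:
    print(word, count)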
Example #6
    def test_unitize(self, units):
        for unit in units:
            u = Unitizer()
            metadata = unit['metadata']
            tess = TessFile(metadata.path, metadata=metadata)
            tokens = unit['tokens']
            lines = unit['lines']
            phrases = unit['phrases']

            if metadata.language == 'greek':
                tokenizer = GreekTokenizer()
            elif metadata.language == 'latin':
                tokenizer = LatinTokenizer()

            tokenizer.clear()

            for i, line in enumerate(tess.readlines(include_tag=False)):
                stop = (i == len(tess) - 1)
                u.unitize(line, metadata, tokenizer=tokenizer, stop=stop)

            print(metadata.path)

            assert len(u.lines) == len(lines)
            for i in range(len(lines)):
                line_tokens = \
                    [tokenizer.tokens[j].form for j in u.lines[i].tokens
                     if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                                  flags=re.UNICODE) and
                        tokenizer.tokens[j].form]

                correct_tokens = \
                    [tokens[j]['FORM'] for j in lines[i]['TOKEN_ID']
                     if 'FORM' in tokens[j] and tokens[j]['FORM']]

                if line_tokens != correct_tokens:
                    print('Line {}'.format(i))
                    print(line_tokens)
                    print(correct_tokens)

                assert line_tokens == correct_tokens

            print(u.phrases[-1].tokens)
            assert len(u.phrases) == len(phrases)
            for i in range(len(u.phrases)):
                phrase_tokens = \
                    [tokenizer.tokens[j].form for j in u.phrases[i].tokens
                     if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                                  flags=re.UNICODE) and
                        tokenizer.tokens[j].form]

                correct_tokens = \
                    [tokens[j]['FORM'] for j in phrases[i]['TOKEN_ID']
                     if 'FORM' in tokens[j] and tokens[j]['FORM']]

                if phrase_tokens != correct_tokens:
                    print('Phrase {}'.format(i))
                    print(phrase_tokens)
                    print(correct_tokens)

                    # Also dump the previous phrase for context, without
                    # clobbering the values checked by the assertion below.
                    prev_phrase_tokens = \
                        [tokenizer.tokens[j].form for j in u.phrases[i - 1].tokens
                         if re.search(r'[\w]', tokenizer.tokens[j].display,
                                      flags=re.UNICODE) and
                            tokenizer.tokens[j].form]

                    prev_correct_tokens = \
                        [tokens[j]['FORM'] for j in phrases[i - 1]['TOKEN_ID']
                         if 'FORM' in tokens[j]]
                    print(prev_phrase_tokens)
                    print(prev_correct_tokens)

                assert phrase_tokens == correct_tokens

            assert len(u.phrases) == len(phrases)

            u.clear()
            tokenizer.clear()
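Unlike the earlier examples, which call unitize once on a full token list and receive (lines, phrases) back, this test drives an older, incremental Unitizer interface in which unitize is called once per line and the results accumulate on the instance. The driver loop reduces to the hedged skeleton below (`metadata` and `tess` as in the test above):

u = Unitizer()
tokenizer = GreekTokenizer()
tokenizer.clear()
for i, line in enumerate(tess.readlines(include_tag=False)):
    u.unitize(line, metadata, tokenizer=tokenizer, stop=(i == len(tess) - 1))
print(len(u.lines), len(u.phrases))
u.clear()
tokenizer.clear()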
Example #7
def test_init(token_connection):
    t = GreekTokenizer(token_connection)
    assert t.connection is token_connection
    assert hasattr(t, 'lemmatizer')
    assert isinstance(t.lemmatizer, Lemmata)