Example #1
    def test_readlines(self, tessfile_list):
        for f in tessfile_list:
            lines = []
            with open(f, 'r') as tess:
                for line in tess.readlines():
                    lines.append(line)

            # Ensure that readlines works with a buffer
            t = TessFile(f)
            for i, line in enumerate(t.readlines()):
                assert line == lines[i]

            # Ensure that the buffer resets on second call
            reset = False
            for i, line in enumerate(t.readlines()):
                assert line == lines[i]
                reset = True
            assert reset

            # Ensure that readlines works with initial read
            t = TessFile(f, buffer=False)
            for i, line in enumerate(t.readlines()):
                assert line == lines[i]

            # Ensure that the iterator resets on second call
            reset = False
            for i, line in enumerate(t.readlines()):
                assert line == lines[i]
                reset = True
            assert reset
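
The same buffered and unbuffered calls can be used outside of a test. Below is a minimal standalone sketch; the import path and file path are assumptions, and only the calls visible in the example above (TessFile(path), TessFile(path, buffer=False), readlines()) are used.

# Minimal sketch, not part of the project's test suite. Import path and file
# path are assumptions; only calls shown in the example above are used.
from tesserae.utils import TessFile

path = 'texts/example.tess'  # placeholder path to a .tess file

# Buffered (default): lines are yielded from the file buffer.
buffered = TessFile(path)
for line in buffered.readlines():
    print(line)

# Unbuffered: the file is read up front, then the stored lines are iterated.
unbuffered = TessFile(path, buffer=False)
for line in unbuffered.readlines():
    print(line)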
Example #2
    def test_normalize(self, greek_files, greek_tokens):
        grc = self.__test_class__()

        # Iterate over each file together with its reference tokens; this also
        # avoids reusing 'i' for both the file index and the line index below.
        for fname, file_tokens in zip(greek_files, greek_tokens):
            ref_tokens = [t for t in file_tokens if t['FORM'] != '']

            t = TessFile(fname)

            token_idx = 0

            for i, line in enumerate(t.readlines(include_tag=False)):
                # Normalize the line, then keep only tokens that contain at
                # least one word character.
                tokens = [
                    t for t in grc.normalize(line)
                    if re.search('[' + grc.word_characters + ']+', t,
                                 flags=re.UNICODE)
                ]

                offset = token_idx + len(tokens)

                # Materialize the comparison: a lazy map() would be consumed by
                # the debug check below and leave the final assert vacuous.
                correct = [tok == ref['FORM']
                           for tok, ref in zip(tokens, ref_tokens[token_idx:offset])]

                if not all(correct):
                    print(fname, i, line)
                    print(ref_tokens[token_idx:offset])
                    for j in range(len(tokens)):
                        if tokens[j] != ref_tokens[token_idx + j]['FORM']:
                            print('{}->{}'.format(
                                tokens[j], ref_tokens[token_idx + j]['FORM']))

                assert all(correct)

                token_idx = offset
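
The core pattern in this test is to normalize a line with the language tokenizer and then discard tokens containing no word characters before comparing against the reference forms. A minimal standalone sketch of that pattern follows; the import path and sample line are assumptions, and only the normalize() method and word_characters attribute used above are relied on.

import re

# Import path is an assumption; the example above only shows that the
# tokenizer exposes normalize() and word_characters.
from tesserae.tokenizers import GreekTokenizer

grc = GreekTokenizer()
line = 'μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος'  # placeholder line of Greek text

# Normalize the line, then keep only tokens containing at least one word
# character (drops punctuation-only tokens).
tokens = [
    t for t in grc.normalize(line)
    if re.search('[' + grc.word_characters + ']+', t, flags=re.UNICODE)
]
print(tokens)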
Example #3
    def test_unitize(self, units):
        for unit in units:
            u = Unitizer()
            metadata = unit['metadata']
            tess = TessFile(metadata.path, metadata=metadata)
            tokens = unit['tokens']
            lines = unit['lines']
            phrases = unit['phrases']

            if metadata.language == 'greek':
                tokenizer = GreekTokenizer()
            elif metadata.language == 'latin':
                tokenizer = LatinTokenizer()
            else:
                # Guard against an undefined tokenizer for unexpected languages.
                raise ValueError(
                    'Unsupported language: {}'.format(metadata.language))

            tokenizer.clear()

            for i, line in enumerate(tess.readlines(include_tag=False)):
                stop = (i == len(tess) - 1)
                u.unitize(line, metadata, tokenizer=tokenizer, stop=stop)

            print(metadata.path)

            assert len(u.lines) == len(lines)
            for i in range(len(lines)):
                line_tokens = \
                    [tokenizer.tokens[j].form for j in u.lines[i].tokens
                     if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                                  flags=re.UNICODE) and
                        tokenizer.tokens[j].form]

                correct_tokens = \
                    [tokens[j]['FORM'] for j in lines[i]['TOKEN_ID']
                     if 'FORM' in tokens[j] and tokens[j]['FORM']]

                if line_tokens != correct_tokens:
                    print('Line {}'.format(i))
                    print(line_tokens)
                    print(correct_tokens)

                assert line_tokens == correct_tokens

            print(u.phrases[-1].tokens)
            assert len(u.phrases) == len(phrases)
            for i in range(len(u.phrases)):
                phrase_tokens = \
                    [tokenizer.tokens[j].form for j in u.phrases[i].tokens
                     if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                                  flags=re.UNICODE) and
                        tokenizer.tokens[j].form]

                correct_tokens = \
                    [tokens[j]['FORM'] for j in phrases[i]['TOKEN_ID']
                     if 'FORM' in tokens[j] and tokens[j]['FORM']]

                if phrase_tokens != correct_tokens:
                    print('Phrase {}'.format(i))
                    # Print the previous phrase for context, without
                    # overwriting the values checked by the assert below.
                    prev_phrase_tokens = \
                        [tokenizer.tokens[j].form for j in u.phrases[i - 1].tokens
                         if re.search(r'[\w\d]', tokenizer.tokens[j].display,
                                      flags=re.UNICODE) and
                            tokenizer.tokens[j].form]

                    prev_correct_tokens = \
                        [tokens[j]['FORM'] for j in phrases[i - 1]['TOKEN_ID']
                         if 'FORM' in tokens[j] and tokens[j]['FORM']]
                    print(prev_phrase_tokens)
                    print(prev_correct_tokens)

                assert phrase_tokens == correct_tokens

            assert len(u.phrases) == len(phrases)

            u.clear()
            tokenizer.clear()
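
The essential flow in this test is: build a TessFile from the text metadata, pick a tokenizer by language, feed each line to Unitizer.unitize with stop set on the last line, then inspect u.lines and u.phrases. A minimal sketch of that flow is below; the import paths are assumptions, and the SimpleNamespace stands in for the real metadata object, which the example only shows being accessed through .path and .language (unitize may read more fields in practice).

from types import SimpleNamespace

# Import paths are assumptions based on the class names in the example above.
from tesserae.utils import TessFile
from tesserae.tokenizers import LatinTokenizer
from tesserae.unitizers import Unitizer

# Stand-in for the real text metadata record; the example only shows .path and
# .language being used, but unitize() may expect additional fields.
metadata = SimpleNamespace(path='texts/example.tess', language='latin')

tess = TessFile(metadata.path, metadata=metadata)
tokenizer = LatinTokenizer()
u = Unitizer()

# Feed the unitizer one untagged line at a time; stop=True on the final line
# lets it close out the last phrase.
for i, line in enumerate(tess.readlines(include_tag=False)):
    stop = (i == len(tess) - 1)
    u.unitize(line, metadata, tokenizer=tokenizer, stop=stop)

print(len(u.lines), len(u.phrases))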