def test_last_modified(self):
    """Discarding a token clears its gold value and bumps last_modified."""
    tokenizer = Tokenizer.for_extension('.txt')(language=MockLang('english'))
    tokens = tokenizer.tokenize(MockCorpusFile('test'), MockConfig(type='fs'))
    token = tokens[0]
    # Mirror the original form into gold so we can observe it being cleared.
    token.gold = token.original
    self.assertEqual(
        token.gold,
        token.original,
        f'Resulting token.gold should be identical with original: {vars(token)}'
    )
    self.assertFalse(
        token.is_discarded,
        f'Resulting token should not be discarded: {vars(token)}')
    previous_stamp = token.last_modified
    token.is_discarded = True
    self.assertTrue(
        token.is_discarded,
        f'Resulting token should be discarded: {vars(token)}')
    self.assertEqual(
        token.gold,
        '',
        f'Resulting token.gold should be cleared: {vars(token)}')
    # Discarding counts as a modification, so the timestamp must advance.
    self.assertTrue(
        token.last_modified > previous_stamp,
        f'Resulting token should have updated last_modified (was {previous_stamp}): {vars(token)}'
    )
def __init__(self, root, docid, contents):
    """Build an in-memory mock corpus holding a single tokenized document."""
    self.root = root
    self.docid = docid
    tokenizer = Tokenizer.for_extension('.txt')(language=MockLang('english'))
    # type='mem' keeps the tokenizer off the filesystem for this mock.
    tokens = tokenizer.tokenize(
        MockCorpusFile(contents, self.docid), MockConfig(type='mem'))
    self.doc = MockDocument(docid, tokens)
    self.docs = {docid: self.doc}
def test_pdf_tokenization(self):
    """Tokenizing the bundled test.pdf yields the expected extracted text.

    'upen' and 'ti- me' are deliberate OCR-style artifacts baked into the
    fixture PDF, so the expected string reproduces them verbatim.
    """
    t = Tokenizer.for_extension('.pdf')(language=MockLang('english'))
    f = pathlib.Path(__file__).parent.joinpath('test.pdf')
    tokens = t.tokenize(f, MockConfig(type='fs'))
    # Fix: message had an f-prefix with no placeholders (ruff F541);
    # the runtime text is unchanged.
    self.assertEqual(
        str(tokens),
        'Once upen a ti- me.',
        'Resulting string does not contain expected tokens')
def test_auto_dehyphenation_soft(self):
    """dehyphenate() joins a token split by soft hyphen + space while a
    word-internal soft hyphen ('Te\xadst') is left untouched."""
    tokenizer = Tokenizer.for_extension('.txt')(language=MockLang('english'))
    source = MockCorpusFile('Str\xad ing Te\xadst')
    tokens = tokenizer.tokenize(source, MockConfig(type='fs'))
    tokens.dehyphenate()
    self.assertEqual(
        str(tokens),
        'String Te\xadst',
        f'Resulting string should be dehyphenated in {tokens}.')
def test_manual_dehyphenation(self):
    """Setting is_hyphenated on a token manually joins it with its successor.

    A plain '-' is not auto-dehyphenated (unlike the soft-hyphen case), so
    the join only happens after the flag is set explicitly.
    """
    t = Tokenizer.for_extension('.txt')(language=MockLang('english'))
    f = MockCorpusFile('Str- ing')
    tokens = t.tokenize(f, MockConfig(type='fs'))
    # Fix: both messages had f-prefixes with no placeholders (ruff F541);
    # the runtime text is unchanged.
    self.assertEqual(str(tokens), 'Str- ing',
                     'Resulting string should not be dehyphenated.')
    tokens[0].is_hyphenated = True
    self.assertEqual(str(tokens), 'String',
                     'Resulting string should be dehyphenated.')
def test_auto_dehyphenation(self):
    """Heuristics.bin_tokens places a token with a k-best match in bin 1.

    NOTE(review): despite its name, this test exercises heuristic binning,
    not dehyphenation — consider renaming (e.g. test_heuristics_binning).
    The name is kept here so the test runner's discovery is unaffected.
    """
    t = Tokenizer.for_extension('.txt')(language=MockLang('english'))
    f = MockCorpusFile('String')
    tokens = t.tokenize(f, MockConfig(type='fs'))
    token = tokens[0]
    # Fix: messages had f-prefixes with no placeholders (ruff F541);
    # set(["String"]) replaced with a set literal (flake8-comprehensions C405).
    self.assertIsNone(token.bin, 'Token should not be in any bin.')
    token.kbest = {
        1: KBestItem("String", 1.0),
    }
    dictionary = {"String"}
    # Setting "o" presumably selects the 'original' heuristic for bin 1 —
    # TODO confirm against the Heuristics settings documentation.
    settings = {
        1: "o",
    }
    heuristics = Heuristics(settings, dictionary)
    heuristics.bin_tokens(tokens)
    self.assertEqual(token.bin.number, 1, 'Token should be in bin 1.')