def fix_spells():
    """
    Runs through the document, finds all the bad words, then tries to find
    fixed versions of them.

    Builds an Aspell-backed SpellcheckDocMaker for the language returned by
    get_lang() and asks it for the word-fix document over 'text/clean'.
    """
    aspell = spell_checker.AspellSpellChecker(get_lang())
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.make_word_fix_doc('text/clean')
def simple_clean():
    """
    Simple cleanup for testing algorithm.

    Copies everything under text/raw into text/simple_clean with a shell
    `cp`, removes possible headers there, then loads the pages into a
    LineManager, runs its quick fixes, joins lines and writes the pages
    back out.
    """
    target_dir = 'text/simple_clean'

    # The copy goes through the shell so the `*` glob is expanded by it.
    os.system('cp text/raw/* text/simple_clean/')
    language = get_lang()

    # Header removal is driven by an Aspell-backed checker.
    aspell = spell_checker.AspellSpellChecker(language)
    document_builder.SpellcheckDocMaker(aspell).remove_possible_headers(
        target_dir)

    # Line handling uses the file-configured checker rather than Aspell.
    file_checker = spell_checker.FileConfiguredSpellChecker(language)
    lines = line_manager.LineManager(file_checker)
    lines.load(target_dir)
    lines.quick_fix()
    lines.join_lines()
    lines.write_pages(target_dir, True)
def test_checkables(self):
    """
    Table-driven check of spell_checker.joinables().

    Each row is (first word, second word, expected list of candidates);
    the call must return exactly the expected list.
    """
    cases = (
        ('bad', 'company', ['badcompany', 'bacompany']),
        ('bad-', 'company', ['badcompany']),
        ("bad'", 'company', ['badcompany']),
        ('bad', 'Company', []),
        (u'ba\u00E0', 'company', [u'ba\u00E0company', 'bacompany']),
        ('bad', u'\u00E0ompany', [u'bad\u00E0ompany', u'ba\u00E0ompany']),
        ('ba', 'company', []),
        ('bad', 'com', ['badcom', 'bacom']),
        ('bad', 'co', []),
    )
    # NOTE(review): this instance is never used by the assertions below;
    # it is kept only so that construction still runs as before.
    _doc_maker = document_builder.SpellcheckDocMaker(
        spell_checker.StubSpellChecker([]))
    for first, second, candidates in cases:
        self.assertEqual(spell_checker.joinables(first, second), candidates)
def test_fixed_words(self):
    """
    Check SpellcheckDocMaker.fixed_words() against the fix_spelling
    fixtures.

    A stub checker is seeded with a fixed word list. For every
    (test, expected) pair produced by test_expected(), fixed_words() is
    called with a one-element tuple and the first value of its result
    must equal [expected].
    """
    sc = spell_checker.StubSpellChecker([
        'Cantrip',
        'government',
        'bomb',
        'born',
        'bod',
        "he'll",
        'What',
        'hiss',
        'different',
    ])
    db = document_builder.SpellcheckDocMaker(sc)
    fixtures = test_expected(
        '{}/test_spellcheck/fix_spelling'.format(PATH))
    for test, expected in fixtures:
        fixes = db.fixed_words((test, ))
        # Materialise the values before indexing: on Python 3 a dict's
        # values() is a non-subscriptable view, while on Python 2 this is
        # just a copy of the list and behaves the same as before.
        first_fix = list(fixes.values())[0]
        self.assertEqual(first_fix, [expected])
def remove_headers():
    """
    Remove possible headers from the pages under 'text/clean'.

    The Aspell checker is created with the language from get_lang() and a
    './dict.<lang>.pws' path as its second argument.
    """
    language = get_lang()
    wordlist_path = './dict.{}.pws'.format(language)
    aspell = spell_checker.AspellSpellChecker(language, wordlist_path)
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.remove_possible_headers('text/clean')
def cross_line_fixes():
    """
    Produce the line-join document for 'text/clean'.

    Uses an Aspell-backed SpellcheckDocMaker for the language returned by
    get_lang().
    """
    aspell = spell_checker.AspellSpellChecker(get_lang())
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.make_line_join_doc('text/clean')
def proper_names():
    """
    Produce the possible-proper-name document for 'text/clean'.

    The Aspell checker is created with the language from get_lang() and a
    './dict.<lang>.pws' path as its second argument.
    """
    language = get_lang()
    wordlist_path = './dict.{}.pws'.format(language)
    aspell = spell_checker.AspellSpellChecker(language, wordlist_path)
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.make_possible_proper_name_doc('text/clean')
def possible_headers():
    """
    Run SpellcheckDocMaker.possible_headers() over 'text/raw'.

    Uses an Aspell-backed checker for the language returned by get_lang().
    """
    aspell = spell_checker.AspellSpellChecker(get_lang())
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.possible_headers('text/raw')
def page_info():
    """
    Run SpellcheckDocMaker.page_image_info() for 'text/raw' and
    'images/pages'.

    Uses an Aspell-backed checker for the language returned by get_lang().
    """
    aspell = spell_checker.AspellSpellChecker(get_lang())
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.page_image_info('text/raw', 'images/pages')