def _loaded_aspell_line_manager(start_page, end_page):
    """ Returns loaded line manager with aspell spell checker."""
    lang = get_lang()
    lm = line_manager.LineManager(
        spell_checker.AspellSpellChecker(lang),
        start_page,
        end_page)
    lm.load('text/clean')
    return lm


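# Illustrative usage sketch: the helper above bundles the LineManager
# construction that several commands below repeat inline, e.g.
#
#     lm = _loaded_aspell_line_manager(10, 20)
#     lm.interactive_fix()
#     lm.write_pages('text/clean', False)
#
# which mirrors the flow of interactive_fix() below, minus the personal
# dictionary argument those commands pass to AspellSpellChecker.

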
def run_dpgui():
    """ Runs dpgui over the pages in text/clean and writes the results back."""
    lang = get_lang()
    lm = line_manager.LineManager(
        spell_checker.AspellSpellChecker(lang, './dict.{}.pws'.format(lang)))
    lm.load('text/clean')
    dpgui.main(lm)
    lm.write_pages('text/clean', False)


def run_gui(input_start_page, end_page, strict):
    """ Batch cleans the pages in text/clean."""
    config = ConfigParser()
    config.read('book.cnf')
    if strict and \
            config.has_option('process', 'last_strict_page'):
        hold_page = config.getint('process', 'last_strict_page')
    elif not strict and \
            config.has_option('process', 'last_checked_page'):
        hold_page = config.getint('process', 'last_checked_page')
    else:
        hold_page = input_start_page
    print hold_page
    if input_start_page == 0:
        start_page = hold_page
    else:
        start_page = input_start_page
    lang = get_lang()
    lm = line_manager.LineManager(
        spell_checker.AspellSpellChecker(lang, './dict.{}.pws'.format(lang)),
        start_page,
        end_page)
    lm.load('text/clean')
    app = gui.main(lm, strict)
    lm.write_pages('text/clean', False)
    if strict and int(app.last_page) >= hold_page:
        config.set('process', 'last_strict_page', app.last_page)
    elif not strict and int(app.last_page) >= hold_page:
        config.set('process', 'last_checked_page', app.last_page)
    with open('book.cnf', 'wb') as f:
        config.write(f)


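# run_gui() resumes from a bookmark stored in the [process] section of
# book.cnf.  An assumed (illustrative) layout, based only on the option
# names used above:
#
#     [process]
#     last_strict_page = 42
#     last_checked_page = 57
#
# Passing input_start_page=0 starts from the bookmark; any other value
# overrides it.

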
def interactive_fix(start_page, end_page):
    """ Interactively fixes the pages in text/clean and writes the results back."""
    lang = get_lang()
    lm = line_manager.LineManager(
        spell_checker.AspellSpellChecker(lang, './dict.{}.pws'.format(lang)),
        start_page,
        end_page)
    lm.load('text/clean')
    lm.interactive_fix()
    lm.write_pages('text/clean', False)


def write_html():
    """ Writes the pages in text/clean out as HTML."""
    lm = line_manager.LineManager(
        spell_checker.AspellSpellChecker('en_GB', './dict.en.pws'))
    lm.load('text/clean')
    config = ConfigParser()
    with open('book.cnf', 'r') as cf:
        config.readfp(cf)
    lm.write_html(config)


def fix_spells():
    """ Runs through the document, finds all the bad words,
    then tries to find fixed versions of them.
    """
    lang = get_lang()
    checker = spell_checker.AspellSpellChecker(lang)
    db = document_builder.SpellcheckDocMaker(checker)
    db.make_word_fix_doc('text/clean')


def test_aspell(self):
    # Only run the assertion when aspell is available on PATH.
    with open('/dev/null', 'wb') as f:
        if not subprocess.call([
                'which',
                'aspell',
                ], stdout=f, stderr=f):
            sc = spell_checker.AspellSpellChecker('en_US')
            self.assertEqual(
                [
                    'afeve',
                    'brff',
                ],
                sorted(
                    sc.check_line(
                        'what brff needs is an afeve in the car.')))


def aspell_run(start, end):
    """ Runs aspell's interactive checker over the text/clean pages
    from start to end.
    """
    lang = get_lang()
    checker = spell_checker.AspellSpellChecker(lang)
    for fn in sorted(os.listdir('text/clean'),
                     key=lambda x: int(os.path.splitext(x)[0])):
        basename, ext = os.path.splitext(fn)
        if int(basename) < start or ext != '.txt':
            continue
        if int(basename) > end:
            return
        command = checker.interactive_check('text/clean/{}'.format(fn))
        os.system(' '.join(command))
        os.system('rm text/clean/*.bak')


def simple_clean():
    """ Simple cleanup for testing algorithm. """
    os.system('cp text/raw/* text/simple_clean/')
    lang = get_lang()
    checker = spell_checker.AspellSpellChecker(lang)
    # checker.fixer = spell_checker.SimpleEnglishSpellFixer()
    db = document_builder.SpellcheckDocMaker(checker)
    db.remove_possible_headers('text/simple_clean')
    lm = line_manager.LineManager(
        spell_checker.FileConfiguredSpellChecker(lang))
    lm.load('text/simple_clean')
    lm.quick_fix()
    lm.join_lines()
    # lm.write_pages('text/simple_clean', False)
    # db.make_word_fix_doc('text/simple_clean')
    lm.write_pages('text/simple_clean', True)


def aspell_clean():
    """ Counts the words aspell rejects in text/clean and writes them to
    working/maybe_ok.txt, capitalized words first.
    """
    starts_with_cap = re.compile('^[A-Z]')
    lang = get_lang()
    checker = spell_checker.AspellSpellChecker(lang)
    bad_word_ctr = Counter()
    for fn in os.listdir('text/clean'):
        if fn.endswith('.txt'):
            for bad_word in checker.check_document('text/clean/{}'.format(fn)):
                bad_word_ctr[bad_word] += 1
    with codecs.open('working/maybe_ok.txt', mode='wb', encoding='utf-8') as f:
        for bad_word, count in bad_word_ctr.most_common():
            if starts_with_cap.match(bad_word):
                f.write(u'{} ({})\n'.format(spell_checker._decode(bad_word), count))
        for bad_word, count in bad_word_ctr.most_common():
            if not starts_with_cap.match(bad_word):
                f.write(u'{} ({})\n'.format(spell_checker._decode(bad_word), count))


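# working/maybe_ok.txt, written above, holds one suspect word per line in the
# form "word (count)", capitalized words first; check_if_ok() below reads it
# back and only uses the first whitespace-separated token.  Illustrative
# entries:
#
#     Wordsworth (12)
#     teh (3)

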
def run_gui4():
    """ Checks for proper noun problems."""
    config = ConfigParser()
    config.read('book.cnf')
    if config.has_option('process', 'last_proper_page'):
        start_page = config.getint('process', 'last_proper_page')
    else:
        start_page = 0
    lang = get_lang()
    lm = line_manager.LineManager(
        spell_checker.AspellSpellChecker(lang),
        start_page)
    lm.load('text/clean')
    app = gui4.main(lm)
    lm.write_pages('text/clean', False)
    last_page = int(app.last_page)
    if last_page >= start_page:
        config.set('process', 'last_proper_page', last_page)
    with open('book.cnf', 'wb') as f:
        config.write(f)


def run_gui3():
    """ Runs gui3 over the pages in text/clean and writes them out as HTML."""
    config = ConfigParser()
    config.read('book.cnf')
    if config.has_option('process', 'last_html_page'):
        start_page = config.getint('process', 'last_html_page')
    else:
        start_page = 0
    lang = get_lang()
    lm = line_manager.LineManager(
        spell_checker.AspellSpellChecker(lang),
        start_page)
    lm.load('text/clean')
    app = gui3.main(lm)
    last_page = int(app.last_html_page)
    last_line = app.last_html_line
    lm.write_html(config, int(last_page), int(last_line))
    if last_page >= start_page:
        config.set('process', 'last_html_page', last_page)
        config.set('process', 'last_html_line', last_line)
    with open('book.cnf', 'wb') as f:
        config.write(f)


def check_if_ok():
    """ Interactively reviews the words in working/maybe_ok.txt in context.
    'y' accepts a word (appended to working/good.txt), 's' skips it,
    'q' skips it and everything after; skipped entries are written back
    to working/maybe_ok.txt.
    """
    lang = get_lang()
    lm = line_manager.LineManager(spell_checker.AspellSpellChecker(lang))
    lm.load('text/clean')
    good = []
    skipped = []
    skipping = False
    with codecs.open('working/maybe_ok.txt', mode='rb', encoding='utf-8') as f:
        for l in f:
            if skipping:
                skipped.append(l)
                continue
            word = l.split()[0]
            page_nbr, line, line_info = lm.find_word(word)
            if line:
                print 'Page:', page_nbr
                print 'Word:', word
                print line.text
                if line_info:
                    im = Image.open('images/pages/{}.pbm'.format(page_nbr))
                    im2 = line_info.image(im, 2)
                    im2.save('test.jpg', 'jpeg')
                result = raw_input()
                if result == 'y':
                    good.append(word)
                elif result == 's':
                    skipped.append(l)
                elif result == 'q':
                    skipping = True
                    skipped.append(l)
    with codecs.open('working/good.txt', mode='ab', encoding='utf-8') as f:
        for g in good:
            f.write(u'{}\n'.format(g))
    with codecs.open('working/maybe_ok.txt', mode='wb', encoding='utf-8') as f:
        for skip in skipped:
            f.write(skip)


def page_info():
    """ Collects page image info for text/raw and images/pages."""
    lang = get_lang()
    checker = spell_checker.AspellSpellChecker(lang)
    db = document_builder.SpellcheckDocMaker(checker)
    db.page_image_info('text/raw', 'images/pages')


def proper_names():
    """ Builds the possible proper name document from text/clean."""
    lang = get_lang()
    dict_ = './dict.{}.pws'.format(lang)
    checker = spell_checker.AspellSpellChecker(lang, dict_)
    db = document_builder.SpellcheckDocMaker(checker)
    db.make_possible_proper_name_doc('text/clean')


def cross_line_fixes():
    """ Builds the line join document for text/clean."""
    lang = get_lang()
    checker = spell_checker.AspellSpellChecker(lang)
    db = document_builder.SpellcheckDocMaker(checker)
    db.make_line_join_doc('text/clean')


def run_gui2():
    """ Batch cleans the pages in text/clean."""
    lang = get_lang()
    lm = line_manager.LineManager(spell_checker.AspellSpellChecker(lang))
    lm.load('text/clean')
    gui2.main(lm)


def remove_headers():
    """ Removes possible headers from the pages in text/clean."""
    lang = get_lang()
    dict_ = './dict.{}.pws'.format(lang)
    checker = spell_checker.AspellSpellChecker(lang, dict_)
    db = document_builder.SpellcheckDocMaker(checker)
    db.remove_possible_headers('text/clean')


def possible_headers():
    """ Finds possible headers in the pages in text/raw."""
    lang = get_lang()
    checker = spell_checker.AspellSpellChecker(lang)
    db = document_builder.SpellcheckDocMaker(checker)
    db.possible_headers('text/raw')