def _loaded_aspell_line_manager(start_page, end_page):
    """ Build an Aspell-backed LineManager for a page range and load it.

    The spell checker is created for the language reported by get_lang();
    start_page and end_page are handed to LineManager unchanged.  Pages are
    loaded from 'text/clean' before the manager is returned.
    """
    checker = spell_checker.AspellSpellChecker(get_lang())
    manager = line_manager.LineManager(checker, start_page, end_page)
    manager.load('text/clean')
    return manager
def run_gui(input_start_page, end_page, strict):
    """ Run the review GUI over text/clean and record how far it got.

    The resume page is read from book.cnf: option 'last_strict_page' when
    strict is true, 'last_checked_page' otherwise, falling back to
    input_start_page when that option is absent.  An input_start_page of 0
    means "start from the resume page"; any other value is used as the
    start page directly.

    After gui.main() returns, the pages are written back to text/clean.  If
    the GUI's last page is at or beyond the resume page, it is stored in
    book.cnf under the option that matches the mode.

    Args:
        input_start_page: requested first page, or 0 to resume.
        end_page: passed through to LineManager.
        strict: selects the progress option and is forwarded to gui.main().
    """
    # Both modes follow the same steps; only the option name differs.
    progress_option = 'last_strict_page' if strict else 'last_checked_page'

    config = ConfigParser()
    config.read('book.cnf')
    if config.has_option('process', progress_option):
        hold_page = config.getint('process', progress_option)
    else:
        hold_page = input_start_page
    # Single-argument print(...) behaves the same as the print statement.
    print(hold_page)

    start_page = hold_page if input_start_page == 0 else input_start_page

    lang = get_lang()
    lm = line_manager.LineManager(
        spell_checker.AspellSpellChecker(lang, './dict.{}.pws'.format(lang)),
        start_page, end_page)
    lm.load('text/clean')
    app = gui.main(lm, strict)
    lm.write_pages('text/clean', False)

    if int(app.last_page) >= hold_page:
        # config.set() raises NoSectionError when book.cnf has no [process]
        # section yet, so create it first instead of losing the progress.
        if not config.has_section('process'):
            config.add_section('process')
        config.set('process', progress_option, app.last_page)
    with open('book.cnf', 'wb') as f:
        config.write(f)
def run_dpgui():
    """ Load text/clean, hand it to dpgui.main(), then write the pages back.

    The LineManager uses an Aspell checker for get_lang(), constructed with
    the path './dict.<lang>.pws' as its second argument.
    """
    language = get_lang()
    word_list_path = './dict.{}.pws'.format(language)
    checker = spell_checker.AspellSpellChecker(language, word_list_path)
    manager = line_manager.LineManager(checker)
    manager.load('text/clean')
    dpgui.main(manager)
    manager.write_pages('text/clean', False)
def write_html():
    """ Load text/clean and write HTML using the settings in book.cnf.

    book.cnf is opened explicitly, so a missing file raises rather than
    yielding an empty configuration.
    """
    # NOTE(review): the language and word list are fixed to en_GB /
    # dict.en.pws here, while sibling functions use get_lang() - confirm
    # this is intentional.
    checker = spell_checker.AspellSpellChecker('en_GB', './dict.en.pws')
    manager = line_manager.LineManager(checker)
    manager.load('text/clean')

    book_config = ConfigParser()
    with open('book.cnf', 'r') as config_file:
        book_config.readfp(config_file)
    manager.write_html(book_config)
def interactive_fix(start_page, end_page):
    """ Run LineManager.interactive_fix() over a page range of text/clean.

    start_page and end_page are passed straight to LineManager.  The pages
    are written back to text/clean once interactive_fix() returns.
    """
    language = get_lang()
    word_list_path = './dict.{}.pws'.format(language)
    checker = spell_checker.AspellSpellChecker(language, word_list_path)
    manager = line_manager.LineManager(checker, start_page, end_page)
    manager.load('text/clean')
    manager.interactive_fix()
    manager.write_pages('text/clean', False)
def _clean_stage_pending(config, option):
    """ Return the boolean 'process' option, treating a missing one as True."""
    # has_option() is False when the whole [process] section is absent, so a
    # book.cnf without that section (or no book.cnf at all) no longer raises
    # NoSectionError before the stage gets a chance to run.
    if config.has_option('process', option):
        return config.getboolean('process', option)
    return True


def _mark_clean_stage_done(config, option):
    """ Set the 'process' option to 'false' and rewrite book.cnf."""
    if not config.has_section('process'):
        config.add_section('process')
    config.set('process', option, 'false')
    with open('book.cnf', 'wb') as f:
        config.write(f)


def clean(start_page, end_page):
    """ Run the next pending cleaning stage on the pages in text/clean.

    Exactly one stage runs per call, chosen from the 'process' section of
    book.cnf (a missing option counts as true):

    1. 'clean_headers' true: remove_headers(), then quick_fix().
    2. otherwise 'join_lines' true: join_lines().
    3. otherwise: fix_lines().

    Stages 1 and 2 set their option to 'false' in book.cnf, so the next
    call moves on to the following stage.  In every case the pages are
    written back to text/clean at the end.

    Args:
        start_page: first page, passed through to the line manager.
        end_page: last page, passed through to the line manager.
    """
    config = ConfigParser()
    config.read('book.cnf')
    clean_headers = _clean_stage_pending(config, 'clean_headers')
    join_lines = _clean_stage_pending(config, 'join_lines')

    if clean_headers:
        print('cleaning headers')
        remove_headers()
        _mark_clean_stage_done(config, 'clean_headers')
        lm = _loaded_aspell_line_manager(start_page, end_page)
        lm.quick_fix()
    elif join_lines:
        print('joining lines')
        _mark_clean_stage_done(config, 'join_lines')
        lm = _loaded_file_line_manager(start_page, end_page)
        lm.join_lines()
    else:
        lang = get_lang()
        lm = line_manager.LineManager(
            spell_checker.FileConfiguredSpellChecker(lang),
            start_page, end_page)
        lm.load('text/clean')
        try:
            lm.fix_lines()
        except KeyboardInterrupt:
            # if interrupted by keyboard, go ahead and write changes
            pass
    lm.write_pages('text/clean', False)
def simple_clean():
    """ Simple cleanup for testing algorithm.

    Copies text/raw/* into text/simple_clean/ with a shell `cp`, removes
    possible headers there, then loads the directory, applies quick_fix()
    and join_lines(), and writes the pages back to text/simple_clean.
    """
    work_dir = 'text/simple_clean'
    os.system('cp text/raw/* text/simple_clean/')

    language = get_lang()
    aspell_checker = spell_checker.AspellSpellChecker(language)
    doc_maker = document_builder.SpellcheckDocMaker(aspell_checker)
    doc_maker.remove_possible_headers(work_dir)

    # The line manager uses the file-configured checker, not the Aspell one
    # used for header removal above.
    file_checker = spell_checker.FileConfiguredSpellChecker(language)
    manager = line_manager.LineManager(file_checker)
    manager.load(work_dir)
    manager.quick_fix()
    manager.join_lines()
    manager.write_pages(work_dir, True)
def run_gui4():
    """ Checks for proper noun problems.

    Starts at the page stored in book.cnf as process/last_proper_page (0
    when the option is absent), runs gui4.main() over text/clean and writes
    the pages back.  If the GUI's last page is at or beyond the start page,
    it is stored as the new last_proper_page.
    """
    config = ConfigParser()
    config.read('book.cnf')
    if config.has_option('process', 'last_proper_page'):
        start_page = config.getint('process', 'last_proper_page')
    else:
        start_page = 0

    lang = get_lang()
    lm = line_manager.LineManager(spell_checker.AspellSpellChecker(lang),
                                  start_page)
    lm.load('text/clean')
    app = gui4.main(lm)
    lm.write_pages('text/clean', False)

    last_page = int(app.last_page)
    if last_page >= start_page:
        # config.set() raises NoSectionError when book.cnf has no [process]
        # section yet, so create it first instead of losing the progress.
        if not config.has_section('process'):
            config.add_section('process')
        config.set('process', 'last_proper_page', last_page)
    with open('book.cnf', 'wb') as f:
        config.write(f)
def run_gui3():
    """ Run gui3 over text/clean, write the HTML, and record the position.

    Starts at the page stored in book.cnf as process/last_html_page (0 when
    the option is absent).  After gui3.main() returns, write_html() is
    called with the page and line the GUI reported.  If that page is at or
    beyond the start page, both values are stored in book.cnf as
    last_html_page and last_html_line.
    """
    config = ConfigParser()
    config.read('book.cnf')
    if config.has_option('process', 'last_html_page'):
        start_page = config.getint('process', 'last_html_page')
    else:
        start_page = 0

    lang = get_lang()
    lm = line_manager.LineManager(spell_checker.AspellSpellChecker(lang),
                                  start_page)
    lm.load('text/clean')
    app = gui3.main(lm)

    last_page = int(app.last_html_page)
    # The line is converted for write_html() only; the value stored in
    # book.cnf is the one the GUI reported, unchanged.
    last_line = app.last_html_line
    lm.write_html(config, last_page, int(last_line))

    if last_page >= start_page:
        # config.set() raises NoSectionError when book.cnf has no [process]
        # section yet, so create it first instead of losing the progress.
        if not config.has_section('process'):
            config.add_section('process')
        config.set('process', 'last_html_page', last_page)
        config.set('process', 'last_html_line', last_line)
    with open('book.cnf', 'wb') as f:
        config.write(f)
def check_if_ok():
    """ Interactively review the words listed in working/maybe_ok.txt.

    For each line of the file, the first whitespace-separated token is
    looked up with find_word().  When a line of text is found, its page,
    the word and the text are printed, and, if line info is available, an
    image of the line is saved to test.jpg.  One answer is then read from
    stdin:

        y - append the word to working/good.txt
        s - keep the entry in maybe_ok.txt
        q - keep this entry and every remaining one, without further prompts

    Any other answer, and any word find_word() cannot locate, drops the
    entry.  Blank lines are ignored.  Both files are written only after the
    whole list has been processed.
    """
    lang = get_lang()
    lm = line_manager.LineManager(spell_checker.AspellSpellChecker(lang))
    lm.load('text/clean')

    good = []
    skipped = []
    skipping = False
    with codecs.open('working/maybe_ok.txt', mode='rb', encoding='utf-8') as f:
        for entry in f:
            if skipping:
                skipped.append(entry)
                continue
            fields = entry.split()
            if not fields:
                # A blank line used to raise IndexError here and abort the
                # session before any results were written.
                continue
            word = fields[0]
            page_nbr, line, line_info = lm.find_word(word)
            if not line:
                continue

            # Single-argument print(...) keeps the same output as the
            # original multi-argument print statements.
            print(u'Page: {}'.format(page_nbr))
            print(u'Word: {}'.format(word))
            print(line.text)
            if line_info:
                im = Image.open('images/pages/{}.pbm'.format(page_nbr))
                im2 = line_info.image(im, 2)
                im2.save('test.jpg', 'jpeg')

            result = raw_input()
            if result == 'y':
                good.append(word)
            elif result == 's':
                skipped.append(entry)
            elif result == 'q':
                skipping = True
                skipped.append(entry)

    with codecs.open('working/good.txt', mode='ab', encoding='utf-8') as f:
        for g in good:
            f.write(u'{}\n'.format(g))
    with codecs.open('working/maybe_ok.txt', mode='wb', encoding='utf-8') as f:
        for skip in skipped:
            f.write(skip)
def run_gui2():
    """ Load text/clean into an Aspell-backed LineManager and run gui2.main().

    Nothing is written back by this function after the GUI returns.
    """
    checker = spell_checker.AspellSpellChecker(get_lang())
    manager = line_manager.LineManager(checker)
    manager.load('text/clean')
    gui2.main(manager)