Esempio n. 1
0
def _loaded_aspell_line_manager(start_page, end_page):
    """Build an aspell-backed LineManager and load ``text/clean`` into it.

    Args:
        start_page: Forwarded to ``LineManager`` as its start page.
        end_page: Forwarded to ``LineManager`` as its end page.

    Returns:
        The ``LineManager`` after ``load('text/clean')`` has been called.
    """
    checker = spell_checker.AspellSpellChecker(get_lang())
    manager = line_manager.LineManager(checker, start_page, end_page)
    manager.load('text/clean')
    return manager
Esempio n. 2
0
def run_dpgui():
    """Open ``text/clean`` in the dpgui front end and write the pages back.

    The spell checker uses the project language from ``get_lang()`` together
    with the matching personal word list ``./dict.<lang>.pws``.
    """
    lang = get_lang()
    personal_dict = './dict.{}.pws'.format(lang)
    checker = spell_checker.AspellSpellChecker(lang, personal_dict)

    manager = line_manager.LineManager(checker)
    manager.load('text/clean')
    dpgui.main(manager)
    # Persist the pages once the GUI returns.
    manager.write_pages('text/clean', False)
Esempio n. 3
0
def run_gui(input_start_page, end_page, strict):
    """ Batch cleans the pages in text/clean."""
    config = ConfigParser()
    config.read('book.cnf')
    if strict and \
        config.has_option('process', 'last_strict_page'):
        hold_page = config.getint('process', 'last_strict_page')
    elif not strict and \
        config.has_option('process', 'last_checked_page'):
        hold_page = config.getint('process', 'last_checked_page')
    else:
        hold_page = input_start_page
    print hold_page
    if input_start_page == 0:
        start_page = hold_page
    else:
        start_page = input_start_page
    lang = get_lang()
    lm = line_manager.LineManager(
        spell_checker.AspellSpellChecker(lang, './dict.{}.pws'.format(lang)),
        start_page, end_page)
    lm.load('text/clean')
    app = gui.main(lm, strict)
    lm.write_pages('text/clean', False)

    if strict and int(app.last_page) >= hold_page:
        config.set('process', 'last_strict_page', app.last_page)
    elif not strict and int(app.last_page) >= hold_page:
        config.set('process', 'last_checked_page', app.last_page)
    with open('book.cnf', 'wb') as f:
        config.write(f)
Esempio n. 4
0
def interactive_fix(start_page, end_page):
    """Run ``LineManager.interactive_fix`` over a page range of ``text/clean``.

    Args:
        start_page: Forwarded to ``LineManager`` as its start page.
        end_page: Forwarded to ``LineManager`` as its end page.
    """
    lang = get_lang()
    personal_dict = './dict.{}.pws'.format(lang)
    checker = spell_checker.AspellSpellChecker(lang, personal_dict)

    manager = line_manager.LineManager(checker, start_page, end_page)
    manager.load('text/clean')
    manager.interactive_fix()
    # Save whatever the interactive session changed.
    manager.write_pages('text/clean', False)
Esempio n. 5
0
def write_html():
    """Load ``text/clean`` and emit HTML using the settings in ``book.cnf``.

    NOTE(review): the language ('en_GB') and personal dictionary
    ('./dict.en.pws') are hard-coded here rather than derived from the
    project language -- confirm this is intentional.
    """
    checker = spell_checker.AspellSpellChecker('en_GB', './dict.en.pws')
    manager = line_manager.LineManager(checker)
    manager.load('text/clean')

    config = ConfigParser()
    with open('book.cnf', 'r') as cnf_file:
        config.readfp(cnf_file)

    manager.write_html(config)
Esempio n. 6
0
def fix_spells():
    """Build the word-fix document for ``text/clean``.

    Per the original author's description: runs through the document, finds
    all the bad words, then tries to find fixed versions of them.  The work
    itself is delegated to ``SpellcheckDocMaker.make_word_fix_doc``.
    """
    aspell = spell_checker.AspellSpellChecker(get_lang())
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.make_word_fix_doc('text/clean')
 def test_aspell(self):
     """Check that the aspell checker flags the two made-up words.

     The assertion only runs when ``which aspell`` exits with status 0;
     otherwise the test body is skipped silently.
     """
     with open('/dev/null', 'wb') as devnull:
         # A non-zero exit status from `which` means aspell was not found.
         which_status = subprocess.call(['which', 'aspell'],
                                        stdout=devnull, stderr=devnull)
         if which_status:
             return
         checker = spell_checker.AspellSpellChecker('en_US')
         flagged = checker.check_line(
             'what brff needs is an afeve in the car.')
         self.assertEqual(['afeve', 'brff'], sorted(flagged))
Esempio n. 8
0
def aspell_run(start, end):
    """Run the interactive spell check on a range of pages in ``text/clean``.

    Only files named ``<number>.txt`` are considered; they are processed in
    ascending page-number order.  Any other directory entry is ignored.

    Args:
        start: Lowest page number to check (inclusive).
        end: Highest page number to check (inclusive).
    """
    lang = get_lang()
    checker = spell_checker.AspellSpellChecker(lang)

    # Collect the numbered .txt pages up front.  Anything else -- notably a
    # leftover '12.txt.bak', whose base name '12.txt' is not an integer --
    # is skipped instead of raising ValueError while sorting.
    pages = []
    for fn in os.listdir('text/clean'):
        basename, ext = os.path.splitext(fn)
        if ext != '.txt':
            continue
        try:
            page_nbr = int(basename)
        except ValueError:
            continue
        pages.append((page_nbr, fn))

    for page_nbr, fn in sorted(pages):
        if page_nbr < start:
            continue
        if page_nbr > end:
            # Pages are sorted, so nothing further is in range.  Break
            # (rather than return) so the .bak cleanup below still runs.
            break
        command = checker.interactive_check('text/clean/{}'.format(fn))
        os.system(' '.join(command))

    # Remove the *.bak files left in text/clean (presumably the spell
    # checker's backups -- confirm).
    os.system('rm text/clean/*.bak')
Esempio n. 9
0
def simple_clean():
    """Simple cleanup pass, used for testing the algorithm.

    Copies ``text/raw`` into ``text/simple_clean``, removes possible headers
    there, then loads the pages, applies ``quick_fix`` and ``join_lines``,
    and writes them back.
    """
    work_dir = 'text/simple_clean'
    os.system('cp text/raw/* text/simple_clean/')

    lang = get_lang()
    aspell = spell_checker.AspellSpellChecker(lang)
    # Disabled alternative fixer:
    #   aspell.fixer = spell_checker.SimpleEnglishSpellFixer()
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.remove_possible_headers(work_dir)

    # Line-level fixes use the file-configured checker, not aspell.
    file_checker = spell_checker.FileConfiguredSpellChecker(lang)
    manager = line_manager.LineManager(file_checker)
    manager.load(work_dir)
    manager.quick_fix()
    manager.join_lines()
    # Disabled alternatives:
    #   manager.write_pages('text/simple_clean', False)
    #   doc_maker.make_word_fix_doc('text/simple_clean')
    manager.write_pages(work_dir, True)
Esempio n. 10
0
def aspell_clean():
    """Count misspelled words in ``text/clean`` and list them for review.

    Every ``.txt`` file in ``text/clean`` is spell-checked and the flagged
    words are tallied.  The result is written to ``working/maybe_ok.txt`` as
    ``word (count)`` lines: words starting with an ASCII capital first, then
    all the others, each group ordered by descending count.
    """
    capitalised = re.compile('^[A-Z]')
    checker = spell_checker.AspellSpellChecker(get_lang())

    tally = Counter()
    for name in os.listdir('text/clean'):
        if not name.endswith('.txt'):
            continue
        for bad_word in checker.check_document('text/clean/{}'.format(name)):
            tally[bad_word] += 1

    # Partition the ranked words while keeping their frequency order.
    ranked = tally.most_common()
    capital_first = [(word, hits) for word, hits in ranked
                     if capitalised.match(word)]
    remainder = [(word, hits) for word, hits in ranked
                 if not capitalised.match(word)]

    with codecs.open('working/maybe_ok.txt', mode='wb',
                     encoding='utf-8') as out:
        for word, hits in capital_first + remainder:
            out.write(u'{} ({})\n'.format(spell_checker._decode(word), hits))
Esempio n. 11
0
def run_gui4():
    """Run the proper-noun checking GUI (``gui4``) over ``text/clean``.

    The start page is read from ``last_proper_page`` in the ``process``
    section of ``book.cnf`` (defaulting to 0).  After the GUI returns, the
    pages are written back and the last page reached is recorded.
    """
    config = ConfigParser()
    config.read('book.cnf')
    if config.has_option('process', 'last_proper_page'):
        start_page = config.getint('process', 'last_proper_page')
    else:
        start_page = 0

    lang = get_lang()
    lm = line_manager.LineManager(spell_checker.AspellSpellChecker(lang),
                                  start_page)
    lm.load('text/clean')
    app = gui4.main(lm)
    lm.write_pages('text/clean', False)

    last_page = int(app.last_page)
    # Only move the recorded page forward, never backwards.
    if last_page >= start_page:
        # config.set() raises NoSectionError if book.cnf is missing or has
        # no [process] section yet, so create the section on demand.
        if not config.has_section('process'):
            config.add_section('process')
        config.set('process', 'last_proper_page', last_page)
    with open('book.cnf', 'wb') as f:
        config.write(f)
Esempio n. 12
0
def run_gui3():
    """Run the ``gui3`` front end over ``text/clean`` and write HTML.

    The start page is read from ``last_html_page`` in the ``process``
    section of ``book.cnf`` (defaulting to 0).  After the GUI returns, HTML
    is written up to the page/line the GUI reports, and that position is
    recorded as ``last_html_page`` / ``last_html_line``.
    """
    config = ConfigParser()
    config.read('book.cnf')
    if config.has_option('process', 'last_html_page'):
        start_page = config.getint('process', 'last_html_page')
    else:
        start_page = 0

    lang = get_lang()
    lm = line_manager.LineManager(spell_checker.AspellSpellChecker(lang),
                                  start_page)
    lm.load('text/clean')
    app = gui3.main(lm)

    last_page = int(app.last_html_page)
    last_line = app.last_html_line
    lm.write_html(config, last_page, int(last_line))

    # Only move the recorded position forward, never backwards.
    if last_page >= start_page:
        # config.set() raises NoSectionError if book.cnf is missing or has
        # no [process] section yet, so create the section on demand.
        if not config.has_section('process'):
            config.add_section('process')
        config.set('process', 'last_html_page', last_page)
        config.set('process', 'last_html_line', last_line)
    with open('book.cnf', 'wb') as f:
        config.write(f)
Esempio n. 13
0
def check_if_ok():
    lang = get_lang()
    lm = line_manager.LineManager(spell_checker.AspellSpellChecker(lang))
    lm.load('text/clean')
    good = []
    skipped = []
    skipping = False
    with codecs.open('working/maybe_ok.txt', mode='rb', encoding='utf-8') as f:
        for l in f:
            if skipping:
                skipped.append(l)
                continue
            word = l.split()[0]
            page_nbr, line, line_info = lm.find_word(word)
            if line:
                print 'Page:', page_nbr
                print 'Word:', word
                print line.text
                if line_info:
                    im = Image.open('images/pages/{}.pbm'.format(page_nbr))
                    im2 = line_info.image(im, 2)
                    im2.save('test.jpg', 'jpeg')
                result = raw_input()
                if result == 'y':
                    good.append(word)
                elif result == 's':
                    skipped.append(l)
                elif result == 'q':
                    skipping = True
                    skipped.append(l)

    with codecs.open('working/good.txt', mode='ab', encoding='utf-8') as f:
        for g in good:
            f.write(u'{}\n'.format(g))

    with codecs.open('working/maybe_ok.txt', mode='wb', encoding='utf-8') as f:
        for skip in skipped:
            f.write(skip)
Esempio n. 14
0
def page_info():
    """Run ``SpellcheckDocMaker.page_image_info`` on the raw text and images.

    Uses ``text/raw`` as the text directory and ``images/pages`` as the
    image directory.
    """
    aspell = spell_checker.AspellSpellChecker(get_lang())
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.page_image_info('text/raw', 'images/pages')
Esempio n. 15
0
def proper_names():
    """Build the possible-proper-name document for ``text/clean``.

    The checker is created with the project language and its personal word
    list ``./dict.<lang>.pws``.
    """
    lang = get_lang()
    aspell = spell_checker.AspellSpellChecker(
        lang, './dict.{}.pws'.format(lang))
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.make_possible_proper_name_doc('text/clean')
Esempio n. 16
0
def cross_line_fixes():
    """Build the line-join document for ``text/clean``.

    Delegates to ``SpellcheckDocMaker.make_line_join_doc`` with an aspell
    checker for the project language.
    """
    aspell = spell_checker.AspellSpellChecker(get_lang())
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.make_line_join_doc('text/clean')
Esempio n. 17
0
def run_gui2():
    """Load ``text/clean`` and hand the line manager to ``gui2.main``.

    Unlike the other GUI runners, nothing is written back here after the
    GUI returns.
    """
    checker = spell_checker.AspellSpellChecker(get_lang())
    manager = line_manager.LineManager(checker)
    manager.load('text/clean')
    gui2.main(manager)
Esempio n. 18
0
def remove_headers():
    """Remove possible page headers from the files in ``text/clean``.

    The checker is created with the project language and its personal word
    list ``./dict.<lang>.pws``; the work is delegated to
    ``SpellcheckDocMaker.remove_possible_headers``.
    """
    lang = get_lang()
    aspell = spell_checker.AspellSpellChecker(
        lang, './dict.{}.pws'.format(lang))
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.remove_possible_headers('text/clean')
Esempio n. 19
0
def possible_headers():
    """Run ``SpellcheckDocMaker.possible_headers`` on ``text/raw``.

    Uses an aspell checker for the project language.
    """
    aspell = spell_checker.AspellSpellChecker(get_lang())
    doc_maker = document_builder.SpellcheckDocMaker(aspell)
    doc_maker.possible_headers('text/raw')