class InteractiveSpellchecker(object):
    """Interactive spell checker driven by an enchant command-line prompt.

    Holds an ``en_US`` ``SpellChecker`` and a ``CmdLineChecker`` bound to
    it.  Words corrected through ``do_check`` accumulate in ``self.result``
    until the next ``process_text`` call empties that list.
    """

    def __init__(self):
        """Create the checker, attach the prompt, start with no results."""
        self.checker = enchant.checker.SpellChecker("en_US")
        self.cmdline_checker = CmdLineChecker()
        self.cmdline_checker.set_checker(self.checker)
        # Corrected, lower-cased words collected by do_check().
        self.result = []

    def process_text(self, text):
        """
        accepts: [String] text input
        returns: [List] list of tokens with URLs filtered out; words
                 containing '@' or 'RT' are dropped beforehand.  Case is
                 left untouched here (lower-casing happens in do_check).
                 An empty list is returned on UnicodeEncodeError.
        """
        # Clear results in place so references held elsewhere see the reset.
        del self.result[:]
        try:
            to_check = [
                word for (word, pos) in basic_tokenize(text)
                if '@' not in word and 'RT' not in word
            ]
            tknzr = get_tokenizer("en_US", filters=[URLFilter])
            return [word for (word, pos) in tknzr(' '.join(to_check))]
        except UnicodeEncodeError:
            # Best effort: text that cannot be encoded yields no tokens.
            # Return an explicit empty list so callers can always iterate
            # the result instead of tripping over an implicit None.
            return []

    def do_check(self, word):
        """Correct *word* interactively and record the outcome.

        Loads *word* into the checker, runs the command-line prompt, then
        lower-cases the checker's resulting text.  Unless that text
        contains '#', its whitespace-separated pieces are appended to
        ``self.result``.
        """
        self.checker.set_text(word)
        self.cmdline_checker.run()
        correct = self.checker.get_text().lower()
        if '#' not in correct:
            self.result.extend(correct.split())
def do_check(checker, to_check):
    """Interactively spell check every entry of *to_check* in place.

    checker: spell checker object exposing ``set_text``/``get_text`` and
             accepted by ``CmdLineChecker.set_checker``.
    to_check: mutable sequence of texts; each entry is replaced by the
              checker's text after the prompt has run on it.

    Returns None; the corrections are written back into *to_check*.
    """
    if not to_check:
        return

    # One prompt is enough: it only needs to be bound to the checker once.
    cmdline_checker = CmdLineChecker()
    cmdline_checker.set_checker(checker)

    # Write back by position.  Looking the text up with list.index() would
    # hit the first equal entry, leaving later duplicates uncorrected.
    for position, text in enumerate(to_check):
        checker.set_text(text)
        cmdline_checker.run()
        to_check[position] = checker.get_text()
def spell_check(text):
    """Run *text* through the interactive prompt and return the result.

    Uses the module-level ``chkr`` checker: a fresh ``CmdLineChecker`` is
    bound to it, the prompt is run over *text*, and whatever text the
    checker holds afterwards is returned.
    """
    prompt = CmdLineChecker()
    prompt.set_checker(chkr)
    chkr.set_text(text)
    prompt.run()
    corrected = chkr.get_text()
    return corrected
class CmdLineSpellChecker(object):
    """Spell checker whose errors are resolved through a console prompt."""

    def __init__(self, language, pwl=None):
        """Build the underlying checker and bind a prompt to it.

        language: language handed to the underlying checker.
        pwl: optional; when truthy, *language* is first wrapped in
             ``enchant.DictWithPWL(language, pwl)``.
        """
        lang = enchant.DictWithPWL(language, pwl) if pwl else language
        self._checker = _SpellChecker(lang=lang, filters=filters_to_use)
        prompt = CmdLineChecker()
        prompt.set_checker(self._checker)
        self.cmdln = prompt

    def check(self, text):
        """Run the prompt over *text* and return the checker's final text."""
        checker = self._checker
        checker.set_text(text)
        self.cmdln.run()
        return checker.get_text()
def __init__(self, language, pwl=None):
    """Build the underlying checker and bind a command-line prompt to it.

    language: language handed to the underlying checker.
    pwl: optional; when truthy, *language* is first wrapped in
         ``enchant.DictWithPWL(language, pwl)``.
    """
    lang = enchant.DictWithPWL(language, pwl) if pwl else language
    self._checker = _SpellChecker(lang=lang, filters=filters_to_use)
    prompt = CmdLineChecker()
    prompt.set_checker(self._checker)
    self.cmdln = prompt
def jupyterspellchecker():
    """Interactively spell check a notebook file and write the result.

    Reads two positional command-line arguments (input and output paths),
    loads the input as JSON, and runs the source of every markdown, raw
    and heading cell through an interactive ``en_US`` checker that uses
    ``LatexCommandFilter``.  The corrected notebook is dumped as JSON to
    the output path, after which the process exits via ``sys.exit()``.
    """
    parser = argparse.ArgumentParser(
        description=('Spell check a Jupyter/IPython notebook to a LaTeX '
                     'file. Raw cells and markdown cells are spell checked '
                     'in American English. '))
    parser.add_argument('infile',
                        help='path and filename of the input notebook file.')
    parser.add_argument('outfile',
                        help='path and filename of the output file.')
    args = parser.parse_args()

    chkr = enchant.checker.SpellChecker("en_US", filters=[LatexCommandFilter])
    cmdln = CmdLineChecker()
    cmdln.set_checker(chkr)

    def correct(text):
        """Run the interactive prompt over *text* and return the result."""
        chkr.set_text(text)
        cmdln.run()
        return chkr.get_text()

    with open(args.infile, 'r') as f:
        print('Parsing ', args.infile)
        ipynb = json.load(f)

    if 'cells' in ipynb:
        # newer versions of notebook
        cells = ipynb['cells']
    else:
        # older layout: cells nested under worksheets (only the first
        # worksheet is processed)
        cells = ipynb['worksheets'][0]['cells']

    for cell in cells:
        if cell['cell_type'] not in ('markdown', 'raw', 'heading'):
            continue
        source = cell['source']
        if isinstance(source, list):
            for i, line in enumerate(source):
                source[i] = correct(line)
        else:
            # The source may also be stored as one string; item assignment
            # on it would raise TypeError, so check it as a whole.
            cell['source'] = correct(source)

    with open(args.outfile, 'w') as f:
        print('Writing ', args.outfile)
        json.dump(ipynb, f)
    sys.exit()
def jupyterspellchecker():
    """Interactively spell check a notebook file and write the result.

    Reads two positional command-line arguments (input and output paths),
    loads the input as JSON, and runs the source of every markdown, raw
    and heading cell through an interactive ``en_US`` checker that uses
    ``LatexCommandFilter``.  The corrected notebook is dumped as JSON to
    the output path, after which the process exits via ``sys.exit()``.
    """
    parser = argparse.ArgumentParser(
        description=('Spell check a Jupyter/IPython notebook to a LaTeX '
                     'file. Raw cells and markdown cells are spell checked '
                     'in American English. '))
    parser.add_argument('infile',
                        help='path and filename of the input notebook file.')
    parser.add_argument('outfile',
                        help='path and filename of the output file.')
    args = parser.parse_args()

    chkr = enchant.checker.SpellChecker("en_US", filters=[LatexCommandFilter])
    cmdln = CmdLineChecker()
    cmdln.set_checker(chkr)

    def correct(text):
        """Run the interactive prompt over *text* and return the result."""
        chkr.set_text(text)
        cmdln.run()
        return chkr.get_text()

    with open(args.infile, 'r') as f:
        print('Parsing ', args.infile)
        ipynb = json.load(f)

    if 'cells' in ipynb:
        # newer versions of notebook
        cells = ipynb['cells']
    else:
        # older layout: cells nested under worksheets (only the first
        # worksheet is processed)
        cells = ipynb['worksheets'][0]['cells']

    for cell in cells:
        if cell['cell_type'] not in ('markdown', 'raw', 'heading'):
            continue
        source = cell['source']
        if isinstance(source, list):
            for i, line in enumerate(source):
                source[i] = correct(line)
        else:
            # The source may also be stored as one string; item assignment
            # on it would raise TypeError, so check it as a whole.
            cell['source'] = correct(source)

    with open(args.outfile, 'w') as f:
        print('Writing ', args.outfile)
        json.dump(ipynb, f)
    sys.exit()
def __init__(self):
    """Create an ``en_US`` checker, bind a prompt to it, start with no results."""
    spell = enchant.checker.SpellChecker("en_US")
    self.checker = spell
    prompt = CmdLineChecker()
    prompt.set_checker(spell)
    self.cmdline_checker = prompt
    # Filled in later by whatever collects corrected words.
    self.result = []
just in case something gets screwed up. ''' import re import json import sys import enchant import enchant.tokenize import enchant.checker from enchant.checker.CmdLineChecker import CmdLineChecker class LatexCommandFilter(enchant.tokenize.EmailFilter): _pattern = re.compile(r"\\([^a-zA-Z]|[a-zA-Z]+)") chkr = enchant.checker.SpellChecker("en_US", filters=[LatexCommandFilter]) cmdln = CmdLineChecker() cmdln.set_checker(chkr) with open(sys.argv[1], 'r') as f: print 'Parsing ', sys.argv[1] ipynb = json.load(f) for cell in ipynb['worksheets'][0]['cells']: if cell['cell_type'] in ['markdown', 'raw', 'heading']: for i, line in enumerate(cell['source']): chkr.set_text(line) cmdln.run() cell['source'][i] = chkr.get_text() with open(sys.argv[2], 'w') as f:
# Pretty simple command line spellchecker for NWN dialogs...
# Note this requires PyEnchant, and its command line is somewhat
# wonky.  Type 'h' at the command prompt to get options
# for correcting words.

import enchant
import enchant.checker
from enchant.checker.CmdLineChecker import CmdLineChecker
from pynwn.module import Module

if __name__ == '__main__':
    # Using US english dictionary.
    chkr = enchant.checker.SpellChecker('en_US')
    cmdln = CmdLineChecker()
    cmdln.set_checker(chkr)

    mod = Module('test.mod')
    for dlg in mod.glob('*.dlg'):
        print(dlg.resref)
        for n in dlg.entries:
            # Fetch the text once; skip entries with no text at all.
            text = n.get_text(0)
            if not text:
                continue

            # print() as a call so the script also parses on Python 3.
            print(text)
            chkr.set_text(text)
            cmdln.run()
            # Store the (possibly corrected) text back on the entry.
            n.set_text(0, chkr.get_text())