def spell_check_join_words(self, indices, doc, language, register=-1): """ Join misspelled words based on spell-checker suggestions. Raise :exc:`enchant.Error` if dictionary instatiation fails. """ new_indices = [] new_texts = [] re_multispace = re.compile(r" +") checker = self._get_enchant_checker(language) seeker = self._get_enchant_checker(language) for index in indices or self.get_all_indices(): subtitle = self.subtitles[index] text = subtitle.get_text(doc) text = re_multispace.sub(" ", text) checker.set_text(text) while True: try: next(checker) except StopIteration: break text = checker.get_text() a = checker.wordpos z = checker.wordpos + len(checker.word) ok_with_prev = ok_with_next = False if checker.leading_context(1) == " ": seeker.set_text(text[:a - 1] + text[a:]) poss = self._get_misspelled_indices(seeker) ok_with_prev = not (a - 1) in poss if checker.trailing_context(1) == " ": seeker.set_text(text[:z] + text[z + 1:]) poss = self._get_misspelled_indices(seeker) ok_with_next = not a in poss # Join backwards or forwards if only one direction, # but not both, produce a correctly spelled result. if ok_with_prev and not ok_with_next: checker.set_text(text[:a - 1] + text[a:]) if ok_with_next and not ok_with_prev: checker.set_text(text[:z] + text[z + 1:]) new_text = checker.get_text() if new_text != text: new_indices.append(index) new_texts.append(new_text) if not new_indices: return self.replace_texts(new_indices, doc, new_texts, register=register) description = _("Joining words by spell-check suggestions") self.set_action_description(register, description)
def spell_check(pkg, str, fmt, lang, ignored): dict_found = True warned = set() if enchant: if lang == 'C': lang = 'en_US' checker = _enchant_checkers.get(lang) if not checker and lang not in _enchant_checkers: try: checker = enchant.checker.SpellChecker( lang, filters=[enchant.tokenize.EmailFilter, enchant.tokenize.URLFilter, enchant.tokenize.WikiWordFilter]) except enchant.DictNotFoundError: printInfo(pkg, 'enchant-dictionary-not-found', lang) pass _enchant_checkers[lang] = checker if checker: # squeeze whitespace to ease leading context check checker.set_text(re.sub(r'\s+', ' ', str)) if use_utf8: uppername = Pkg.to_unicode(pkg.header[rpm.RPMTAG_NAME]).upper() else: uppername = pkg.name.upper() upperparts = uppername.split('-') if lang.startswith('en'): ups = [x + "'S" for x in upperparts] upperparts.extend(ups) for err in checker: # Skip already warned and ignored words if err.word in warned or err.word in ignored: continue # Skip all capitalized words that do not start a sentence if err.word[0].isupper() and not \ sentence_break_regex.search(checker.leading_context(3)): continue upperword = err.word.upper() # Skip all uppercase words if err.word == upperword: continue # Skip errors containing package name or equal to a # "component" of it, case insensitively if uppername in upperword or upperword in upperparts: continue # Work around enchant's digit tokenizing behavior: # http://github.com/rfk/pyenchant/issues/issue/3 if checker.leading_context(1).isdigit() or \ checker.trailing_context(1).isdigit(): continue # Warn and suggest sug = ', '.join(checker.suggest()[:3]) if sug: sug = '-> %s' % sug printWarning(pkg, 'spelling-error', fmt % lang, err.word, sug) warned.add(err.word) else: dict_found = False if not enchant or not dict_found: for seq in str.split(): for word in re.split(r'[^a-z]+', seq.lower()): if len(word) == 0: continue correct = BAD_WORDS.get(word) if not correct: continue if word[0] == '\'': word = word[1:] if word[-1] == '\'': word = word[:-1] if word in warned or word in ignored: continue printWarning(pkg, 'spelling-error', fmt % lang, word, '->', correct) warned.add(word)
def spell_check(pkg, str, fmt, lang, ignored): dict_found = True warned = set() if enchant: if lang == 'C': lang = 'en_US' checker = _enchant_checkers.get(lang) if not checker and lang not in _enchant_checkers: try: checker = enchant.checker.SpellChecker( lang, filters=[enchant.tokenize.EmailFilter, enchant.tokenize.URLFilter, enchant.tokenize.WikiWordFilter]) except enchant.DictNotFoundError: printInfo(pkg, 'enchant-dictionary-not-found', lang) pass _enchant_checkers[lang] = checker if checker: # squeeze whitespace to ease leading context check checker.set_text(re.sub(r'\s+', ' ', str)) if use_utf8: uppername = Pkg.to_unicode(pkg.header[rpm.RPMTAG_NAME]).upper() else: uppername = pkg.name.upper() upperparts = uppername.split('-') if lang.startswith('en'): ups = [x + "'S" for x in upperparts] upperparts.extend(ups) for err in checker: # Skip already warned and ignored words if err.word in warned or err.word in ignored: continue # Skip all capitalized words that do not start a sentence if err.word[0].isupper() and not \ sentence_break_regex.search(checker.leading_context(3)): continue upperword = err.word.upper() # Skip all uppercase words if err.word == upperword: continue # Skip errors containing package name or equal to a # "component" of it, case insensitively if uppername in upperword or upperword in upperparts: continue # Work around enchant's digit tokenizing behavior: # http://github.com/rfk/pyenchant/issues/issue/3 if checker.leading_context(1).isdigit() or \ checker.trailing_context(1).isdigit(): continue # Warn and suggest sug = ', '.join(checker.suggest()[:3]) if sug: sug = '-> %s' % sug printWarning(pkg, 'spelling-error', fmt % lang, err.word, sug) warned.add(err.word) else: dict_found = False if not enchant or not dict_found: for seq in str.split(): for word in re.split('[^a-z]+', seq.lower()): if len(word) == 0: continue correct = BAD_WORDS.get(word) if not correct: continue if word[0] == '\'': word = word[1:] if word[-1] == '\'': word = word[:-1] if word in warned or word in ignored: continue printWarning(pkg, 'spelling-error', fmt % lang, word, '->', correct) warned.add(word)