Beispiel #1
0
    def spell_check_split_words(self, indices, doc, language, register=-1):
        """
        Split misspelled words based on spell-checker suggestions.

        Using this is usually not a good idea unless you have an insane
        dictionary that contains all possible compound words in `language`.

        A misspelled word is split only if the checker offers exactly one
        suggestion that contains a space and that equals the word once its
        spaces are removed. Words for which ``word.capitalize() == word``
        are skipped. Runs of spaces in each text are collapsed to a single
        space before checking.

        If `indices` is ``None`` or empty, the indices returned by
        :meth:`get_all_indices` are used. If no text ends up changed,
        return without replacing texts or setting an action description.
        Raise :exc:`enchant.Error` if dictionary instantiation fails.
        """
        new_indices = []
        new_texts = []
        re_multispace = re.compile(r" +")
        checker = self._get_enchant_checker(language)
        indices = indices or self.get_all_indices()
        for index in indices:
            subtitle = self.subtitles[index]
            # Keep the normalized starting text untouched so that it can
            # be compared against the checker's final text below.
            text = re_multispace.sub(" ", subtitle.get_text(doc))
            checker.set_text(text)
            while True:
                try:
                    next(checker)
                except StopIteration:
                    break
                word = checker.word
                if word.capitalize() == word:
                    # Skip capitalized words, which are usually names
                    # and thus not always found in dictionaries.
                    continue
                # Candidates are suggestions that merely insert spaces,
                # not at the very start, into the unsplit word.
                suggestions = [
                    suggestion for suggestion in checker.suggest()
                    if suggestion.find(" ") > 0
                    and suggestion.replace(" ", "") == word
                ]
                # Split only if the choice is unambiguous.
                if len(suggestions) != 1:
                    continue
                current = checker.get_text()
                a = checker.wordpos
                z = a + len(word)
                # Feed the edited text back to the checker, which keeps
                # being advanced with next() until it is exhausted.
                checker.set_text(current[:a] + suggestions[0] + current[z:])
            new_text = checker.get_text()
            if new_text != text:
                new_indices.append(index)
                new_texts.append(new_text)
        if not new_indices:
            return
        self.replace_texts(new_indices, doc, new_texts, register=register)
        description = _("Splitting words by spell-check suggestions")
        self.set_action_description(register, description)
Beispiel #2
0
    def spell_check_split_words(self, indices, doc, language, register=-1):
        """
        Split misspelled words based on spell-checker suggestions.

        Using this is usually not a good idea unless you have an insane
        dictionary that contains all possible compound words in `language`.

        For every misspelled word the checker reports, the suggestions are
        filtered down to those that contain a space and that are equal to
        the word when the spaces are removed; the word is replaced only if
        exactly one such suggestion remains. Words for which
        ``word.capitalize() == word`` are never split. Consecutive spaces
        in a text are squeezed into one before it is checked.

        A ``None`` or empty `indices` means the indices returned by
        :meth:`get_all_indices`. Texts are replaced and the action
        description set only if at least one text was changed.
        Raise :exc:`enchant.Error` if dictionary instantiation fails.
        """
        re_multispace = re.compile(r" +")
        checker = self._get_enchant_checker(language)
        new_indices = []
        new_texts = []
        for index in indices or self.get_all_indices():
            # The squeezed text is the baseline for the change check below,
            # so it must not be rebound while splitting words.
            baseline = re_multispace.sub(" ", self.subtitles[index].get_text(doc))
            checker.set_text(baseline)
            while True:
                try:
                    next(checker)
                except StopIteration:
                    break
                word = checker.word
                if word.capitalize() == word:
                    # Skip capitalized words, which are usually names
                    # and thus not always found in dictionaries.
                    continue
                splits = [
                    candidate for candidate in checker.suggest()
                    if candidate.find(" ") > 0
                    and candidate.replace(" ", "") == word
                ]
                # Split word only if exactly one suggestion has all the
                # same characters as the original unsplit word.
                if len(splits) != 1:
                    continue
                before = checker.get_text()
                start = checker.wordpos
                end = start + len(word)
                # Hand the edited text back to the checker; the loop keeps
                # calling next() on it until StopIteration.
                checker.set_text(before[:start] + splits[0] + before[end:])
            new_text = checker.get_text()
            if new_text != baseline:
                new_indices.append(index)
                new_texts.append(new_text)
        if not new_indices:
            return
        self.replace_texts(new_indices, doc, new_texts, register=register)
        description = _("Splitting words by spell-check suggestions")
        self.set_action_description(register, description)
Beispiel #3
0
def spell_check(pkg, str, fmt, lang, ignored):
    """
    Report spelling errors in `str` through printWarning.

    If enchant is available and a checker can be created for `lang`, each
    misspelled word that survives the skip rules below is reported once,
    together with up to three suggestions. Otherwise the lowercased words
    of `str` are looked up in BAD_WORDS and every hit is reported once
    with its correction.

    pkg     -- package passed on to printInfo/printWarning; its name is
               used to skip words that relate to the package name
    str     -- text to check (the name shadows the builtin, but is kept
               as it is part of the function's interface)
    fmt     -- format string, reported as ``fmt % lang``
    lang    -- language of the text; with enchant, 'C' is checked as 'en_US'
    ignored -- container of words that are never reported
    """

    dict_found = True
    warned = set()
    if enchant:
        if lang == 'C':
            lang = 'en_US'

        checker = _enchant_checkers.get(lang)
        if not checker and lang not in _enchant_checkers:
            try:
                checker = enchant.checker.SpellChecker(
                    lang, filters=[enchant.tokenize.EmailFilter,
                                   enchant.tokenize.URLFilter,
                                   enchant.tokenize.WikiWordFilter])
            except enchant.DictNotFoundError:
                printInfo(pkg, 'enchant-dictionary-not-found', lang)
            # Cache the outcome even if it is None, so that creating a
            # checker for a missing dictionary is attempted only once.
            _enchant_checkers[lang] = checker

        if checker:
            # squeeze whitespace to ease leading context check
            checker.set_text(re.sub(r'\s+', ' ', str))
            if use_utf8:
                uppername = Pkg.to_unicode(pkg.header[rpm.RPMTAG_NAME]).upper()
            else:
                uppername = pkg.name.upper()
            upperparts = uppername.split('-')
            if lang.startswith('en'):
                # Also accept the possessive form of each name component
                ups = [x + "'S" for x in upperparts]
                upperparts.extend(ups)
            for err in checker:

                # Skip already warned and ignored words
                if err.word in warned or err.word in ignored:
                    continue

                # Skip all capitalized words that do not start a sentence
                if err.word[0].isupper() and not \
                        sentence_break_regex.search(checker.leading_context(3)):
                    continue

                upperword = err.word.upper()

                # Skip all uppercase words
                if err.word == upperword:
                    continue

                # Skip errors containing package name or equal to a
                # "component" of it, case insensitively
                if uppername in upperword or upperword in upperparts:
                    continue

                # Work around enchant's digit tokenizing behavior:
                # http://github.com/rfk/pyenchant/issues/issue/3
                if checker.leading_context(1).isdigit() or \
                        checker.trailing_context(1).isdigit():
                    continue

                # Warn and suggest
                sug = ', '.join(checker.suggest()[:3])
                if sug:
                    sug = '-> %s' % sug
                printWarning(pkg, 'spelling-error', fmt % lang, err.word, sug)
                warned.add(err.word)

        else:
            dict_found = False

    if not enchant or not dict_found:
        # Fallback: look up each run of a-z letters in BAD_WORDS. The runs
        # contain nothing but a-z, so there are no quotes to strip and no
        # empty words to skip.
        for word in re.findall(r'[a-z]+', str.lower()):
            correct = BAD_WORDS.get(word)
            if not correct:
                continue
            if word in warned or word in ignored:
                continue
            printWarning(pkg, 'spelling-error', fmt % lang, word, '->',
                         correct)
            warned.add(word)
Beispiel #4
0
def spell_check(pkg, str, fmt, lang, ignored):
    """
    Check the spelling of `str` and report errors through printWarning.

    With enchant available and a checker obtainable for `lang`, every
    misspelled word not excluded by the skip rules in the loop below is
    reported once, followed by at most three suggestions. Without enchant,
    or without a checker for `lang`, the lowercased words of `str` are
    looked up in BAD_WORDS instead and each hit is reported once along
    with its correction.

    pkg     -- package handed to printInfo/printWarning; words containing
               its name or equal to a '-' separated part of it are skipped
    str     -- text to check (shadows the builtin; the name is part of
               the function's interface and therefore kept)
    fmt     -- format string, reported as ``fmt % lang``
    lang    -- language of the text; with enchant, 'C' is checked as 'en_US'
    ignored -- container of words that must not be reported
    """

    dict_found = True
    warned = set()
    if enchant:
        if lang == 'C':
            lang = 'en_US'

        checker = _enchant_checkers.get(lang)
        if not checker and lang not in _enchant_checkers:
            try:
                checker = enchant.checker.SpellChecker(
                    lang, filters=[enchant.tokenize.EmailFilter,
                                   enchant.tokenize.URLFilter,
                                   enchant.tokenize.WikiWordFilter])
            except enchant.DictNotFoundError:
                printInfo(pkg, 'enchant-dictionary-not-found', lang)
            # Remember the result, None included, so that a missing
            # dictionary is looked for and reported only once per language.
            _enchant_checkers[lang] = checker

        if checker:
            # squeeze whitespace to ease leading context check
            checker.set_text(re.sub(r'\s+', ' ', str))
            if use_utf8:
                uppername = Pkg.to_unicode(pkg.header[rpm.RPMTAG_NAME]).upper()
            else:
                uppername = pkg.name.upper()
            upperparts = uppername.split('-')
            if lang.startswith('en'):
                # Possessive forms of the name components are fine too
                ups = [x + "'S" for x in upperparts]
                upperparts.extend(ups)
            for err in checker:

                # Skip already warned and ignored words
                if err.word in warned or err.word in ignored:
                    continue

                # Skip all capitalized words that do not start a sentence
                if err.word[0].isupper() and not \
                        sentence_break_regex.search(checker.leading_context(3)):
                    continue

                upperword = err.word.upper()

                # Skip all uppercase words
                if err.word == upperword:
                    continue

                # Skip errors containing package name or equal to a
                # "component" of it, case insensitively
                if uppername in upperword or upperword in upperparts:
                    continue

                # Work around enchant's digit tokenizing behavior:
                # http://github.com/rfk/pyenchant/issues/issue/3
                if checker.leading_context(1).isdigit() or \
                        checker.trailing_context(1).isdigit():
                    continue

                # Warn and suggest
                sug = ', '.join(checker.suggest()[:3])
                if sug:
                    sug = '-> %s' % sug
                printWarning(pkg, 'spelling-error', fmt % lang, err.word, sug)
                warned.add(err.word)

        else:
            dict_found = False

    if not enchant or not dict_found:
        # Fallback without a usable enchant dictionary. Each word is a
        # maximal run of a-z letters, hence never empty and never quoted,
        # which makes any further cleanup of the word unnecessary.
        for word in re.findall(r'[a-z]+', str.lower()):
            correct = BAD_WORDS.get(word)
            if not correct:
                continue
            if word in warned or word in ignored:
                continue
            printWarning(pkg, 'spelling-error', fmt % lang, word, '->',
                         correct)
            warned.add(word)