Example #1
0
def initialize():
    """Interactively spell-check one column of a CSV file and save the result.

    Prompts for the CSV file to read, the column (field) to check and the
    name of the output file.  Every value of the chosen column is run
    through an ``en_US`` enchant spell checker; for each misspelled word the
    user may ignore it (``i``), type a replacement (``e``) or pick one of the
    printed suggestions by its 0-based index.  The corrected texts are then
    handed to ``write_fixed_file`` together with the output file name.

    An answer that is neither ``i``/``e`` nor a valid suggestion index no
    longer aborts the whole session: the prompt is simply repeated.
    """
    chk_file = raw_input("What is the file to spellcheck?    ")
    field = raw_input("What FIELD do you want to spellcheck?   ")
    s_file = raw_input("What is name of final file?    ")

    checker = enchant.checker.SpellChecker("en_US")

    file_data = pd.read_csv(chk_file)

    # Render every value of the requested column as text, row by row.
    fields = list(file_data.apply(lambda x: '%s' % (x[field]), axis=1))

    corrected_text = []
    for data_field in fields:
        checker.set_text(str(data_field))
        for err in checker:
            # Ask for suggestions once so the list shown to the user is the
            # same list that the chosen index is applied to.
            suggestions = err.suggest()
            # Single-argument print calls behave the same with and without
            # the print statement.
            print(err.word)
            print(suggestions)
            while True:
                answer = raw_input("provide 0-index int of correct word or i to ignore, e to edit ")
                if answer == 'i':
                    break
                if answer == 'e':
                    err.replace(raw_input(""))
                    break
                try:
                    replacement = suggestions[int(answer)]
                except (ValueError, IndexError):
                    # Not a number, or no suggestion at that index: ask
                    # again instead of losing all corrections made so far.
                    continue
                err.replace(replacement)
                break
        corrected_text.append(checker.get_text())

    write_fixed_file(corrected_text, s_file)
def do_check(checker,to_check):
    """Run an interactive command-line spell check over each text in place.

    ``checker`` is loaded with every element of ``to_check`` in turn, a
    fresh ``CmdLineChecker`` is run on it, and the element is replaced by
    the text the checker holds afterwards.

    The element is written back by position.  Looking the slot up with
    ``to_check.index(text)`` would pick the first equal element, which is
    the wrong one whenever an earlier, already corrected entry equals the
    current input, and costs a linear scan per element.
    """
    for position, text in enumerate(to_check):
        checker.set_text(text)
        cmdline_checker = CmdLineChecker()
        cmdline_checker.set_checker(checker)
        cmdline_checker.run()
        to_check[position] = checker.get_text()
Example #3
0
    def spellcheck(self, text, tld=''):
        """Detect the language of ``text`` and collect its misspelled words.

        The language is guessed with ``cld.detect`` (``tld`` is passed as the
        top-level-domain hint).  Misspelled words shorter than 128 characters
        are stored as ``BadWord`` rows (stripped and lower-cased) and then
        passed through ``BadWord.filter_bad_words``.

        Returns ``(lang_name, set_of_errors)``.  Returns ``(None, set())``
        when the language is unknown or not reliably detected, or when
        enchant has no dictionary for the detected language code.
        """
        from scanner.models import BadWord

        # Guess the language of the text.
        self.log.debug('    * guessing language...')
        detection = cld.detect(text.encode('utf-8'), hintTopLevelDomain=tld)
        lang_name, lang_code, reliable, bytes_found, details = detection
        self.log.debug(
            '    -> detected lang: %s (%s)' % (lang_name, lang_code))

        if (lang_code.upper() == 'UNKNOWN'
                or lang_name.upper() == 'UNKNOWN'
                or not reliable):
            self.log.warning(
                '    -> Cannot detect language of page - end : %s' % details)
            return None, set()

        self.log.debug('    * searching for dictionary')
        try:
            checker = enchant.checker.SpellChecker(
                lang_code, filters=[EmailFilter, URLFilter])
        except enchant.DictNotFoundError:
            # Languages listed in not_supported_lang are only logged at
            # debug level; any other missing dictionary is an error.
            if lang_code in self.not_supported_lang:
                report = self.log.debug
                template = "    -> Cannot find language for spellchecker for %s - end (blacklisted)"
            else:
                report = self.log.error
                template = "    -> Cannot find language for spellchecker for %s - end"
            report(template % lang_code)
            return None, set()

        # Feed the page text to the checker.
        self.log.debug('    * check spelling...')
        checker.set_text(text)
        self.log.debug('    -> ok')

        self.log.debug('    * get errors...')
        misspelled = []
        for err in checker:
            if len(err.word) < 128:
                misspelled.append(err.word)
        self.log.debug('    -> ok')

        self.log.debug('      * found %d bad words and adding them to DB' %
                       len(misspelled))
        records = [BadWord(word=word.strip().lower()) for word in misspelled]
        BadWord.objects.bulk_create(records)
        self.log.debug('      -> ok')

        self.log.debug('     * call filtering bad words')
        remaining = BadWord.filter_bad_words(misspelled)
        self.log.debug('      -> ok')

        self.log.debug('     * after filtering out there is %d errors (%s)' %
                       (len(remaining), remaining))

        return lang_name, set(remaining)
Example #4
0
    def spell_check_split_words(self, indices, doc, language, register=-1):
        """
        Split misspelled words based on spell-checker suggestions.

        A misspelled word is replaced only when exactly one suggestion
        contains a space and equals the word once its spaces are removed.
        Words equal to their own capitalized form are skipped.

        Using this is usually not a good idea unless you have an insane
        dictionary that contains all possible compound words in `language`.
        Raise :exc:`enchant.Error` if dictionary instantiation fails.
        """
        changed_indices = []
        changed_texts = []
        squeeze_spaces = re.compile(r" +")
        checker = self._get_enchant_checker(language)
        indices = indices or self.get_all_indices()
        for index in indices:
            text = self.subtitles[index].get_text(doc)
            text = squeeze_spaces.sub(" ", text)
            checker.set_text(text)
            while True:
                try:
                    next(checker)
                except StopIteration:
                    break
                word = checker.word
                if word == word.capitalize():
                    # Capitalized words are usually names, which
                    # dictionaries often lack; leave them alone.
                    continue
                splits = [
                    candidate for candidate in checker.suggest()
                    if candidate.find(" ") > 0
                    and candidate.replace(" ", "") == word
                ]
                # Only an unambiguous split is applied: exactly one
                # suggestion made of the original characters plus spaces.
                if len(splits) != 1:
                    continue
                text = checker.get_text()
                start = checker.wordpos
                end = start + len(word)
                checker.set_text(text[:start] + splits[0] + text[end:])
            result = checker.get_text()
            if result != text:
                changed_indices.append(index)
                changed_texts.append(result)
        if not changed_indices:
            return
        self.replace_texts(
            changed_indices, doc, changed_texts, register=register)
        description = _("Splitting words by spell-check suggestions")
        self.set_action_description(register, description)
Example #5
0
    def spell_check_split_words(self, indices, doc, language, register=-1):
        """
        Split misspelled words based on spell-checker suggestions.

        A misspelled word is replaced only when exactly one suggestion
        contains a space and equals the word once its spaces are removed.
        Words equal to their own capitalized form are skipped.

        Using this is usually not a good idea unless you have an insane
        dictionary that contains all possible compound words in `language`.
        Raise :exc:`enchant.Error` if dictionary instantiation fails.
        """
        changed_indices = []
        changed_texts = []
        squeeze_spaces = re.compile(r" +")
        checker = self._get_enchant_checker(language)
        indices = indices or self.get_all_indices()
        for index in indices:
            text = self.subtitles[index].get_text(doc)
            text = squeeze_spaces.sub(" ", text)
            checker.set_text(text)
            while True:
                try:
                    next(checker)
                except StopIteration:
                    break
                word = checker.word
                if word == word.capitalize():
                    # Capitalized words are usually names, which
                    # dictionaries often lack; leave them alone.
                    continue
                splits = [
                    candidate for candidate in checker.suggest()
                    if candidate.find(" ") > 0
                    and candidate.replace(" ", "") == word
                ]
                # Only an unambiguous split is applied: exactly one
                # suggestion made of the original characters plus spaces.
                if len(splits) != 1:
                    continue
                text = checker.get_text()
                start = checker.wordpos
                end = start + len(word)
                checker.set_text(text[:start] + splits[0] + text[end:])
            result = checker.get_text()
            if result != text:
                changed_indices.append(index)
                changed_texts.append(result)
        if not changed_indices:
            return
        self.replace_texts(
            changed_indices, doc, changed_texts, register=register)
        description = _("Splitting words by spell-check suggestions")
        self.set_action_description(register, description)
Example #6
0
    def spell_check_join_words(self, indices, doc, language, register=-1):
        """
        Join misspelled words based on spell-checker suggestions.

        For every misspelled word that has a space directly before or after
        it, try removing that space and re-check the result with a second
        checker.  The space is removed only when exactly one of the two
        directions produces an acceptable result.  Subtitles whose text
        ends up different from its whitespace-normalized starting text are
        updated via :meth:`replace_texts`.

        `indices` may be empty/``None`` to process all subtitles.
        Raise :exc:`enchant.Error` if dictionary instantiation fails.
        """
        new_indices = []
        new_texts = []
        re_multispace = re.compile(r" +")
        checker = self._get_enchant_checker(language)
        seeker = self._get_enchant_checker(language)
        for index in indices or self.get_all_indices():
            subtitle = self.subtitles[index]
            # Keep the normalized starting text for change detection.
            # Comparing against a variable that is refreshed on every
            # misspelling loses joins whenever another misspelled word is
            # visited after the last join.
            start_text = re_multispace.sub(" ", subtitle.get_text(doc))
            checker.set_text(start_text)
            while True:
                try:
                    next(checker)
                except StopIteration:
                    break
                text = checker.get_text()
                a = checker.wordpos
                z = a + len(checker.word)
                ok_with_prev = ok_with_next = False
                if checker.leading_context(1) == " ":
                    # Trial: drop the space before the word.
                    seeker.set_text(text[:a - 1] + text[a:])
                    # NOTE(review): presumably the positions the seeker
                    # reports as misspelled -- confirm against the helper.
                    poss = self._get_misspelled_indices(seeker)
                    ok_with_prev = (a - 1) not in poss
                if checker.trailing_context(1) == " ":
                    # Trial: drop the space after the word.
                    seeker.set_text(text[:z] + text[z + 1:])
                    poss = self._get_misspelled_indices(seeker)
                    ok_with_next = a not in poss
                # Join backwards or forwards if only one direction,
                # but not both, produce a correctly spelled result.
                if ok_with_prev and not ok_with_next:
                    checker.set_text(text[:a - 1] + text[a:])
                if ok_with_next and not ok_with_prev:
                    checker.set_text(text[:z] + text[z + 1:])
            new_text = checker.get_text()
            if new_text != start_text:
                new_indices.append(index)
                new_texts.append(new_text)
        if not new_indices:
            return
        self.replace_texts(new_indices, doc, new_texts, register=register)
        description = _("Joining words by spell-check suggestions")
        self.set_action_description(register, description)
Example #7
0
    def spell_check_join_words(self, indices, doc, language, register=-1):
        """
        Join misspelled words based on spell-checker suggestions.

        For every misspelled word that has a space directly before or after
        it, try removing that space and re-check the result with a second
        checker.  The space is removed only when exactly one of the two
        directions produces an acceptable result.  Subtitles whose text
        ends up different from its whitespace-normalized starting text are
        updated via :meth:`replace_texts`.

        `indices` may be empty/``None`` to process all subtitles.
        Raise :exc:`enchant.Error` if dictionary instantiation fails.
        """
        new_indices = []
        new_texts = []
        re_multispace = re.compile(r" +")
        checker = self._get_enchant_checker(language)
        seeker = self._get_enchant_checker(language)
        for index in indices or self.get_all_indices():
            subtitle = self.subtitles[index]
            # Keep the normalized starting text for change detection.
            # Comparing against a variable that is refreshed on every
            # misspelling loses joins whenever another misspelled word is
            # visited after the last join.
            start_text = re_multispace.sub(" ", subtitle.get_text(doc))
            checker.set_text(start_text)
            while True:
                try:
                    next(checker)
                except StopIteration:
                    break
                text = checker.get_text()
                a = checker.wordpos
                z = a + len(checker.word)
                ok_with_prev = ok_with_next = False
                if checker.leading_context(1) == " ":
                    # Trial: drop the space before the word.
                    seeker.set_text(text[:a - 1] + text[a:])
                    # NOTE(review): presumably the positions the seeker
                    # reports as misspelled -- confirm against the helper.
                    poss = self._get_misspelled_indices(seeker)
                    ok_with_prev = (a - 1) not in poss
                if checker.trailing_context(1) == " ":
                    # Trial: drop the space after the word.
                    seeker.set_text(text[:z] + text[z + 1:])
                    poss = self._get_misspelled_indices(seeker)
                    ok_with_next = a not in poss
                # Join backwards or forwards if only one direction,
                # but not both, produce a correctly spelled result.
                if ok_with_prev and not ok_with_next:
                    checker.set_text(text[:a - 1] + text[a:])
                if ok_with_next and not ok_with_prev:
                    checker.set_text(text[:z] + text[z + 1:])
            new_text = checker.get_text()
            if new_text != start_text:
                new_indices.append(index)
                new_texts.append(new_text)
        if not new_indices:
            return
        self.replace_texts(new_indices, doc, new_texts, register=register)
        description = _("Joining words by spell-check suggestions")
        self.set_action_description(register, description)
Example #8
0
def spell_check(pkg, str, fmt, lang, ignored):
    """Warn about misspelled words found in the text ``str`` of ``pkg``.

    When enchant is available, a per-language ``SpellChecker`` (cached in
    ``_enchant_checkers``, including failed lookups) checks the text; the
    language ``'C'`` is treated as ``'en_US'``.  Each distinct offending
    word is reported once through ``printWarning`` with up to three
    suggestions, unless it is listed in ``ignored`` or is filtered out by
    one of the skip rules below.

    Without enchant, or without a dictionary for ``lang``, the text is
    split into lower-case ``a-z`` runs that are looked up in ``BAD_WORDS``.
    """
    seen = set()
    have_dict = True

    if enchant:
        if lang == 'C':
            lang = 'en_US'

        checker = _enchant_checkers.get(lang)
        if not checker and lang not in _enchant_checkers:
            word_filters = [enchant.tokenize.EmailFilter,
                            enchant.tokenize.URLFilter,
                            enchant.tokenize.WikiWordFilter]
            try:
                checker = enchant.checker.SpellChecker(
                    lang, filters=word_filters)
            except enchant.DictNotFoundError:
                printInfo(pkg, 'enchant-dictionary-not-found', lang)
            # A failed lookup is cached too, so it is not retried.
            _enchant_checkers[lang] = checker

        if not checker:
            have_dict = False
        else:
            # squeeze whitespace to ease leading context check
            checker.set_text(re.sub(r'\s+', ' ', str))
            if use_utf8:
                uppername = Pkg.to_unicode(pkg.header[rpm.RPMTAG_NAME]).upper()
            else:
                uppername = pkg.name.upper()
            upperparts = uppername.split('-')
            if lang.startswith('en'):
                # Also accept the English possessive of each name part.
                upperparts.extend([part + "'S" for part in upperparts])

            for err in checker:
                word = err.word

                # Already reported, or explicitly ignored.
                if word in seen or word in ignored:
                    continue

                # Capitalized word that does not start a sentence.
                if word[0].isupper() and not \
                        sentence_break_regex.search(checker.leading_context(3)):
                    continue

                shouted = word.upper()

                # Entirely upper-case word.
                if word == shouted:
                    continue

                # Contains the package name or equals one of its
                # "components", case insensitively.
                if uppername in shouted or shouted in upperparts:
                    continue

                # Work around enchant's digit tokenizing behavior:
                # http://github.com/rfk/pyenchant/issues/issue/3
                if checker.leading_context(1).isdigit() or \
                        checker.trailing_context(1).isdigit():
                    continue

                # Warn and suggest
                hints = ', '.join(checker.suggest()[:3])
                if hints:
                    hints = '-> %s' % hints
                printWarning(pkg, 'spelling-error', fmt % lang, word, hints)
                seen.add(word)

    if not (enchant and have_dict):
        for chunk in str.split():
            for word in re.split(r'[^a-z]+', chunk.lower()):
                if not word:
                    continue
                correct = BAD_WORDS.get(word)
                if not correct:
                    continue
                if word[0] == '\'':
                    word = word[1:]
                if word[-1] == '\'':
                    word = word[:-1]
                if word in seen or word in ignored:
                    continue
                printWarning(pkg, 'spelling-error', fmt % lang, word, '->',
                             correct)
                seen.add(word)
Example #9
0
def spell_check(pkg, str, fmt, lang, ignored):
    """Warn about misspelled words found in the text ``str`` of ``pkg``.

    When enchant is available, a per-language ``SpellChecker`` (cached in
    ``_enchant_checkers``, including failed lookups) checks the text; the
    language ``'C'`` is treated as ``'en_US'``.  Each distinct offending
    word is reported once through ``printWarning`` with up to three
    suggestions, unless it is listed in ``ignored`` or is filtered out by
    one of the skip rules below.

    Without enchant, or without a dictionary for ``lang``, the text is
    split into lower-case ``a-z`` runs that are looked up in ``BAD_WORDS``.
    """
    seen = set()
    have_dict = True

    if enchant:
        if lang == 'C':
            lang = 'en_US'

        checker = _enchant_checkers.get(lang)
        if not checker and lang not in _enchant_checkers:
            word_filters = [enchant.tokenize.EmailFilter,
                            enchant.tokenize.URLFilter,
                            enchant.tokenize.WikiWordFilter]
            try:
                checker = enchant.checker.SpellChecker(
                    lang, filters=word_filters)
            except enchant.DictNotFoundError:
                printInfo(pkg, 'enchant-dictionary-not-found', lang)
            # A failed lookup is cached too, so it is not retried.
            _enchant_checkers[lang] = checker

        if not checker:
            have_dict = False
        else:
            # squeeze whitespace to ease leading context check
            checker.set_text(re.sub(r'\s+', ' ', str))
            if use_utf8:
                uppername = Pkg.to_unicode(pkg.header[rpm.RPMTAG_NAME]).upper()
            else:
                uppername = pkg.name.upper()
            upperparts = uppername.split('-')
            if lang.startswith('en'):
                # Also accept the English possessive of each name part.
                upperparts.extend([part + "'S" for part in upperparts])

            for err in checker:
                word = err.word

                # Already reported, or explicitly ignored.
                if word in seen or word in ignored:
                    continue

                # Capitalized word that does not start a sentence.
                if word[0].isupper() and not \
                        sentence_break_regex.search(checker.leading_context(3)):
                    continue

                shouted = word.upper()

                # Entirely upper-case word.
                if word == shouted:
                    continue

                # Contains the package name or equals one of its
                # "components", case insensitively.
                if uppername in shouted or shouted in upperparts:
                    continue

                # Work around enchant's digit tokenizing behavior:
                # http://github.com/rfk/pyenchant/issues/issue/3
                if checker.leading_context(1).isdigit() or \
                        checker.trailing_context(1).isdigit():
                    continue

                # Warn and suggest
                hints = ', '.join(checker.suggest()[:3])
                if hints:
                    hints = '-> %s' % hints
                printWarning(pkg, 'spelling-error', fmt % lang, word, hints)
                seen.add(word)

    if not (enchant and have_dict):
        for chunk in str.split():
            for word in re.split('[^a-z]+', chunk.lower()):
                if not word:
                    continue
                correct = BAD_WORDS.get(word)
                if not correct:
                    continue
                if word[0] == '\'':
                    word = word[1:]
                if word[-1] == '\'':
                    word = word[:-1]
                if word in seen or word in ignored:
                    continue
                printWarning(pkg, 'spelling-error', fmt % lang, word, '->',
                             correct)
                seen.add(word)
Example #10
0
def check_lang(t):
    """Guess the language of ``t`` and list the words flagged as misspelled.

    The language code comes from ``guess_language.guessLanguageInfo``; an
    enchant ``SpellChecker`` for that code (with e-mail and URL filters)
    then checks the text.  Returns ``(code, words)`` where ``words`` holds
    the flagged words in the order the checker reports them.
    """
    code, _num, _name = guess_language.guessLanguageInfo(t)
    word_filters = [EmailFilter, URLFilter]
    checker = enchant.checker.SpellChecker(code, filters=word_filters)
    checker.set_text(t)
    misspelled = []
    for err in checker:
        misspelled.append(err.word)
    return code, misspelled