Ejemplo n.º 1
0
def aspell_clean():
    """Spell-check the cleaned texts and list the flagged words for review.

    Every ``.txt`` file in ``text/clean`` is passed to the
    ``check_document`` method of an ``AspellSpellChecker`` built from the
    value returned by ``get_lang()``.  Each item it yields is tallied, and
    the tallies are written to ``working/maybe_ok.txt`` as ``word (count)``
    lines: first the words starting with an ASCII capital letter, then all
    the others, each group ordered most common first.
    """
    capitalized = re.compile('^[A-Z]')
    checker = spell_checker.AspellSpellChecker(get_lang())

    tally = Counter()
    for name in os.listdir('text/clean'):
        if not name.endswith('.txt'):
            continue
        for flagged in checker.check_document('text/clean/{}'.format(name)):
            tally[flagged] += 1

    with codecs.open('working/maybe_ok.txt', mode='wb',
                     encoding='utf-8') as out:
        # Two passes over the same ranking: capitalised words first, then
        # everything else.
        for want_capital in (True, False):
            for flagged, seen in tally.most_common():
                if bool(capitalized.match(flagged)) == want_capital:
                    line = u'{} ({})\n'.format(
                        spell_checker._decode(flagged), seen)
                    out.write(line)
Ejemplo n.º 2
0
def aspell_clean():
    """Collect the words the spell checker rejects in ``text/clean``.

    An ``AspellSpellChecker`` is created from the result of ``get_lang()``
    and run over each ``.txt`` file in ``text/clean``.  The number of times
    each rejected word is reported is counted, and the result is written to
    ``working/maybe_ok.txt`` (UTF-8), one ``word (count)`` entry per line.
    Words whose first character is in ``A-Z`` are listed before the rest;
    within each group the most frequently reported words come first.
    """
    cap_re = re.compile('^[A-Z]')
    lang = get_lang()
    checker = spell_checker.AspellSpellChecker(lang)

    def is_capitalized(word):
        return cap_re.match(word) is not None

    counts = Counter()
    txt_files = [fn for fn in os.listdir('text/clean') if fn.endswith('.txt')]
    for fn in txt_files:
        path = 'text/clean/{}'.format(fn)
        for bad_word in checker.check_document(path):
            counts[bad_word] += 1

    with codecs.open('working/maybe_ok.txt', mode='wb', encoding='utf-8') as f:
        ranked = counts.most_common()
        # First pass emits the capitalised words, second pass the others.
        for wanted in (True, False):
            for bad_word, count in ranked:
                if is_capitalized(bad_word) == wanted:
                    decoded = spell_checker._decode(bad_word)
                    f.write(u'{} ({})\n'.format(decoded, count))
    def make_word_fix_doc(self, dir_):
        """Write spelling-fix suggestions for the ``.txt`` files in ``dir_``.

        All whitespace-separated tokens found in the UTF-8 ``.txt`` files
        directly inside ``dir_`` are collected and passed to
        ``self.spell_checker.failed_words``.  The flagged tokens are handed
        to ``self.fixed_words`` and the outcome is written to two files under
        ``self.output_dir``:

        * ``word_fixes.txt`` -- one ``bad|fix<delimiter>fix...`` line per
          flagged token that has at least one fix.  Tokens with several
          candidate fixes are listed first, tokens with exactly one fix
          after them.
        * ``bad_words.txt`` -- the flagged tokens for which the fixes
          mapping has no entry, each with its count.

        Three progress counts are printed to stdout along the way.

        Args:
            dir_: Directory whose ``.txt`` files are scanned.
        """
        # Gather the distinct tokens of every .txt file in the directory.
        word_set = set()
        for fn in os.listdir(dir_):
            if fn.endswith('.txt'):
                with codecs.open('{}/{}'.format(dir_, fn), mode='rb',
                                 encoding='utf-8') as f:
                    for line in f:
                        word_set.update(line.split())
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('{} words'.format(len(word_set)))

        words = list(word_set)
        bad_words = set()
        # NOTE(review): every key currently maps to itself, so the
        # ``replace`` further down is effectively a straight substitution.
        # Kept as-is in case a distinct lookup form was intended -- confirm.
        bad_bad_map = {}
        # failed_words is indexed back into ``words``, so it is assumed to
        # yield one flag per input word, in order -- TODO confirm.
        for idx, failed in enumerate(self.spell_checker.failed_words(words)):
            if failed:
                bad_word = words[idx]
                bad_words.add(spell_checker._decode(bad_word))
                bad_bad_map[bad_word] = bad_word
        print('{} bad words'.format(len(bad_words)))

        # NOTE(review): fixes are requested for the decoded words but looked
        # up below by the raw token; presumably both forms are equal -- verify.
        fixes = self.fixed_words(bad_words)
        print('{} fixes'.format(len(fixes)))

        still_bad = Counter()
        solos = []
        multis = []
        for bad_version in sorted(bad_bad_map):
            bad_word = bad_bad_map[bad_version]
            # Only the lookup can legitimately raise KeyError, so keep the
            # try body limited to it.
            try:
                good_versions = fixes[bad_word]
            except KeyError:
                still_bad[bad_word] += 1
                continue
            # Drop hyphenated suggestions for an unhyphenated word, unless
            # they are the only suggestions available.
            if '-' not in bad_word:
                good_unhyphened = [
                    word for word in good_versions if '-' not in word
                ]
                if good_unhyphened:
                    good_versions = good_unhyphened
            fixed_good_versions = [
                bad_version.replace(bad_word, version)
                for version in good_versions
            ]
            if len(fixed_good_versions) > 1:
                multis.append((bad_version, fixed_good_versions))
            elif fixed_good_versions:
                solos.append((bad_version, fixed_good_versions))

        with codecs.open('{}/word_fixes.txt'.format(self.output_dir),
                         mode='wb', encoding='utf-8') as f:
            # Ambiguous fixes (several candidates) first, then the rest.
            for bad_version, fixed_good_versions in multis + solos:
                f.write(u'{}|{}\n'.format(
                    bad_version, self.delimiter.join(fixed_good_versions)))
        with codecs.open('{}/bad_words.txt'.format(self.output_dir),
                         mode='wb', encoding='utf-8') as f:
            for bad_word, cnt in still_bad.most_common():
                f.write(u'{:>20}: {:>3}\n'.format(bad_word, cnt))
    def make_word_fix_doc(self, dir_):
        """Produce the word-fix and bad-word reports for ``dir_``.

        Reads every UTF-8 ``.txt`` file directly inside ``dir_``, splits each
        line on whitespace and collects the distinct tokens.  The tokens are
        checked with ``self.spell_checker.failed_words`` and the flagged ones
        are passed to ``self.fixed_words``.  Two files are then written in
        ``self.output_dir``:

        * ``word_fixes.txt`` -- ``bad|fix<delimiter>fix...`` lines for the
          flagged tokens that have at least one fix; tokens with more than
          one candidate come before tokens with a single candidate.
        * ``bad_words.txt`` -- the flagged tokens that are missing from the
          fixes mapping, each with its count.

        The number of tokens, flagged tokens and fixes is printed to stdout.

        Args:
            dir_: Directory whose ``.txt`` files are scanned.
        """
        bad_words = set()
        # NOTE(review): each flagged token is stored as both key and value,
        # so ``bad_version`` and ``bad_word`` below are always equal.
        # Preserved unchanged; confirm whether a different key was intended.
        bad_bad_map = {}

        word_set = set()
        for fn in os.listdir(dir_):
            if not fn.endswith('.txt'):
                continue
            with codecs.open('{}/{}'.format(dir_, fn),
                             mode='rb',
                             encoding='utf-8') as f:
                for line in f:
                    word_set.update(line.split())
        # print(...) with one argument prints identically on Python 2 and 3.
        print('{} words'.format(len(word_set)))

        words = list(word_set)
        # The position of each flag is used to find its word, so the checker
        # is assumed to return flags in input order -- TODO confirm.
        for idx, bad_word_key in enumerate(
                self.spell_checker.failed_words(words)):
            if bad_word_key:
                bad_word = words[idx]
                bad_words.add(spell_checker._decode(bad_word))
                bad_bad_map[bad_word] = bad_word
        print('{} bad words'.format(len(bad_words)))

        # NOTE(review): the fixes are computed from decoded words but indexed
        # with the raw token below; presumably the two coincide -- verify.
        fixes = self.fixed_words(bad_words)
        print('{} fixes'.format(len(fixes)))

        still_bad = Counter()
        solos = []
        multis = []
        for bad_version in sorted(bad_bad_map):
            bad_word = bad_bad_map[bad_version]
            # Keep the try limited to the lookup that can raise KeyError.
            try:
                good_versions = fixes[bad_word]
            except KeyError:
                still_bad[bad_word] += 1
                continue

            # Remove hyphenated versions if others are present.
            if '-' not in bad_word:
                good_unhyphened = [
                    word for word in good_versions if '-' not in word
                ]
                if good_unhyphened:
                    good_versions = good_unhyphened

            fixed_good_versions = [
                bad_version.replace(bad_word, version)
                for version in good_versions
            ]
            if not fixed_good_versions:
                # An entry with no candidates is reported in neither file.
                continue
            entry = (bad_version, fixed_good_versions)
            if len(fixed_good_versions) > 1:
                multis.append(entry)
            else:
                solos.append(entry)

        with codecs.open('{}/word_fixes.txt'.format(self.output_dir),
                         mode='wb',
                         encoding='utf-8') as f:
            for group in (multis, solos):
                for bad_version, fixed_good_versions in group:
                    f.write(u'{}|{}\n'.format(
                        bad_version,
                        self.delimiter.join(fixed_good_versions)))
        with codecs.open('{}/bad_words.txt'.format(self.output_dir),
                         mode='wb',
                         encoding='utf-8') as f:
            for bad_word, cnt in still_bad.most_common():
                f.write(u'{:>20}: {:>3}\n'.format(bad_word, cnt))