def test_clean_text():
    """Check the sentences produced by ``TextCleaner.clean_text`` for a sample text."""
    expected_sentences = [
        "hi COMMA mr lee went to the park",
        " let's go",
    ]

    cleaner = TextCleaner("corpse_bride.txt")
    # Override the text directly so the assertion depends only on this sample.
    cleaner.text = "Hi, Mr.Lee -went to the park. Let's go"
    cleaner.clean_text()

    assert cleaner.sentence == expected_sentences
def cleaned_comments(subreddit):
    """Return the cleaned ``.text`` of every line yielded by ``comments(subreddit)``.

    A single ``TextCleaner`` instance is created and reused for all lines.
    """
    cleaner = TextCleaner()
    cleaned = []
    for raw_line in comments(subreddit):
        cleaned.append(cleaner.clean_text(raw_line).text)
    return cleaned
class LanguageProcessor:
    """
    Contains methods for cleaning or enhancing sentences in some languages.
    """

    # A token made only of ASCII digits.
    rxNum = re.compile(r'^[0-9]+$')
    # A token made only of lowercase ASCII letters, dots and hyphens.
    rxLat = re.compile(r'^[a-z.-]+$')

    # Numeral words used by replace_numerals_kpv, keyed by digit character.
    # Always spelled with the Cyrillic letter U+04E7, never the look-alike
    # Latin U+00F6, so that the same numeral always yields the same string.
    _KPV_UNITS = {
        '1': 'ӧтик',
        '2': 'кык',
        '3': 'куим',
        '4': 'нёль',
        '5': 'вит',
        '6': 'квайт',
        '7': 'сизим',
        '8': 'кӧкъямыс',
        '9': 'ӧкмыс',
    }
    _KPV_TENS = {
        '1': 'дас',
        '2': 'кызь',
        '3': 'комын',
        '4': 'нелямын',
        '5': 'ветымын',
        '6': 'квайтымын',
        '7': 'сизимдас',
        '8': 'кӧкъямысдас',
        '9': 'ӧкмысдас',
    }

    def __init__(self, lang):
        """Store the language code and create a ``TextCleaner`` for it."""
        self.lang = lang
        self.tc = TextCleaner(lang)

    def clean(self, s):
        """
        Perform basic text cleaning operations.

        Delegates to ``self.tc.clean_text`` and returns its result.
        """
        return self.tc.clean_text(s)

    def replace_numerals_kpv(self, s):
        """
        Spell out a digit-only token as words (used when ``lang == 'kpv'``).

        Supported inputs: 1-99, 100, and four-digit numbers of the forms
        X000, X0YY and 19YY. Anything else (non-numeric tokens, '0',
        numbers with a leading zero, other three/four-digit numbers,
        longer numbers) is returned unchanged.
        """
        # fullmatch, not search: '$' alone would also accept a trailing '\n'.
        if LanguageProcessor.rxNum.fullmatch(s) is None:
            return s

        if len(s) == 1:
            return self._KPV_UNITS.get(s, s)

        if len(s) == 2:
            tens = self._KPV_TENS.get(s[0])
            if tens is None:
                # Leading zero such as '05': leave untouched.
                return s
            if s[1] == '0':
                return tens
            return tens + ' ' + self._KPV_UNITS[s[1]]

        if len(s) == 3:
            return 'сё' if s == '100' else s

        if len(s) == 4:
            if s[0] == '0':
                # Not a real four-digit number; leave untouched.
                return s
            if s == '1000':
                return 'сюрс'
            if s[1:] == '000':
                return self._KPV_UNITS[s[0]] + ' сюрс'
            if s[1] == '0':
                prefix = self._KPV_UNITS[s[0]] + ' сюрс'
            elif s[:2] == '19':
                prefix = 'сюрс ӧкмыс сё'
            else:
                return s
            # Drop leading zeros so '05' is read as '5' and '00' adds nothing.
            remainder = s[2:].lstrip('0')
            if not remainder:
                return prefix
            return prefix + ' ' + self.replace_numerals_kpv(remainder)

        return s

    def replace_latin(self, s):
        """Return '' if ``s`` matches ``rxLat``; otherwise return ``s`` unchanged."""
        if LanguageProcessor.rxLat.search(s) is not None:
            return ''
        return s

    def replace_abbr(self, s):
        """Expand the known abbreviations; return any other token unchanged."""
        if s == 'кр':
            return 'коми республика'
        if s == 'кг':
            return 'килограмм'
        return s

    def process_word(self, s):
        """
        Lowercase and clean a word, then apply the language-specific steps.

        For ``lang == 'kpv'`` the cleaned word goes through numeral
        replacement, abbreviation expansion and Latin-token removal, in
        that order. For other languages only the cleaning is applied.
        """
        s = self.clean(s.lower())
        if self.lang == 'kpv':
            s = self.replace_numerals_kpv(s)
            s = self.replace_abbr(s)
            s = self.replace_latin(s)
        return s