def _process_text(self, text, **kw):
    """
    Preprocess text.

    The text is always lower-cased and passed through ``unidecode``.
    The remaining steps are enabled by default and can each be switched
    off with a keyword flag.

    :param text: UTF-8 encoded byte string, or an already-decoded
        unicode string.
    :param remove_html: pass the text through ``html.strip_tags``.
    :param remove_punct: replace every character found in ``punct``
        with a space.
    :param remove_digits: replace every character found in ``digits``
        with a space.
    :param remove_whitespace: apply ``re_whitespace.sub(" ", ...)`` and
        strip leading/trailing whitespace.
    :return: the processed text.
    """
    # Decode byte strings *before* lower-casing so that non-ASCII
    # upper-case characters are lowered as well; input that is already
    # unicode is not decoded a second time.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # always lower case + unidecode
    text = unicode(unidecode(text.lower()), errors='ignore')

    # optionally remove html. This has to happen before punctuation is
    # removed: once the tag delimiters have been replaced by spaces the
    # markup can no longer be recognised.
    # NOTE(review): assumes `punct` contains "<" and ">" - confirm.
    if kw.get('remove_html', True):
        text = html.strip_tags(text)

    # optionally remove punctuation
    if kw.get('remove_punct', True):
        text = "".join(" " if ch in punct else ch for ch in text)

    # optionally remove digits
    if kw.get('remove_digits', True):
        text = "".join(" " if ch in digits else ch for ch in text)

    # optionally remove whitespace
    if kw.get('remove_whitespace', True):
        text = re_whitespace.sub(" ", text).strip()

    return text
def _process_text(self, text, **kw):
    """
    Preprocess text.

    The text is always lower-cased and passed through ``unidecode``.
    The remaining steps are enabled by default and can each be switched
    off with a keyword flag.

    :param text: UTF-8 encoded byte string, or an already-decoded
        unicode string.
    :param remove_html: pass the text through ``html.strip_tags``.
    :param remove_punct: replace every character found in ``punct``
        with a space.
    :param remove_digits: replace every character found in ``digits``
        with a space.
    :param remove_whitespace: apply ``re_whitespace.sub(" ", ...)`` and
        strip leading/trailing whitespace.
    :return: the processed text.
    """
    # Decode byte strings *before* lower-casing so that non-ASCII
    # upper-case characters are lowered as well; input that is already
    # unicode is not decoded a second time.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # always lower case + unidecode
    text = unicode(unidecode(text.lower()), errors='ignore')

    # optionally remove html. This has to happen before punctuation is
    # removed: once the tag delimiters have been replaced by spaces the
    # markup can no longer be recognised.
    # NOTE(review): assumes `punct` contains "<" and ">" - confirm.
    if kw.get('remove_html', True):
        text = html.strip_tags(text)

    # optionally remove punctuation
    if kw.get('remove_punct', True):
        text = "".join(" " if ch in punct else ch for ch in text)

    # optionally remove digits
    if kw.get('remove_digits', True):
        text = "".join(" " if ch in digits else ch for ch in text)

    # optionally remove whitespace
    if kw.get('remove_whitespace', True):
        text = re_whitespace.sub(" ", text).strip()

    return text
def prepare(s):
    """
    Prepare text.

    Runs ``s`` through ``unicode_symbols``, replaces every
    ``re_whitespace`` match with a single space, strips the ends and
    returns the ``unidecode`` result.
    """
    normalized = re_whitespace.sub(' ', unicode_symbols(s))
    return unidecode(normalized.strip())
def prepare(s):
    """
    Prepare text.

    Runs ``s`` through ``unicode_symbols``, replaces every
    ``re_whitespace`` match with a single space and strips the ends.
    The result is then passed through ``unidecode``; if that raises a
    ``Warning`` the un-transliterated text is returned instead.
    """
    cleaned = re_whitespace.sub(' ', unicode_symbols(s)).strip()
    try:
        return unidecode(cleaned)
    except Warning:
        # Best effort: fall back to the cleaned text as-is.
        return cleaned
def _process_text(self, text, **kw):
    """
    Preprocess text.

    Every step is enabled by default and can be switched off with its
    keyword flag. With all flags off the text is returned unchanged.

    :param text: the text to process.
    :param rm_html: pass the text through ``html.strip_tags``.
    :param rm_punct: replace every character found in ``punct`` with a
        space.
    :param rm_digits: replace every character found in ``digits`` with
        a space.
    :param rm_whitespace: apply ``re_whitespace.sub(" ", ...)`` and
        strip leading/trailing whitespace.
    :return: the processed text.
    """
    # optionally remove html. This has to happen before punctuation is
    # removed: once the tag delimiters have been replaced by spaces the
    # markup can no longer be recognised.
    # NOTE(review): assumes `punct` contains "<" and ">" - confirm.
    if kw.get('rm_html', True):
        text = html.strip_tags(text)

    # optionally remove punctuation
    if kw.get('rm_punct', True):
        text = "".join(" " if ch in punct else ch for ch in text)

    # optionally remove digits
    if kw.get('rm_digits', True):
        text = "".join(" " if ch in digits else ch for ch in text)

    # optionally remove whitespace
    if kw.get('rm_whitespace', True):
        text = re_whitespace.sub(" ", text).strip()

    return text
def _process_text(self, text, **kw):
    """
    Preprocess text.

    Every step is enabled by default and can be switched off with its
    keyword flag. With all flags off the text is returned unchanged.

    :param text: the text to process.
    :param rm_html: pass the text through ``html.strip_tags``.
    :param rm_punct: replace every character found in ``punct`` with a
        space.
    :param rm_digits: replace every character found in ``digits`` with
        a space.
    :param rm_whitespace: apply ``re_whitespace.sub(" ", ...)`` and
        strip leading/trailing whitespace.
    :return: the processed text.
    """
    # optionally remove html. This has to happen before punctuation is
    # removed: once the tag delimiters have been replaced by spaces the
    # markup can no longer be recognised.
    # NOTE(review): assumes `punct` contains "<" and ">" - confirm.
    if kw.get("rm_html", True):
        text = html.strip_tags(text)

    # optionally remove punctuation
    if kw.get("rm_punct", True):
        text = "".join(" " if ch in punct else ch for ch in text)

    # optionally remove digits
    if kw.get("rm_digits", True):
        text = "".join(" " if ch in digits else ch for ch in text)

    # optionally remove whitespace
    if kw.get("rm_whitespace", True):
        text = re_whitespace.sub(" ", text).strip()

    return text