def default_cleaner(text: str, fix_unicode: bool = True, lowercase: bool = True, transliterate: bool = True, no_urls: bool = True, no_emails: bool = True, no_phone_numbers: bool = True, no_numbers: bool = True, no_currency_symbols: bool = True, no_punct: bool = True, no_accents: bool = True) -> str:
    """Clean *text* by applying a configurable pipeline of normalizations.

    Each boolean flag enables one step; steps run in the fixed order shown
    below, and replaced entities are substituted with placeholder tokens
    such as ``<URL>`` or ``<EMAIL>``.

    Args:
        text: Raw input string.
        fix_unicode: Normalize unicode to NFC form.
        lowercase: Lowercase the text.
        transliterate: Transliterate non-ASCII characters to ASCII.
        no_urls: Replace URLs with ``<URL>``.
        no_emails: Replace email addresses with ``<EMAIL>``.
        no_phone_numbers: Replace phone numbers with ``<PHONE>``.
        no_numbers: Replace numbers with ``<NUMBER>``.
        no_currency_symbols: Replace currency symbols with ``<CUR>``.
        no_punct: Strip punctuation.
        no_accents: Strip accents/diacritics.

    Returns:
        The cleaned text with whitespace normalized.
    """
    # All flags are tested by truthiness; the original mixed plain tests
    # with `is True` comparisons, which is both non-idiomatic and
    # inconsistent (a truthy non-bool argument flipped some steps only).
    if fix_unicode:
        text = normalize_unicode(text, form='NFC')
    if transliterate:
        text = unidecode(text)
    if lowercase:
        text = text.lower()
    if no_urls:
        text = replace_urls(text, '<URL>')
    if no_emails:
        text = replace_emails(text, '<EMAIL>')
    if no_phone_numbers:
        text = replace_phone_numbers(text, '<PHONE>')
    if no_numbers:
        text = replace_numbers(text, '<NUMBER>')
    if no_currency_symbols:
        text = replace_currency_symbols(text, '<CUR>')
    if no_accents:
        text = remove_accents(text)
    if no_punct:
        text = remove_punctuation(text)
    return normalize_whitespace(text)
def process(self, tweets, replace_urls=True, replace_phone_numbers=True, replace_currency_symbols=True, remove_accent=True, remove_punctuation=True):
    """Lowercase tweet texts and apply the enabled cleaning steps in order.

    Args:
        tweets: Table of tweets with a ``'text'`` column exposing
            ``.values`` (presumably a pandas DataFrame — verify against caller).
        replace_urls: Replace URLs with the literal token ``'url'``.
        replace_phone_numbers: Replace phone numbers (library default token).
        replace_currency_symbols: Replace currency symbols (library default token).
        remove_accent: Strip accents/diacritics.
        remove_punctuation: Strip punctuation.

    Returns:
        A list of cleaned tweet strings.
    """
    # Lowercasing is unconditional; each optional step maps over the whole list.
    cleaned = [raw.lower() for raw in tweets['text'].values]
    steps = (
        (replace_urls, lambda s: replace.replace_urls(s, 'url')),
        (replace_phone_numbers, replace.replace_phone_numbers),
        (replace_currency_symbols, replace.replace_currency_symbols),
        (remove_accent, remove.remove_accents),
        (remove_punctuation, remove.remove_punctuation),
    )
    for enabled, step in steps:
        if enabled:
            cleaned = [step(s) for s in cleaned]
    return cleaned
def preprocess(text):
    """Join tokens into a string, mask entities with placeholder tags, and re-tokenize.

    Args:
        text: Iterable of string tokens; they are joined with single spaces
            before masking.

    Returns:
        A list of token text strings produced by the ``nlp`` tokenizer
        (presumably a spaCy pipeline — confirm at the call site).
    """
    joined = ' '.join(text)
    # Each masking pass rewrites one entity class with a sentinel tag so the
    # tokenizer treats it as a single opaque token.
    masked = rep.replace_currency_symbols(joined, replace_with='_CUR_')
    masked = rep.replace_emails(masked, replace_with='_EMAIL_')
    masked = rep.replace_emojis(masked, replace_with='_EMOJI_')
    masked = rep.replace_hashtags(masked, replace_with='_TAG_')
    masked = rep.replace_numbers(masked, replace_with='_NUMBER_')
    masked = rep.replace_phone_numbers(masked, replace_with='_PHONE_')
    masked = rep.replace_urls(masked, replace_with='_URL_')
    masked = rep.replace_user_handles(masked, replace_with='_USER_')
    doc = nlp(masked)
    # Comprehension replaces the original append loop (same tokens, same order).
    return [token.text for token in doc]