Beispiel #1
0
def default_cleaner(text: str,
                    fix_unicode: bool = True,
                    lowercase: bool = True,
                    transliterate: bool = True,
                    no_urls: bool = True,
                    no_emails: bool = True,
                    no_phone_numbers: bool = True,
                    no_numbers: bool = True,
                    no_currency_symbols: bool = True,
                    no_punct: bool = True,
                    no_accents: bool = True) -> str:
    """Clean ``text`` by applying the enabled steps in a fixed order.

    The steps run in this order, each only when its flag is truthy:

    1. ``fix_unicode``: ``normalize_unicode`` with form ``'NFC'``.
    2. ``transliterate``: ``unidecode``.
    3. ``lowercase``: ``str.lower``.
    4. ``no_urls``: URLs replaced with ``<URL>``.
    5. ``no_emails``: emails replaced with ``<EMAIL>``.
    6. ``no_phone_numbers``: phone numbers replaced with ``<PHONE>``.
    7. ``no_numbers``: numbers replaced with ``<NUMBER>``.
    8. ``no_currency_symbols``: currency symbols replaced with ``<CUR>``.
    9. ``no_accents``: ``remove_accents``.
    10. ``no_punct``: ``remove_punctuation``.

    ``normalize_whitespace`` is always applied to the result.

    All flags are evaluated by truthiness, so truthy non-``bool`` values
    (for example ``1`` or a NumPy boolean) enable a step just like ``True``.

    Args:
        text: The raw text to clean.
        fix_unicode: Normalize the text to Unicode form NFC.
        lowercase: Lowercase the text.
        transliterate: Run the text through ``unidecode``.
        no_urls: Replace URLs with ``<URL>``.
        no_emails: Replace email addresses with ``<EMAIL>``.
        no_phone_numbers: Replace phone numbers with ``<PHONE>``.
        no_numbers: Replace numbers with ``<NUMBER>``.
        no_currency_symbols: Replace currency symbols with ``<CUR>``.
        no_punct: Remove punctuation.
        no_accents: Remove accents.

    Returns:
        The cleaned text, as returned by ``normalize_whitespace``.
    """
    if fix_unicode:
        text = normalize_unicode(text, form='NFC')
    if transliterate:
        text = unidecode(text)
    # Lowercasing happens before any placeholder is inserted, so the
    # upper-case placeholder names below are not lowercased by this step.
    if lowercase:
        text = text.lower()
    if no_urls:
        text = replace_urls(text, '<URL>')
    if no_emails:
        text = replace_emails(text, '<EMAIL>')
    # Phone numbers are handled before plain numbers so the more specific
    # replacement gets the first chance at digit sequences.
    if no_phone_numbers:
        text = replace_phone_numbers(text, '<PHONE>')
    if no_numbers:
        text = replace_numbers(text, '<NUMBER>')
    if no_currency_symbols:
        text = replace_currency_symbols(text, '<CUR>')
    if no_accents:
        text = remove_accents(text)
    # NOTE(review): punctuation removal runs after the placeholders were
    # inserted; whether their angle brackets survive depends on
    # ``remove_punctuation`` - verify against the helper's behavior.
    if no_punct:
        text = remove_punctuation(text)
    return normalize_whitespace(text)
    def process(self,
                tweets,
                replace_urls=True,
                replace_phone_numbers=True,
                replace_currency_symbols=True,
                remove_accent=True,
                remove_punctuation=True):
        """Lowercase the tweet texts and run the enabled clean-up passes.

        Reads ``tweets['text'].values``, lowercases every entry and then
        applies, in this order, each pass whose flag is truthy: URL
        replacement (with the literal ``'url'``), phone-number replacement,
        currency-symbol replacement, accent removal and punctuation removal.

        Returns:
            A list with one cleaned string per entry of the ``'text'``
            column.
        """
        cleaned = [raw.lower() for raw in tweets['text'].values]

        # (flag, transform) pairs in application order. Every transform is a
        # lambda so the ``replace`` / ``remove`` helpers are only looked up
        # when their pass actually runs on an item.
        passes = (
            (replace_urls,
             lambda s: replace.replace_urls(s, 'url')),
            (replace_phone_numbers,
             lambda s: replace.replace_phone_numbers(s)),
            (replace_currency_symbols,
             lambda s: replace.replace_currency_symbols(s)),
            (remove_accent,
             lambda s: remove.remove_accents(s)),
            (remove_punctuation,
             lambda s: remove.remove_punctuation(s)),
        )
        for enabled, transform in passes:
            if enabled:
                cleaned = [transform(s) for s in cleaned]

        return cleaned
Beispiel #3
0
def preprocess(text):
    """Join text fragments, mask special tokens and tokenize the result.

    The fragments in ``text`` are joined with single spaces. Currency
    symbols, emails, emojis, hashtags, URLs, phone numbers, numbers and user
    handles are then replaced (in that order) with the placeholders
    ``_CUR_``, ``_EMAIL_``, ``_EMOJI_``, ``_TAG_``, ``_URL_``, ``_PHONE_``,
    ``_NUMBER_`` and ``_USER_``. The masked string is passed to ``nlp`` and
    the ``.text`` of every resulting token is collected.

    Args:
        text: Iterable of strings. A single ``str`` is itself iterable, so
            passing one would insert a space between every character.

    Returns:
        list: The ``.text`` of each token yielded by ``nlp``.
    """
    masked = ' '.join(text)
    masked = rep.replace_currency_symbols(masked, replace_with='_CUR_')
    masked = rep.replace_emails(masked, replace_with='_EMAIL_')
    masked = rep.replace_emojis(masked, replace_with='_EMOJI_')
    masked = rep.replace_hashtags(masked, replace_with='_TAG_')
    # URLs and phone numbers must be masked before plain numbers: the generic
    # number replacement would otherwise rewrite the digits inside them first
    # and the more specific patterns would presumably no longer match.
    masked = rep.replace_urls(masked, replace_with='_URL_')
    masked = rep.replace_phone_numbers(masked, replace_with='_PHONE_')
    masked = rep.replace_numbers(masked, replace_with='_NUMBER_')
    masked = rep.replace_user_handles(masked, replace_with='_USER_')

    return [token.text for token in nlp(masked)]