def clean(cls, text):
    """Prepare text for language classification.

    The text is passed through cleantext's URL replacer and then its
    e-mail replacer, each called with an empty replacement string.

    Args:
        text (str): Source text.

    Returns:
        str: The text after both replacement passes.
    """
    import cleantext

    # Order matters for equivalence: URLs are handled first, then e-mails.
    scrubbers = (cleantext.replace_urls, cleantext.replace_emails)
    result = text
    for scrub in scrubbers:
        result = scrub(result, replace_with='')
    return result
def test_replace_emails():
    """Check that an e-mail address inside a sentence is replaced by the marker."""
    # The input needs a genuine address (with an '@') for replace_emails to
    # have something to substitute; otherwise the expected text cannot match.
    text = "I can be reached at username@example.com through next Friday."
    proc_text = "I can be reached at *EMAIL* through next Friday."
    assert cleantext.replace_emails(text, "*EMAIL*") == proc_text
# !pip install clean-text[gpl]
import cleantext

# Replacing URLs: the second argument is the placeholder that is substituted
# for the URL found in the text.
text = "www.stackoverflow.com is an amzing website"
cleantext.replace_urls(text, "<URL>")
# Output: '<URL> is an amzing website'

# Replacing emails: the second argument is the placeholder that is
# substituted for the e-mail address found in the text.
text = "My email id is someone@example.com"
cleantext.replace_emails(text, "<EMAIL>")
# Output: 'My email id is <EMAIL>'
def test_not_email_addresses():
    """Check that no sample in not_email_addresses is wholly replaced by the marker."""
    marker = "*EMAIL*"
    for candidate in not_email_addresses:
        replaced = cleantext.replace_emails(candidate, "*EMAIL*")
        # A result equal to the bare marker would mean the whole string was
        # treated as an e-mail address.
        assert replaced != marker