def test_replace_urls():
    """Check that ``cleantext.replace_urls`` swaps every URL for the placeholder."""
    placeholder = "*URL*"
    # Each case is an (input, expected output) pair.
    cases = (
        # A bare ``www.`` host and an ``http://`` URL inside one sentence.
        (
            "I learned everything I know from www.stackoverflow.com and http://wikipedia.org/ and Mom.",
            "I learned everything I know from *URL* and *URL* and Mom.",
        ),
        # A URL with percent-encoded parentheses inside Markdown link syntax;
        # the surrounding "[...](...)" must be left intact.
        (
            "There's a bunch of references in that one scene alone, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29), which comes out later this year.",
            "There's a bunch of references in that one scene alone, including [Moana](*URL*), which comes out later this year.",
        ),
    )
    for raw, expected in cases:
        assert cleantext.replace_urls(raw, placeholder) == expected
def clean(cls, text):
    """Strip URLs and e-mail addresses from text before language classification.

    Args:
        text (str): Source text

    Returns:
        str: The text with URLs and e-mail addresses replaced by empty strings
    """
    # Imported at call time rather than at module load.
    import cleantext

    # URLs are removed first, then e-mail addresses, matching the original order.
    without_urls = cleantext.replace_urls(text, replace_with='')
    return cleantext.replace_emails(without_urls, replace_with='')
# !pip install clean-text[gpl]
import cleantext

# Replacing URLs: the second argument is the placeholder substituted for each URL.
text = "www.stackoverflow.com is an amzing website"
cleantext.replace_urls(text, "<URL>")
# >>> '<URL> is an amzing website'

# Replacing emails: the second argument is the placeholder substituted for each address.
# NOTE(review): the address in the original example had been lost to e-mail
# obfuscation ("[email protected]"), which contains no address to replace; a
# placeholder address is used here so the input matches the shown output.
text = "My email id is user@example.com"
cleantext.replace_emails(text, "<EMAIL>")
# >>> 'My email id is <EMAIL>'
def remove_urls(text: str, replace_with: str = None) -> str:
    """Remove URLs from ``text`` by delegating to ``replace_urls``.

    Args:
        text: Input text to process.
        replace_with: Value forwarded unchanged as the ``replace_with``
            keyword of ``replace_urls``.

    Returns:
        Whatever ``replace_urls`` returns for the given arguments.

    NOTE(review): the default ``replace_with=None`` is passed straight
    through; confirm that ``replace_urls`` accepts ``None``.
    """
    stripped = replace_urls(text, replace_with=replace_with)
    return stripped