def test_preprocessing_handles_hashtags():
    """
    A CamelCase hashtag is expanded into lowercase, space-separated words.
    """
    tweet = "esto es #UnaGenialidad"
    expected = "esto es una genialidad"

    assert preprocess_tweet(tweet) == expected
def test_preprocessing_replaces_users():
    """
    An @handle is swapped for the special user token.
    """
    tweet = "@perezjotaeme debería cambiar esto"
    result = preprocess_tweet(tweet)

    assert result == "[USER] debería cambiar esto"
def test_shortens_laughters_in_english():
    """
    English laughter ("hahah...", "ahahah...") is normalized to "haha".

    Named with the ``_in_english`` suffix so it does not share a name with
    the default-language laughter test; two module-level functions with the
    same name would leave only the later one bound, and this check would
    never be collected.
    """
    text = "hahahhahaha can't believe it ahahahahahah"
    expected = "haha can't believe it haha"

    assert preprocess_tweet(text, lang="en") == expected
def test_shortens_laughters():
    """
    Laughter written with "ja" (default language) is normalized to "jaja".
    """
    tweet = "jajajajaajjajaajajaja no lo puedo creer ajajaj"
    shortened = preprocess_tweet(tweet)

    assert shortened == "jaja no lo puedo creer jaja"
def test_shortens_repeated_characters():
    """
    With shorten=2, long runs of the same character are cut down:
    the expected output keeps two of each repeated "a".
    """
    tweet = "no entiendo naaaaaaaadaaaaaaaa"
    expected = "no entiendo naadaa"

    assert preprocess_tweet(tweet, shorten=2) == expected
def test_preprocessing_replaces_urls():
    """
    A URL in the tweet is swapped for the special URL token.
    """
    tweet = "esto es muy bueno http://bit.ly/sarasa"
    result = preprocess_tweet(tweet)

    assert result == "esto es muy bueno [URL]"
def my_preprocess(*args):
    """
    Forward the positional arguments to preprocess_tweet with a fixed set
    of keyword overrides (user/url/hashtag tokens and the emoji wrapper).
    """
    # NOTE(review): another `my_preprocess` appears later in this source; if
    # both live in the same module the later definition wins — confirm.
    return preprocess_tweet(
        *args,
        user_token="USUARIO",
        url_token="URL",
        hashtag_token="hashtag",
        emoji_wrapper="",
    )
def preprocess(tweet):
    """
    Run preprocess_tweet with `preprocess_args`, then flatten the result:
    newline runs become ". ", whitespace runs become one space, and the
    ends are stripped.
    """
    cleaned = preprocess_tweet(tweet, **preprocess_args)

    # Order matters: newlines must be rewritten before the generic
    # whitespace pass, which would otherwise turn them into plain spaces.
    substitutions = (
        ("\n+", ". "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
def test_replaces_emoji_in_english():
    """
    With lang="en", an emoji is replaced by its English description,
    wrapped in [EMOJI] markers.
    """
    emoji = "🤣"
    expected = ' [EMOJI] rolling on the floor laughing [EMOJI] '

    assert preprocess_tweet(emoji, lang="en") == expected
def test_replaces_emoji():
    """
    With the default language, an emoji is replaced by its Spanish
    description, wrapped in [EMOJI] markers.
    """
    emoji = "🤣"
    result = preprocess_tweet(emoji)

    assert result == ' [EMOJI] cara revolviéndose de la risa [EMOJI] '
def test_replaces_odd_quotation_marks():
    """
    Curly quotation marks are converted to plain straight double quotes.
    """
    tweet = "Pero pará un poco, “loquita”"
    expected = 'Pero pará un poco, "loquita"'

    assert preprocess_tweet(tweet) == expected
def my_preprocess(tweet):
    """
    Apply preprocess_tweet (configured by `preprocess_args`) and return the
    result as a single line: newline runs become ". ", whitespace runs are
    collapsed to one space, and surrounding whitespace is removed.
    """
    processed = preprocess_tweet(tweet, **preprocess_args)
    # Newlines first, so they end up as sentence breaks rather than spaces.
    single_line = re.sub("\n+", ". ", processed)
    return re.sub(r"\s+", " ", single_line).strip()