def test_split_hashtag_with_numbers(self):
    """
    Test that numbers are treated as separate words when splitting hashtags,
    whether the digits follow the letters or precede them.
    """

    tweet_cleaner = TweetCleaner(split_hashtags=True, collapse_whitespaces=True)

    # Each case pairs a raw tweet with the text expected after cleaning.
    cases = (
        ("The Vardy party has gone very quiet π€ π’ #EPL2020",
         "The Vardy party has gone very quiet π€ π’ EPL 2020"),
        ("The Vardy party has gone very quiet π€ π’ #2020EPL",
         "The Vardy party has gone very quiet π€ π’ 2020 EPL"),
    )
    for raw, expected in cases:
        self.assertEqual(expected, tweet_cleaner.clean(raw))
def test_replace_mentions_all(self):
    """
    Test that after replacing mentions, there are no '@' symbols.

    Every tweet in the CRYCHE corpus is cleaned with mention replacement enabled.
    A leftover '@' is only tolerated when the original text shows that it cannot
    be resolved to a mention: see the skip conditions in the loop.
    """

    cleaner = TweetCleaner(replace_mentions=True)

    # An '@' followed by a digit, a comma, whitespace or an ellipsis.
    # The ellipsis is written as an escape so that the pattern does not depend
    # on the encoding with which this file is read.
    wrong_pattern = re.compile("@[0-9,\\s\u2026]")
    # An '@' that is glued to the preceding non-whitespace character.
    no_space_pattern = re.compile("[^\\s]@")
    # An '@' as the very last character of the text.
    end_pattern = re.compile('@$')

    # Allow for some manual validation: a leftover '@' is tolerated in tweets
    # whose text contains any of these strings.
    # NOTE(review): 'YouTubeγγ' looks like mis-decoded non-ASCII text; confirm against the corpus.
    not_accounts = (
        'real_realestsounds', 'nevilleiesta', 'naija927', 'naijafm92.7',
        'manchesterunited', 'ManchesterUnited', 'clintasena', 'Maksakal88',
        'Aubamayeng7', 'JustWenginIt', 'marcosrojo5', 'btsportsfootball',
        'Nsibirwahall', 'YouTubeγγ', 'juniorpepaseed', 'Mezieblog',
        'UtdAlamin', 'spurs_vincente',
    )

    corpus = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'tests', 'corpora', 'understanding', 'CRYCHE.json')
    # The encoding is explicit so that the result does not depend on the locale.
    with open(corpus, encoding='utf-8') as f:
        for line in f:
            tweet = json.loads(line)
            original = tweet

            # Unwrap retweets until the innermost tweet is reached.
            while "retweeted_status" in tweet:
                tweet = tweet["retweeted_status"]

            if "extended_tweet" in tweet:
                text = tweet["extended_tweet"].get("full_text", tweet.get("text", ""))
            else:
                text = tweet.get("text", "")

            # Append the text of the quoted tweet, if there is one.
            if "quoted_status" in tweet:
                tweet = tweet['quoted_status']
                if "extended_tweet" in tweet:
                    text += ' ' + tweet["extended_tweet"].get("full_text", tweet.get("text", ""))
                else:
                    text += ' ' + tweet.get("text", "")

            # The outermost tweet is passed along with the text to the cleaner.
            cleaned = cleaner.clean(text, original)

            if '@' in cleaned:
                if ('@@' in text or ' @ ' in text or '@>' in text
                        or any(account in text for account in not_accounts)):
                    continue

                if end_pattern.findall(text):
                    continue

                if no_space_pattern.findall(text) or no_space_pattern.findall(cleaned):
                    continue

                if wrong_pattern.findall(text):
                    continue

            # On failure, assertNotIn reports the offending cleaned text.
            self.assertNotIn('@', cleaned)
def load_corpus(filename, clean):
    """
    Load the corpus from the given filename.
    Each line of the file is parsed as one JSON-encoded tweet.
    Retweets are unwrapped and the extended text is preferred over the short text when it is available.

    :param filename: The path to the corpus from where to detect participants.
    :type filename: str
    :param clean: A boolean indicating whether tweets should be cleaned while loading them.
    :type clean: bool

    :return: A list of :class:`~nlp.document.Document` making up the corpus.
    :rtype: list of :class:`~nlp.document.Document`
    """

    cleaner = TweetCleaner(replace_mentions=True)

    corpus = []
    # The encoding is explicit so that loading does not depend on the locale.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            tweet = json.loads(line)
            original = tweet

            # Unwrap retweets until the innermost tweet is reached.
            while "retweeted_status" in tweet:
                tweet = tweet["retweeted_status"]

            if "extended_tweet" in tweet:
                text = tweet["extended_tweet"].get("full_text", tweet.get("text", ""))
            else:
                text = tweet.get("text", "")

            # The outermost tweet is passed along with the text to the cleaner.
            if clean:
                text = cleaner.clean(text, original)

            corpus.append(Document(text))

    return corpus
def test_remove_unicode_entities_retain(self):
    """
    Test that when unicode character removal is not specified, these characters are retained.
    """

    cleaner = TweetCleaner(remove_unicode_entities=False)

    text = '\u0632\u0648\u062f_\u0641\u0648\u0644\u0648\u0631\u0632_\u0645\u0639_\u0627\u0644\u0645\u0628\u0627\u062d\u062b'

    # Compare against the input itself: the text must come back unchanged, and
    # the expectation does not depend on how this file's encoding is read.
    self.assertEqual(text, cleaner.clean(text))
def test_remove_unicode_entities_retain_emojis(self):
    """
    Test that emojis survive cleaning when unicode entity removal is switched off.
    """

    tweet_cleaner = TweetCleaner(remove_unicode_entities=False)

    # The tweet is expected to come back exactly as it went in.
    raw = 'Je veux ππππ¦'
    cleaned = tweet_cleaner.clean(raw)
    self.assertEqual(raw, cleaned)
def test_remove_retweet_prefix_middle(self):
    """
    Test that a retweet prefix which appears in the middle of the tweet is left untouched.
    """

    tweet_cleaner = TweetCleaner(remove_retweet_prefix=True)

    # The 'RT @...:' marker is not at the start, so the tweet should come back unchanged.
    raw = "Great podcast episode RT @NicholasMamo: the repercussions of the ongoing pandemic on French football, as well as a brilliant short segment on how we're giving too much importance to TV rights, and too little to the supporters."
    cleaned = tweet_cleaner.clean(raw)
    self.assertEqual(raw, cleaned)
def test_remove_retweet_prefix_empty(self):
    """
    Test that cleaning an empty tweet with retweet prefix removal enabled returns an empty tweet.
    """

    tweet_cleaner = TweetCleaner(remove_retweet_prefix=True)

    raw = ""
    cleaned = tweet_cleaner.clean(raw)
    self.assertEqual(raw, cleaned)
def test_remove_unicode_entities(self):
    """
    Test that enabling unicode entity removal strips the unicode characters from the text.
    """

    tweet_cleaner = TweetCleaner(remove_unicode_entities=True)

    raw = '\u0632\u0648\u062f_\u0641\u0648\u0644\u0648\u0631\u0632_\u0645\u0639_\u0627\u0644\u0645\u0628\u0627\u062d\u062b'

    # Only the three ASCII underscores that separate the words should be left.
    expected = '___'
    self.assertEqual(expected, tweet_cleaner.clean(raw))
def test_strip_after_processing(self):
    """
    Test that the text is stripped once all the other processing is done.
    """

    tweet_cleaner = TweetCleaner(remove_unicode_entities=True)

    raw = 'Je veux ππππ¦'

    # The space that preceded the removed characters must not be left trailing.
    expected = 'Je veux'
    self.assertEqual(expected, tweet_cleaner.clean(raw))
def test_remove_unicode_entities_includes_emojis(self):
    """
    Test that emojis are removed along with the other unicode entities.
    """

    tweet_cleaner = TweetCleaner(remove_unicode_entities=True)

    raw = 'Je veux ππππ¦'
    expected = 'Je veux'
    cleaned = tweet_cleaner.clean(raw)
    self.assertEqual(expected, cleaned)
def test_remove_retweet_prefix_retain(self):
    """
    Test that the retweet prefix stays in place when its removal is not requested.
    """

    tweet_cleaner = TweetCleaner(remove_retweet_prefix=False)

    # With the flag off, the tweet should come back unchanged, prefix included.
    raw = "RT @NicholasMamo: Great podcast episode about the repercussions of the ongoing pandemic on French football, as well as a brilliant short segment on how we're giving too much importance to TV rights, and too little to the supporters."
    cleaned = tweet_cleaner.clean(raw)
    self.assertEqual(raw, cleaned)
def test_remove_retweet_prefix_without_prefix(self):
    """
    Test that a tweet that has no retweet prefix is returned unchanged by retweet prefix removal.
    """

    tweet_cleaner = TweetCleaner(remove_retweet_prefix=True)

    raw = "Great podcast episode about the repercussions of the ongoing pandemic on French football, as well as a brilliant short segment on how we're giving too much importance to TV rights, and too little to the supporters."
    cleaned = tweet_cleaner.clean(raw)
    self.assertEqual(raw, cleaned)
def test_split_hashtag_all_lower(self):
    """
    Test that a hashtag made up only of lowercase letters is not split.
    """

    tweet_cleaner = TweetCleaner(remove_hashtags=False, split_hashtags=True)

    # The hashtag is expected to be left exactly as it is, '#' included.
    raw = "The Vardy party has gone very quiet π€ π’ #fpl"
    cleaned = tweet_cleaner.clean(raw)
    self.assertEqual(raw, cleaned)
def test_split_hashtag(self):
    """
    Test that a camel-case hashtag is split into its component words.
    """

    tweet_cleaner = TweetCleaner(split_hashtags=True, collapse_whitespaces=True)

    raw = "The Vardy party has gone very quiet π€ π’ #LeiChe"
    expected = "The Vardy party has gone very quiet π€ π’ Lei Che"
    self.assertEqual(expected, tweet_cleaner.clean(raw))
def test_remove_hashtags_mixed_case(self):
    """
    Test that hashtag removal removes every hashtag, whatever its case.
    """

    tweet_cleaner = TweetCleaner(remove_hashtags=True, split_hashtags=False)

    # One hashtag is all uppercase and the other is in camel-case.
    raw = "The Vardy party has gone very quiet π€ π’ #FPL #LeiChe"
    expected = "The Vardy party has gone very quiet π€ π’"
    self.assertEqual(expected, tweet_cleaner.clean(raw))
def test_remove_hashtags_multiple(self):
    """
    Test that hashtag removal removes all of the hashtags in a tweet, not just one.
    """

    tweet_cleaner = TweetCleaner(remove_hashtags=True)

    raw = "The Vardy party has gone very quiet π€ π’ #FPL #LEICHE"
    expected = "The Vardy party has gone very quiet π€ π’"
    cleaned = tweet_cleaner.clean(raw)
    self.assertEqual(expected, cleaned)
def test_do_not_split_hashtags(self):
    """
    Test that hashtags are left intact when splitting is not requested.
    """

    tweet_cleaner = TweetCleaner(remove_hashtags=False, split_hashtags=False)

    # With both hashtag flags off, the tweet should come back unchanged.
    raw = "The Vardy party has gone very quiet π€ π’ #EPL2020"
    cleaned = tweet_cleaner.clean(raw)
    self.assertEqual(raw, cleaned)
def test_remove_hashtags(self):
    """
    Test that all hashtags are kept when hashtag removal is not requested.
    """

    tweet_cleaner = TweetCleaner(remove_hashtags=False, split_hashtags=False)

    # With both hashtag flags off, the tweet should come back unchanged.
    raw = "The Vardy party has gone very quiet π€ π’ #FPL #LEICHE"
    cleaned = tweet_cleaner.clean(raw)
    self.assertEqual(raw, cleaned)
def test_remove_url_with_subdomain_without_protocol(self):
    """
    Test that a URL with a subdomain is removed even when it is written without a protocol.
    """

    tweet_cleaner = TweetCleaner(remove_urls=True)

    raw = 'Visit Multiplex\'s documentation for more information: nicholasmamo.github.io/multiplex-plot/'
    expected = 'Visit Multiplex\'s documentation for more information:'
    self.assertEqual(expected, tweet_cleaner.clean(raw))
def test_split_hashtag_multiple_components(self):
    """
    Test that a hashtag made up of more than two components is split into all of them.
    """

    tweet_cleaner = TweetCleaner(split_hashtags=True, collapse_whitespaces=True)

    raw = "Hello! I'm Harry Styles, I'm sixteen and I work in a bakery #HappyBirthdayHarry"
    expected = "Hello! I'm Harry Styles, I'm sixteen and I work in a bakery Happy Birthday Harry"
    self.assertEqual(expected, tweet_cleaner.clean(raw))
def test_remove_url_with_http_protocol(self):
    """
    Test that URLs that use the http protocol are removed.
    """

    tweet_cleaner = TweetCleaner(remove_urls=True)

    raw = 'Thank you @BillGates. It\'s amazing, almost as incredible as the fact that you use Gmail. http://t.co/drawyFHHQM'
    expected = 'Thank you @BillGates. It\'s amazing, almost as incredible as the fact that you use Gmail.'
    self.assertEqual(expected, tweet_cleaner.clean(raw))
def test_remove_retweet_prefix_consecutive(self):
    """
    Test that only the first of several consecutive retweet prefixes is removed.
    """

    tweet_cleaner = TweetCleaner(remove_retweet_prefix=True)

    raw = "RT @NicholasMamo: RT @NicholasMamo: Great podcast episode about the repercussions of the ongoing pandemic on French football, as well as a brilliant short segment on how we're giving too much importance to TV rights, and too little to the supporters."

    # The second prefix is expected to survive.
    expected = "RT @NicholasMamo: Great podcast episode about the repercussions of the ongoing pandemic on French football, as well as a brilliant short segment on how we're giving too much importance to TV rights, and too little to the supporters."
    self.assertEqual(expected, tweet_cleaner.clean(raw))
def test_split_hashtag_repeated(self):
    """
    Test that every occurrence of a repeated hashtag is split.
    """

    tweet_cleaner = TweetCleaner(split_hashtags=True, collapse_whitespaces=True)

    raw = "The Vardy party has gone very quiet π€ π’ #LeiChe #LeiChe"
    expected = "The Vardy party has gone very quiet π€ π’ Lei Che Lei Che"
    self.assertEqual(expected, tweet_cleaner.clean(raw))
def test_remove_url_with_subdomain(self):
    """
    Test that URLs that include a subdomain are removed.
    """

    tweet_cleaner = TweetCleaner(remove_urls=True)

    raw = 'Visit Multiplex\'s documentation for more information: https://nicholasmamo.github.io/multiplex-plot/'
    expected = 'Visit Multiplex\'s documentation for more information:'
    cleaned = tweet_cleaner.clean(raw)
    self.assertEqual(expected, cleaned)
def test_remove_url_retain(self):
    """
    Test the URL retention functionality.
    When URL removal is not requested, the URL should be left in the text.
    """

    # The flag is off: this test covers retention, not removal.
    cleaner = TweetCleaner(remove_urls=False)

    text = 'Thank you @BillGates. It\'s amazing, almost as incredible as the fact that you use Gmail. https://t.co/drawyFHHQM'

    # The tweet, including its URL, is expected to come back unchanged.
    self.assertEqual(text, cleaner.clean(text))
def test_remove_hashtags_with_splitting(self):
    """
    Test that when hashtags are both split and removed, the words of the split hashtags are kept.
    """

    tweet_cleaner = TweetCleaner(remove_hashtags=True, split_hashtags=True,
                                 collapse_whitespaces=True)

    raw = "The Vardy party has gone very quiet π€ π’ #FPL #LeiChe"

    # '#FPL' is expected to be removed, whereas '#LeiChe' is expected to be split.
    expected = "The Vardy party has gone very quiet π€ π’ Lei Che"
    self.assertEqual(expected, tweet_cleaner.clean(raw))
def test_capitalize_first(self):
    """
    Test that the tweet cleaner calls the function to capitalize the first character.
    """

    cleaner = TweetCleaner(remove_alt_codes=True, complete_sentences=True,
                           collapse_new_lines=True, collapse_whitespaces=True,
                           capitalize_first=True)

    text = "allez l'OL."

    # Only the first character is expected to change.
    self.assertEqual("Allez l'OL.", cleaner.clean(text))
def test_replace_mentions_retain_unknown(self):
    """
    Test that mentions which are not listed in the tweet's user mentions are retained.
    """

    tweet_cleaner = TweetCleaner(replace_mentions=True)

    raw = "RT @Quantum_Stat: Python visualization library Multiplex: It looks amazing, great job @nicholasmamo"

    # Only one account is listed as a mention: '@Quantum_Stat' is unknown.
    mentions = [{
        "screen_name": "NicholasMamo",
        "name": "Nicholas Mamo",
    }]
    tweet = {'entities': {'user_mentions': mentions}}

    expected = "RT @Quantum_Stat: Python visualization library Multiplex: It looks amazing, great job Nicholas Mamo"
    self.assertEqual(expected, tweet_cleaner.clean(raw, tweet))
def test_replace_mentions_multiple_times(self):
    """
    Test that every occurrence of a mention that appears more than once is replaced.
    """

    tweet_cleaner = TweetCleaner(replace_mentions=True)

    # The same account is mentioned twice, with different capitalization.
    raw = "Python visualization library Multiplex by @NicholasMamo: It looks amazing, great job @nicholasmamo"

    mentions = [{
        "screen_name": "NicholasMamo",
        "name": "Nicholas Mamo",
    }]
    tweet = {'entities': {'user_mentions': mentions}}

    expected = "Python visualization library Multiplex by Nicholas Mamo: It looks amazing, great job Nicholas Mamo"
    self.assertEqual(expected, tweet_cleaner.clean(raw, tweet))
def test_replace_mentions_case_insensitive(self):
    """
    Test that mentions are replaced regardless of the case in which they are written.
    """

    tweet_cleaner = TweetCleaner(replace_mentions=True)

    # The mention in the text is all lowercase, unlike the listed screen name.
    raw = "Python visualization library Multiplex: It looks amazing, great job @nicholasmamo"

    mentions = [{
        "screen_name": "NicholasMamo",
        "name": "Nicholas Mamo",
    }]
    tweet = {'entities': {'user_mentions': mentions}}

    expected = "Python visualization library Multiplex: It looks amazing, great job Nicholas Mamo"
    self.assertEqual(expected, tweet_cleaner.clean(raw, tweet))