def test_case_lowering(self):
    T = tokenizer.TweetTokenizer(preserve_case=False)
    text = 'This is a tweet with Upper Cases and :-D emoticon'
    # make sure it doesn't lowercase the emoticon
    actual = T.tokenize(text)
    expected = ['this', 'is', 'a', 'tweet', 'with', 'upper',
                'cases', 'and', ':-D', 'emoticon']
    self.assertEqual(actual, expected)
import re

import tokenizer

T = tokenizer.TweetTokenizer()

with open('nyusha_nyusha.txt', 'r', encoding='utf-8') as f:
    tweet = f.read()

tokens = T.tokenize(tweet)

scanner = re.Scanner([
    # URLs: an http(s)/www prefix, or a bare domain with a 2-3 letter TLD
    # (the original quantifier {2-3} was a typo for {2,3})
    (r"((https?:\/\/|www)|\w+\.(\w{2,3}))([\w\!#$&-;=\?\-\[\]~]|%[0-9a-fA-F]{2})+",
     lambda scanner, token: ("URL", token)),
    # phone numbers such as +7(912)345-67-89
    (r"^\+?\d?[-\(]?\d{3}[\)-]?\d{3}-?\d{2}-?\d{2}$",
     lambda scanner, token: ("PHONE", token)),
    (r"(?:@\w+)", lambda scanner, token: ("USER", token)),
    (r"(?:\#\w+)", lambda scanner, token: ("HASHTAG", token)),
    (r"[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]",
     lambda scanner, token: ("EMAILS", token)),
    (r"<[^>\s]+>", lambda scanner, token: ("HTML-TAGS", token)),
    (r"[\-]+>|<[\-]+", lambda scanner, token: ("ASCII_ARROWS", token)),
    # a lone hash immediately preceding a word (the word itself is matched later)
    (r"#(?=\w+)", lambda scanner, token: ("HASH", token)),
    (r"^(RT)", lambda scanner, token: ("RETWEET", token)),
    # fixed inventory of emoji seen in the corpus, plus the U+FE0F variation
    # selector (the original class also listed literal '|' separators and
    # duplicates, which a character class does not need)
    (r"[😎😘☺😍❤☝🏻🙈🙌✨😋😳😌🐾✈💋😽☀💪👉🎬😂😇✊☕😜😬🔥😀\ufe0f]",
     lambda scanner, token: ("EMOJI", token)),
    # ASCII emoticons, including Cyrillic variants such as ":с"
    (r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP*/\:\}\{@\|\\]|[\)\]\(\[dDpPсСрР3/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)",
     lambda scanner, token: ("SMILE", token)),
    (r'[!$%&",.;:?*«»()\-]+', lambda scanner, token: ("PUNCT", token)),
    # words, numbers with internal separators, ellipses, or any single character
    (r"(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])|(?:[+\-]?\d+[,/.:-]\d+[+\-]?)|(?:[\w_]+)|(?:\.(?:\s*\.){1,})|(?:\S)",
     lambda scanner, token: ("WORD", token)),
], re.UNICODE)

with open('output.txt', 'w', encoding='utf-8') as f2:
    for i, tok in enumerate(tokens):
        results, remainder = scanner.scan(tok)
        for tag, value in results:
            f2.write('{}\t{}\t{}\r\n'.format(i, tag, value))
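# A quick sanity check of the lexicon order (a hedged example; '@nyusha' is an
# illustrative token, not from the corpus): USER appears before EMAILS in the
# lexicon, so a bare handle is tagged USER rather than falling through.
results, remainder = scanner.scan('@nyusha')
assert results == [('USER', '@nyusha')] and remainder == ''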
import pandas as pd
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import tokenizer

# keep negation words out of the stop-word list
notstopwords = set(('not', 'no', 'mustn', "mustn't"))
stopwords = set(stopwords.words('english')) - notstopwords

text_processor = TextPreProcessor(
    normalize=['url', 'email', 'user'],
    fix_html=True,  # fix HTML tokens
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=True,
    tokenizer=SocialTokenizer(lowercase=True).tokenize)

lemmatizer = WordNetLemmatizer()
T = tokenizer.TweetTokenizer(preserve_handles=False, preserve_hashes=False,
                             preserve_case=False, preserve_url=False,
                             regularize=True)


def data_preprocessing(path_tweets):
    tweets = pd.read_csv(path_tweets, encoding='utf-8', sep=',')
    tweets['text'] = tweets['text'].apply(lambda x: standardization(x))
    # map airline_sentiment labels to integers: negative=0, neutral=1, positive=2
    tweets['sentiment'] = tweets['airline_sentiment'].apply(
        lambda x: 0 if x == 'negative' else (1 if x == 'neutral' else 2))
    return tweets['text'], tweets['sentiment']


# NOTE: this second definition shadows data_preprocessing(path_tweets) above.
def data_preprocessing(path_tweets, corpora):
    data = pd.read_csv(path_tweets, encoding='utf-8', sep=',')
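# `standardization` is called above but not defined in this snippet. A minimal
# sketch of one plausible implementation, assuming it chains the ekphrasis
# pre-processor, the stop-word filter, and the WordNet lemmatizer configured
# above (this pipeline is an assumption, not the original code):
def standardization(text):
    tokens = text_processor.pre_process_doc(text)  # normalize + tokenize
    tokens = [lemmatizer.lemmatize(t) for t in tokens
              if t not in stopwords]               # drop stop words, lemmatize
    return ' '.join(tokens)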
def test_regularization(self):
    T = tokenizer.TweetTokenizer(regularize=True)
    text = "I'd've had to figure this out"
    actual = T.tokenize(text)
    expected = ['I', 'would', 'have', 'had', 'to', 'figure', 'this', 'out']
    self.assertEqual(actual, expected)
def test_shortening(self):
    T = tokenizer.TweetTokenizer(preserve_len=False)
    # runs of repeated characters are capped at three
    text = 'This is a loooooong tweettttt'
    actual = T.tokenize(text)
    expected = ['This', 'is', 'a', 'looong', 'tweettt']
    self.assertEqual(actual, expected)
def test_url_removal(self):
    T = tokenizer.TweetTokenizer(preserve_url=False)
    text = 'this is a url https://t.co/1234_MOD tweet'
    actual = T.tokenize(text)
    expected = ['this', 'is', 'a', 'url', 'tweet']
    self.assertEqual(actual, expected)
def test_handle_removal(self):
    T = tokenizer.TweetTokenizer(preserve_handles=False)
    text = '@somehandle a tweet at that person'
    actual = T.tokenize(text)
    expected = ['a', 'tweet', 'at', 'that', 'person']
    self.assertEqual(actual, expected)
def test_hash_removal(self):
    T = tokenizer.TweetTokenizer(preserve_hashes=False)
    text = 'this has #hash_tag and # separately'
    actual = T.tokenize(text)
    expected = ['this', 'has', 'hash_tag', 'and', '#', 'separately']
    self.assertEqual(actual, expected)
def setUp(self):
    self.T = tokenizer.TweetTokenizer()
def test_emoji(self):
    T = tokenizer.TweetTokenizer(preserve_emoji=False)
    text = "This is a tweet with😊"  # no space between text and emoji
    actual = T.tokenize(text)
    expected = ['This', 'is', 'a', 'tweet', 'with']
    self.assertEqual(actual, expected)
def setUp(self):
    self.T = tokenizer.TweetTokenizer()
    self.redditT = tokenizer.RedditTokenizer()
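# The test methods above read like unittest.TestCase members. A minimal
# harness sketch, assuming a local `tokenizer` module and collecting the
# methods into one case (the class name here is illustrative):
import unittest

import tokenizer


class TweetTokenizerTest(unittest.TestCase):
    def setUp(self):
        self.T = tokenizer.TweetTokenizer()

    # ... the test_* methods shown above would live here ...


if __name__ == '__main__':
    unittest.main()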