def tokenize_tweet_content_to_types(dataset: pd.DataFrame, tokenize_type_list: List[str]) -> pd.DataFrame:
    """Tokenize the configured content types in every tweet.

    E.g. 'something #DataScience' becomes 'something $HASHTAG$'.
    """
    filter_options = get_filter_objects_as_tuple(tokenize_type_list)
    p.set_options(*filter_options)
    dataset["text"] = dataset["text"].apply(p.tokenize)
    return dataset
def cleaning(string):
    """Strip emojis, noise characters and short-forms from a tweet string.

    Removes pictographs, the literal 'RT', assorted punctuation, expands
    short-forms via translator(), and finally strips emoji/smiley/mentions
    with tweet-preprocessor.
    """
    # Emoji patterns (various Unicode pictograph ranges)
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)
    string = emoji_pattern.sub(r"", string)
    string = re.sub("RT", "", string)
    string = re.sub("#", "", string)
    # FIX: raw strings for patterns containing backslashes — "\+" and "\/"
    # are invalid escape sequences in ordinary string literals.
    string = re.sub(r"\+", "", string)
    string = re.sub(r"\/", " or ", string)
    string = re.sub("&", " and ", string)
    string = re.sub("-", " ", string)
    string = re.sub("'", "", string)
    string = re.sub("🤔", "", string)
    string = re.sub("[¦]", "", string)
    string = re.sub("[()@:<>{}`=~|.,%_]", " ", string)
    # Replace short-forms & acronyms with proper English
    string = translator(string)
    p.set_options(p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.MENTION)
    result = p.clean(string)
    return result
def gettweets():
    """Fetch up to 300 English 'Reckful' tweets (no retweets), score sentiment and write CSV rows."""
    # tweet_mode='extended' is required to receive the full text instead of
    # tweepy's 120-char default (.full_text instead of .text).
    cursor = tweepy.Cursor(api.search, q='Reckful -filter:retweets',
                           lang="en", tweet_mode='extended')
    for tweet in cursor.items(300):
        # Retweets carry their text/likes on .retweeted_status; plain
        # tweets raise AttributeError there.
        try:
            stringtweet = tweet.retweeted_status.full_text
            numlikes = tweet.retweeted_status.favorite_count
            retweetedstatus = "True"
        except AttributeError:  # Not a Retweet
            stringtweet = tweet.full_text
            numlikes = tweet.favorite_count
            retweetedstatus = "False"
        # Strip URLs, reserved words, emoji, smileys and numbers.
        # Hashtags and mentions are kept on purpose; add p.OPT.HASHTAG /
        # p.OPT.MENTION to remove them too.
        p.set_options(p.OPT.URL, p.OPT.RESERVED, p.OPT.EMOJI,
                      p.OPT.SMILEY, p.OPT.NUMBER)
        clean_text = p.clean(stringtweet)
        # TextBlob sentiment is a (polarity, subjectivity) tuple.
        sentimenttweet = TextBlob(clean_text)
        csvWriter.writerow((clean_text,
                            sentimenttweet.sentiment.polarity,
                            sentimenttweet.sentiment.subjectivity,
                            tweet.user.screen_name,
                            tweet.user.location,
                            numlikes,
                            tweet.retweet_count,
                            tweet.created_at))
def preprocess(tweet):
    """Lower-case, tokenize and normalise a raw tweet into a cleaned string."""
    tweet = tweet.lower()
    tweet = re.sub(r'\\n', ' ', tweet)
    # give a URL glued to preceding text its own whitespace-separated token
    tweet = re.sub(r'(\S)(https?):', r'\1 \2:', tweet)
    p.set_options(p.OPT.MENTION, p.OPT.URL, p.OPT.EMOJI, p.OPT.HASHTAG)
    tweet = p.tokenize(tweet)
    word_tokenizer = nltk.tokenize.TweetTokenizer()
    tweet = ' '.join(word_tokenizer.tokenize(tweet))
    # re-glue placeholders the tokenizer split apart: "$ URL $" -> "$URL$"
    tweet = re.sub(r'\$ ([A-Z]+?) \$', r'$\1$', tweet)
    ### Stopwords removal ###
    # language comes from the environment — presumably an NLTK corpus name
    stop_words = set(stopwords.words(os.getenv('LANGUAGE')))
    kept = [w for w in tweet.split(' ') if w not in stop_words]
    tweet = ' '.join(kept)
    tweet = unidecode.unidecode(tweet)
    p.set_options(p.OPT.NUMBER)
    tweet = p.tokenize(tweet)
    # collapse repeated punctuation / laughter into semantic tokens
    tweet = re.sub(r'([!¡]\s?){3,}', r' $EXCLAMATION$ ', tweet)
    tweet = re.sub(r'([¿?]\s?){3,}', r' $QUESTION$ ', tweet)
    tweet = re.sub(r'(\.\s?){3,}', r' $ELLIPSIS$ ', tweet)
    tweet = re.sub(r'\b(?:a*(?:(h+|j+)a+|s+)+(h+|j+)?|(?:l+o+)+l+)\b',
                   r' $LOL$ ', tweet, flags=re.I)
    return tweet
def cleanup_text(docs, logging=False):
    """Clean, lemmatize and de-noise a collection of documents.

    Strips tweet artifacts, expands contractions, lemmatizes with spaCy,
    drops stopwords/punctuation/number-only tokens and, corpus-wide, the
    50 rarest tokens. Returns a pd.Series of cleaned texts.
    """
    texts = []
    full_tokens = []
    # FIX: the option set is loop-invariant — configure the preprocessor
    # once instead of on every document.
    pr.set_options(pr.OPT.URL, pr.OPT.EMOJI, pr.OPT.HASHTAG, pr.OPT.MENTION,
                   pr.OPT.RESERVED, pr.OPT.SMILEY, pr.OPT.NUMBER)
    counter = 1
    for doc in docs:
        if counter % 1000 == 0 and logging:
            print("Processed %d out of %d documents." % (counter, len(docs)))
        counter += 1
        doc = pr.clean(doc)
        doc = con.fix(doc)  # expand contractions — TODO confirm con is `contractions`
        doc = ''.join(ch for ch in doc if ord(ch) < 128)  # strip non-ASCII
        doc = nlp(doc, disable=['parser', 'ner'])
        tokens = [tok.lemma_.lower().strip() for tok in doc
                  if tok.lemma_ != '-PRON-']
        tokens = [tok for tok in tokens
                  if tok not in stopwords and tok not in punctuations
                  and re.sub("[0-9]*", '', tok) != '']
        full_tokens += tokens
        texts.append(' '.join(tokens))
    # drop the 50 rarest tokens across the whole corpus
    rare_tuple = nltk.FreqDist(full_tokens).most_common()[-50:]
    rare_words = [tup[0] for tup in rare_tuple]
    final_texts = []
    for text in texts:
        kept = [tok for tok in text.split(' ') if tok not in rare_words]
        final_texts.append(' '.join(kept))
    return pd.Series(final_texts)
def preprocessor_tweet(s):
    """Normalise a Colombian-politics tweet.

    Known candidate handles are turned into plain words *before* the MENTION
    option would strip them; then emoji/urls/reserved/smileys/mentions,
    laughter and non-word characters are removed and accents stripped.
    """
    tweet_p.set_options(tweet_p.OPT.EMOJI, tweet_p.OPT.URL,
                        tweet_p.OPT.RESERVED, tweet_p.OPT.SMILEY,
                        tweet_p.OPT.MENTION)
    # keep these handles as ordinary tokens so the MENTION pass spares them
    for handle in ('petrogustavo', 'sergio_fajardo', 'IvanDuque',
                   'AlvaroUribeVel', 'JuanManSantos', 'German_Vargas',
                   'ClaudiaLopez', 'DeLaCalleHum'):
        s = re.sub(r'@' + handle, handle, s)
    s = tweet_p.clean(s)
    # collapse laughter ("jajaja", "looool") into whitespace
    s = re.sub(r'\b(?:a*(?:ja)+h?|(?:l+o+)+l+)\b', ' ', s)
    s = re.sub(r'[^\w]', ' ', s)
    s = strip_accents_unicode(s.lower())
    s = tweet_p.clean(s)
    return s
def clean(lang):
    '''
    Remove duplicate Tweets and do some tweet cleaning.

    Reads data/TweetText_Label/<lang>_tweet_label.csv and writes the
    cleaned Text/HandLabels rows to data/clean/<lang>.csv.
    '''
    merged_file = 'data/TweetText_Label/{}_tweet_label.csv'.format(lang)
    clean_file = 'data/clean/{}.csv'.format(lang)
    merged = pd.read_csv(merged_file, engine='python')
    #merged.drop_duplicates(['Tweet text'], inplace=True)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)
    text = []
    labels = []
    for index, row in merged.iterrows():
        try:
            label = row['SentLabel']
            clean_row = p.clean(row['Tweet text'])
            text.append(clean_row)
            labels.append(label)
            if index % 1000 == 0:
                print(index)
        # FIX: a bare `except:` also swallowed KeyboardInterrupt/SystemExit;
        # catch Exception so only row-level data errors are skipped.
        except Exception:
            continue
    cleaned = pd.DataFrame({'Text': text, 'HandLabels': labels})
    cleaned.to_csv(clean_file)
def test_set_options(self):
    """With only the URL option active, URLs are parsed and hashtags are not."""
    tweet = "Preprocessor now has custom #options support! https://github.com/s/preprocessor"
    p.set_options(p.OPT.URL)
    result = p.parse(tweet)
    self.assertIsNotNone(result.urls)
    self.assertIsNone(result.hashtags)
def __init__(self, setname, embeddings, example_length=30, range=None):
    """Load one split of the tweet dataset.

    NOTE: the `range` parameter shadows the builtin, but renaming it would
    break keyword callers, so the name is kept. When given, it must be a
    2-element (start, end) slice of the index.
    """
    assert setname in ['train1', 'train2', 'train3', 'train4',
                       'train', 'test', 'val']
    self.example_length = example_length
    self.setname = setname
    self.path = os.path.join(dir_path, setname)
    # index.csv maps positions in the dataset to tweet ids
    index_file = os.path.join(self.path, 'index.csv')
    self.index = pd.read_csv(index_file, index_col=0)
    if range is not None and len(range) == 2:
        self.index = self.index[range[0]:range[1]].reset_index()
    self.len = len(self.index)
    # cleaner that removes only URLs
    p.set_options(p.OPT.URL)
    self.clean = p.clean
    # matches everything except alphanumerics and whitespace
    self.pattern = re.compile(r'([^\s\w]|_)+')
    # dict mapping word -> embedding vector
    self.embeddings = embeddings
def clean_tweet(tweet):
    """Clean a raw tweet and tokenize hashtags/URLs to <hashtag>/<url>."""
    # removes @ mentions, emojis, twitter reserved words, smileys and numbers
    p.set_options(p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED,
                  p.OPT.SMILEY, p.OPT.NUMBER)
    clean = p.clean(tweet)
    # transforms every url to "<url>" token and every hashtag to "<hashtag>" token
    p.set_options(p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY,
                  p.OPT.NUMBER, p.OPT.HASHTAG, p.OPT.URL)
    clean = p.tokenize(clean)
    clean = re.sub(r'\$HASHTAG\$', '<hashtag>', clean)
    clean = re.sub(r'\$URL\$', '<url>', clean)
    # preprocessor doesn't seem to clean all emojis so we run text through
    # the emoji regex to clean leftovers.
    # NOTE(review): emoji.get_emoji_regexp() was removed in emoji>=2.0 —
    # pin emoji<2 or migrate to emoji.replace_emoji() when upgrading.
    clean = re.sub(emoji.get_emoji_regexp(), '', clean)
    # removing zero-width character which is often bundled with emojis
    clean = re.sub(u'\ufe0f', '', clean)
    # collapse multiple spaces into one
    clean = re.sub(r' +', ' ', clean)
    # FIX: unescape the HTML entities for > and < — the previous code
    # substituted '>' with '>' and '<' with '<', which were no-ops.
    clean = re.sub(r'&gt;', '>', clean)
    clean = re.sub(r'&lt;', '<', clean)
    # strip any leftover spaces at the beginning and end
    return clean.strip()
def first_step(directory, fname, ext, language):
    """First preprocessing pass over a raw tweet-stream dump.

    Reads `fname` (one JSON tweet per line, first line skipped) inside
    `directory`, keeps only tweets whose detected language matches
    `language`, cleans them (URLs/emoji/mentions removed, then the C
    helper conversions), and writes them to '<base>Preprocess1<LANG>.<ext>'
    either as ';'-separated lines or as JSON lines depending on `ext`.

    NOTE(review): the output file is opened in binary mode ('wb') but str
    data is written to it — that only works on Python 2; confirm the
    target interpreter before reuse.
    """
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)
    os.chdir(directory)
    f_path = fname
    # e.g. 'FooStream.json' -> 'FooPreprocess1EN.<ext>'
    f_out_p = fname.split(".")[0].replace('Stream', '') + "Preprocess1" + language.upper() + "." + ext
    fo = open(f_out_p, 'wb')
    if not ext == 'json':
        # header row for the ';'-separated output format
        fo.write("id;created_at;text" + "\n")
    with open(f_path, 'r') as FILE:
        next(FILE)  # skip the header line of the input dump
        i = 0
        for line in FILE:
            tweet = json.loads(line)
            # prefer the untruncated text when the tweet was extended
            if 'extended_tweet' in tweet:
                extended_tweet = tweet['extended_tweet']
                text = extended_tweet['full_text'].encode('unicode_escape')
            else:
                text = tweet['text'].encode('unicode_escape')
            try:
                if detect(text) == language.lower():
                    text = p.clean(text)
                    text = C.hexadecimal_conversion(text)
                    text = C.expression_clean(text)
                    if not ext == 'json':
                        fo.write("%s;%s;%s\n" % (i, tweet['created_at'], text))
                    elif ext == 'json':
                        twt = {"id": i, "created_at": tweet['created_at'], "text": text.decode('unicode_escape')}
                        fo.write(json.dumps(twt) + '\n')
                    i += 1
            except lang_detect_exception.LangDetectException:
                # language detection can fail on very short/odd text; skip it
                print("Lang Detect exception for: ", text)
    fo.close()
    return
def csv_read_and_write(read_path, write_path1, write_path2):
    """Split rows of read_path by tweet language.

    English rows (per detect_language on the cleaned tweet in column 4)
    go to write_path1, everything else to write_path2.
    """
    with open(write_path1, 'w') as outFile1, open(write_path2, 'w') as outFile2:
        file_writer1 = csv.writer(outFile1)
        file_writer2 = csv.writer(outFile2)
        # FIX: configure the preprocessor once — the option set never
        # changes, so calling set_options on every row was wasted work.
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG)
        i = 1
        with open(read_path, 'r') as inFile:
            fileReader = csv.reader(inFile)
            for row in fileReader:
                tweet = row[4]
                cleaned_tweet = p.clean(tweet)
                # NOTE(review): `unicode` exists only on Python 2 — this
                # function targets py2; port to str/bytes for Python 3.
                cleaned_tweet = unicode(cleaned_tweet, 'utf-8')
                # detect whether the cleaned tweet is English
                is_english = detect_language(cleaned_tweet)
                print(is_english)
                data = [row[0], row[1], row[2], row[3], row[4],
                        row[5], row[6], row[7], row[8], row[9]]
                if is_english is True:
                    file_writer1.writerow(data)
                else:
                    file_writer2.writerow(data)
                i = i + 1
def clean_documents(documents):
    """Clean each document: URLs/emoji/smileys/numbers, stopwords, mentions, punctuation, digits."""
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)
    punct_pattern = r'[%s]' % re.escape(string.punctuation)
    cleaned = []
    for doc in documents:
        text = doc.lower()                      # lowercase the document
        text = p.clean(text)                    # urls, emoji, smileys, numbers
        text = remove_stopwords(text)           # stop-word removal
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)   # non-ASCII / unicode
        text = re.sub(r'@\w+', '', text)             # mentions
        text = re.sub(punct_pattern, ' ', text)      # punctuation
        text = re.sub(r'[0-9]', '', text)            # digits
        cleaned.append(text)
    return cleaned
def preprocess_tweet(tweet):
    """Lower-case a tweet, drop non-ASCII bytes, strip URLs/emoji/mentions/hashtags."""
    lowered = tweet.lower()
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.HASHTAG)
    # ascii/ignore drops any non-ASCII characters before cleaning
    #cleaned_tweet = remove_stopwords(cleaned_tweet)  # remove stopwords
    return p.clean(lowered.encode("ascii", "ignore"))
def clean_tweet(self, tweet):
    """Remove URLs from *tweet* via the tweet-preprocessor library."""
    p.set_options(p.OPT.URL)
    return p.clean(tweet)
def test_set_options(self):
    """Only the URL option is active, so hashtags must not be captured."""
    tweet = 'Preprocessor now has custom #options support! https://github.com/s/preprocessor'
    p.set_options(p.OPT.URL)
    parsed = p.parse(tweet)
    self.assertIsNone(parsed.hashtags)
    self.assertIsNotNone(parsed.urls)
def csv_read_and_write(read_path, write_path):
    """Copy rows from read_path to write_path with column 4 (the tweet) cleaned.

    NOTE(review): 'wb' + csv.writer is the Python-2 convention; on py3 this
    should be mode 'w' with newline='' — confirm the target interpreter.
    """
    with open(write_path, 'wb') as outFile1:
        file_writer1 = csv.writer(outFile1)
        # FIX: hoisted out of the row loop — the option set is constant.
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG)
        i = 1
        with open(read_path, 'r') as inFile:
            fileReader = csv.reader(inFile)
            for row in fileReader:
                cleaned_tweet = p.clean(row[4])
                print(i)
                print(cleaned_tweet)
                data = [row[0], row[1], row[2], row[3], cleaned_tweet,
                        row[5], row[6], row[7], row[8], row[9]]
                file_writer1.writerow(data)
                i = i + 1
def on_data(self, data):
    """Stream callback: append non-retweet tweets (emoji/smiley-cleaned) to the CSV log."""
    p.set_options(p.OPT.EMOJI, p.OPT.SMILEY)
    # crude substring test keeps retweets out without parsing every payload
    if not 'retweeted_status' in data:
        decoded = json.loads(data)
        write_txt = p.clean(decoded['text'])
        if 'extended_tweet' in data:
            # FIX: the bare `except:` also hid real failures (and would
            # swallow KeyboardInterrupt); only a missing key should fall
            # back to the truncated text.
            try:
                write_txt = p.clean(decoded['extended_tweet']['full_text'])
            except KeyError:
                pass
        with open('data/streaming_tweets_save.csv', 'a', encoding='utf-8', newline='') as file:
            csvwriter = csv.writer(file)
            csvwriter.writerow([
                decoded['id'], decoded['created_at'], write_txt,
                decoded['retweet_count'], decoded['favorite_count'],
                decoded['user']['screen_name'], decoded['user']['name'],
                decoded['user']['verified'], decoded['user']['followers_count'],
                decoded['user']['friends_count'], decoded['source'],
                decoded['user']['url']
            ])
    return True

def on_error(self, status):
    """Stream callback: report the error status code from Twitter."""
    print('\nERROR status = ', status)
def test_clean_urls(self):
    """URL removal across plain, unicode-embedded, bracketed and percent-encoded URLs."""
    p.set_options(p.OPT.URL)
    cases = [
        ('canbe foundathttp://www.osp.gatech.edu/rates/(http://www.osp.gatech.edu/rates/).',
         "canbe foundat."),
        ('Nature:先日フランスで起きた臨床試験事故https://t.co/aHk5ok9CDg 原因究明まだなので早急な印象がするけど、低用量投与を1回'
         'やった後で、(別のボランティアに)高用量の投与とかしてる試験方式にも問題があるだろうみたいなことを書いてる',
         'Nature:先日フランスで起きた臨床試験事故 原因究明まだなので早急な印象がするけど、'
         '低用量投与を1回やった後で、(別のボランティアに)高用量の投与とかしてる試験方式にも問題があるだろうみたいなことを書いてる'),
        ('[https://link.springer.com/article/10.1007/s10940\\-016\\-9314\\-9]',
         '[]'),
        ('(https://link.springer.com/article/10.1007/s10940-016-9314-9)',
         '()'),
        ('check this link: https://fa.wikipedia.org/wiki/%D8%AD%D9%85%D9%84%D9%87_%D8%A8%D9%87_%DA%A9%D9%88%DB%8C'
         '_%D8%AF%D8%A7%D9%86%D8%B4%DA%AF%D8%A7%D9%87_%D8%AA%D9%87%D8%B1%D8%A7%D9%86_(%DB%B1%DB%B8%E2%80%93%DB%B2%'
         'DB%B3_%D8%AA%DB%8C%D8%B1_%DB%B1%DB%B3%DB%B7%DB%B8) …',
         'check this link: …'),
    ]
    for tweet, expected in cases:
        self.assertEqual(expected, p.clean(tweet))
def clean_tweet(tweet):
    """Mutate tweet['text'] in place: lower-case and strip URLs/emoji/smileys/numbers."""
    # May want to change these options
    prepro.set_options(prepro.OPT.URL, prepro.OPT.EMOJI,
                       prepro.OPT.SMILEY, prepro.OPT.NUMBER)
    tweet["text"] = prepro.clean(tweet["text"].lower())
def is_one_canditate_mentioned(tweet):
    """Return True iff the tweet mentions its own candidate and not the opponent.

    NOTE: the misspelling 'canditate' in the name is kept — renaming
    would break callers.
    """
    trumps_names = ["donald", "trump"]
    bidens_names = ["joe", "biden"]
    opponent_names = {
        "Donald Trump lang:en": bidens_names,
        "Trump lang:en": bidens_names,
        "Joe Biden lang:en": trumps_names,
        "Biden lang:en": trumps_names,
    }
    candidate_names = {
        "Donald Trump lang:en": trumps_names,
        "Trump lang:en": trumps_names,
        "Joe Biden lang:en": bidens_names,
        "Biden lang:en": bidens_names,
    }
    contents = tweet["text"].lower()
    prepro.set_options(prepro.OPT.URL, prepro.OPT.HASHTAG)
    clean_contents = prepro.clean(contents)
    # FIX (idiom): substring membership `name in s` replaces the clunkier
    # `s.find(name) != -1`; behavior is identical.
    if any(name in clean_contents for name in opponent_names[tweet["Candidate"]]):
        return False
    return any(name in clean_contents for name in candidate_names[tweet["Candidate"]])
def test_clean_reserved_words(self):
    """RESERVED strips standalone RT/FAV but leaves @RT, #RT and ordinary words alone."""
    tweet = "Awesome!!! RT @RT: This is a tweet about art ART. FAV #RT #FAV #hashtag"
    p.set_options(p.OPT.RESERVED)
    expected = 'Awesome!!! @RT: This is a tweet about art ART. #RT #FAV #hashtag'
    self.assertEqual(expected, p.clean(tweet))
def _cleanTweet(self, tweet):
    """Strip links, mentions, reserved words (FAV, RT, etc.) and the characters # ? " from a tweet."""
    # set preprocessor to remove links, mentions, and reserved words
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)
    # FIX: str.translate(None, chars) is Python-2-only; the mapping form
    # deletes the same characters and works on Python 3 (and on py2
    # unicode strings).
    tweet = p.clean(tweet.translate({ord(c): None for c in '#?"'}))
    return tweet
def test_tokenize(self):
    """Each enabled element is replaced by its $TOKEN$ placeholder."""
    tweet = "Packathon was a really #nice :) challenging 👌. @packathonorg http://packathon.org"
    p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION,
                  p.OPT.EMOJI, p.OPT.SMILEY)
    expected = "Packathon was a really $HASHTAG$ $SMILEY$ challenging $EMOJI$. $MENTION$ $URL$"
    self.assertEqual(p.tokenize(tweet), expected)
def clean_tweet(text):
    """Clean, tokenize, stem and label-transform a tweet into a space-joined token string."""
    p.set_options(p.OPT.URL, p.OPT.MENTION)
    stemmer = PorterStemmer()
    text = p.clean(text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text.strip())
    # stem each kept word, then collapse laughter and tag negative words
    processed = [negative_transformer(laugh_transformer(stemmer.stem(w)))
                 for w in tokens if w not in words_to_remove]
    # vowel de-duplication (coooool -> cool) intentionally disabled — it
    # reduced performance in earlier experiments
    # TODO: slang correction from an input dictionary
    # TODO: spelling correction via PyEnchant (not working on kernels)
    return ' '.join(processed)
def compareWithOriginal(tweet_database):
    """Drop generated tweets that duplicate training tweets; URL-clean the survivors.

    Returns the list of non-duplicate, URL-cleaned tweets and prints how
    many duplicates were found.
    """
    from collections import Counter  # local import: only needed here
    orig_tweet = []
    # collect all original training tweets (separator lines excluded)
    with open('./twitterScrubber/cleanData/new_FakeKenty_tweets_clean_train.txt',
              'r', encoding='utf-8') as fp:
        tweet = fp.readline()
        while tweet:
            if tweet.strip() != "==========":
                orig_tweet.append(tweet)
            tweet = fp.readline()
    # FIX: Counter gives O(1) duplicate lookup instead of the original
    # O(n*m) nested scan, while still counting every matching original line.
    orig_counts = Counter(orig_tweet)
    # FIX: hoisted out of the loop — the option set is loop-invariant.
    p.set_options(p.OPT.URL)
    num = 0
    dup = 0
    newlist = []
    for tweet in tweet_database:
        matches = orig_counts.get(tweet, 0)
        dup += matches
        if matches == 0:
            tweet = p.clean(tweet)
            newlist.append(tweet)
        num += 1
    print('...Looped through {}/{} tweets and found {} duplicates'.format(
        num, len(tweet_database), dup))
    return newlist
def __init__(self, data_filepath=os.path.join('..', 'data', 'tweet_data.txt'), seed=3):
    """Load the tab-separated tweet dataset, clean it, one-hot the labels and split train/test."""
    self.seed = seed
    self.label_encoder = OneHotEncoder(sparse=False)
    # Two tab-separated columns: Tweet, Label; rows with missing values dropped
    df = pd.read_csv(data_filepath, header=None, names=['Tweet', 'Label'],
                     delimiter='\t').dropna()
    # strip mentions and URLs (hashtags intentionally kept; add p.OPT.HASHTAG to remove)
    p.set_options(p.OPT.MENTION, p.OPT.URL)
    data = df['Tweet'].apply(p.clean).tolist()
    # one-hot encode the labels
    labels = self.label_encoder.fit_transform(df['Label'].values.reshape(-1, 1))
    # 80/20 split, reproducible via the seed
    self.train_X, self.test_X, self.train_Y, self.test_Y = train_test_split(
        data, labels, test_size=0.2, random_state=self.seed)
def preprocess_tweet(tweet):
    """Lower-case a tweet and strip URLs, mentions and hashtags."""
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG)
    #cleaned_tweet = remove_stopwords(cleaned_tweet)  # remove stopwords
    return p.clean(tweet.lower())
def clean_data(connection, df_tweets):
    """
    Clean tweets of URLs and reserved keywords (RT, FAV) and persist the
    cleaned content back to the GoldenSet table.
    """
    cursor = connection.cursor()
    # FIX: option set is constant — configure once, not once per row.
    p.set_options(p.OPT.URL, p.OPT.RESERVED)
    for i in range(len(df_tweets)):
        print("ORIGINAL: ", df_tweets.loc[i, 'CONTENT'])
        row_id = df_tweets.loc[i, 'ID']
        cleaned_content = p.clean(df_tweets.loc[i, 'CONTENT'])
        # legacy behaviour kept: normalise double quotes to single quotes
        # so the stored CONTENT matches earlier runs
        cleaned_content = cleaned_content.replace('"', "'")
        df_tweets.at[i, 'CONTENT'] = cleaned_content
        print("CLEANED: ", df_tweets.at[i, 'CONTENT'])
        # SECURITY FIX: parameterized query instead of string concatenation —
        # tweet content is untrusted and the old concatenated UPDATE was
        # injectable. Uses qmark paramstyle; switch the placeholders to %s
        # if the driver uses format style (check connection.paramstyle).
        cursor.execute("UPDATE GoldenSet SET CONTENT = ? WHERE ID = ?",
                       (str(df_tweets.at[i, 'CONTENT']), str(row_id)))
        print('-' * 40 + '\n')
    cursor.close()
    print("\n\n\n----------------------- CLEANING DATA FINISHED -------------------------\n\n\n")
def prerocess_tweets_texts(texts: pd.Series) -> pd.Series:
    """
    Perform basic preprocessing before more elaborate preprocessing upon EDA.

    Removes URLs, emojis, mentions and smileys from every tweet text.

    Parameters
    ----------
    texts : pd.Series
        The texts of tweets to preprocess.

    Returns
    -------
    pd.Series
        The texts of tweets processed (original index preserved).
    """
    pptweet.set_options(pptweet.OPT.URL, pptweet.OPT.EMOJI,
                        pptweet.OPT.MENTION, pptweet.OPT.SMILEY)
    cleaned = [pptweet.clean(text) for text in texts]
    return pd.Series(data=cleaned, index=texts.index)
def csv_read_and_write(read_path, write_path):
    """Copy rows whose decoded+cleaned tweet passes is_prayformarawi_tweet to write_path.

    NOTE(review): 'wb' + csv.writer is the Python-2 convention — confirm
    the target interpreter before reuse.
    """
    with open(write_path, 'wb') as outFile:
        file_writer = csv.writer(outFile)
        # FIX: hoisted out of the loop — the preprocessor options never change.
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI)
        i = 1
        with open(read_path, 'r') as inFile:
            fileReader = csv.reader(inFile)
            for row in fileReader:
                tweet = row[4]
                print("raw tweet : " + tweet)
                decode = decode_tweet(tweet)
                if decode is not None:
                    tweet = decode
                tweet = p.clean(tweet)
                print("semi-cleaned tweet : " + tweet)
                verdict = is_prayformarawi_tweet(tweet)
                print(verdict)
                data = [row[0], row[1], row[2], row[3], row[4],
                        row[5], row[6], row[7], row[8], row[9]]
                if verdict is True:
                    file_writer.writerow(data)
                    # count only the rows actually written
                    i = i + 1
    print("#prayformarawi tweets count: ", i)
def function_udf(input_str):
    """Clean a tweet string: drop RT, URLs, emoji and mentions, keep plain words/digits only."""
    without_rt = re.sub(r'RT', '', input_str)
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)
    cleaned = p.clean(without_rt)
    # strip leftover mentions, non-alphanumerics and urls, then normalise whitespace
    stripped = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", cleaned)
    return ' '.join(stripped.split())
def set_sentence_(sentence):
    """Clean a sentence: drop URLs/emoji, expand hashtags, strip punctuation and accents."""
    p.set_options(p.OPT.URL, p.OPT.EMOJI)
    sentence = p.clean(sentence)
    # split hashtag words first, then remove the remaining hashtag markers
    sentence = hashtag_power(sentence)
    p.set_options(p.OPT.HASHTAG)
    sentence = p.clean(sentence)
    sentence = punc(sentence)
    return Enleve_Accents(sentence)
def __init__(self):
    """Initialise the pipeline: NLTK resources, lazy members, tweet cleaner, then DB load."""
    self.load_nltk()
    # populated later by the loading/training steps
    self.model = None
    self.abbreviations = None
    self.spell_check = None
    self.session = None
    # strip URLs, mentions, reserved words, emoji and smileys from tweets
    preprocessor.set_options(preprocessor.OPT.URL,
                             preprocessor.OPT.MENTION,
                             preprocessor.OPT.RESERVED,
                             preprocessor.OPT.EMOJI,
                             preprocessor.OPT.SMILEY)
    self.load_db_tweets()
def test_parse(self):
    """parse() captures exactly one of each enabled element; reserved words stay None."""
    tweet = "A tweet with #hashtag :) @mention 😀 and http://github.com/s."
    p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION,
                  p.OPT.EMOJI, p.OPT.SMILEY)
    parsed = p.parse(tweet)
    for attr in ('urls', 'hashtags', 'mentions', 'emojis', 'smileys'):
        matches = getattr(parsed, attr)
        self.assertIsNotNone(matches)
        self.assertEqual(1, len(matches))
    self.assertIsNone(parsed.reserved_words)
    self.assertEqual("😀", parsed.emojis[0].match)
    self.assertEqual(":)", parsed.smileys[0].match)
def test_clean(self):
    """clean() strips mention, hashtag, emoji and URL, leaving only plain text."""
    tweet = "Hello there! @pyistanbul #packathon was awesome 😀. http://packathon.org"
    p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION,
                  p.OPT.EMOJI, p.OPT.SMILEY)
    cleaned = p.clean(tweet)
    self.assertEqual(cleaned, "Hello there! was awesome .")