def process_tweets(dftw):
    print('\tvik : In process tweets')

    # ---drop some tweets---
    kywrds = ['tsla', 'tesla', 'elon', 'musk']
    badwords = [
        'n***a', 'nigger', 'gay', 'pussy', 'pussc', 'c**t', 'f**k', 'dick',
        'c**k', 'suck', 'w***e', 'pimp', 'wtf', 'asshole', 'bitch'
    ]

    # Get rid of any tweets which do not contain relevant keywords. Even
    # though the API uses keywords to search, some tweets do not contain
    # these keywords (like tsla, tesla) since the user name might contain
    # them instead. Also get rid of tweets with offensive words and tweets
    # which are replies.
    for index, row in dftw.iterrows():
        txt = row['tweet']
        ctxt = tc.p.clean(txt)
        # startswith avoids an IndexError on empty strings (replies begin with '@').
        if (not any(wrd in ctxt.lower() for wrd in kywrds)) or (any(
                wrd in ctxt.lower()
                for wrd in badwords)) or ctxt.startswith('@'):
            dftw.drop(index, axis=0, inplace=True)

    # Get rid of tweets containing any cashtags.
    for index, row in dftw.iterrows():
        txt = row['tweet']
        ctxt = tc.p.clean(txt)
        if tc.count_cashtags(ctxt) > 0:
            dftw.drop(index, axis=0, inplace=True)

    # Get rid of tweets with zero words left after cleaning.
    for index, row in dftw.iterrows():
        txt = row['tweet']
        ctxt = tc.p.clean(txt)
        ctxt = tc.remove_mention(ctxt)
        ctxt = tc.remove_hashtag(ctxt)
        ctxt = tc.remove_cashtag(ctxt)
        if len(ctxt.split()) < 1:
            dftw.drop(index, axis=0, inplace=True)

    nrows = dftw.shape[0]
    dftw.reset_index(inplace=True, drop=True)
    if nrows > 0:
        dftw = predict_label(dftw)
    return dftw
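# The three iterrows passes above can also be expressed as one vectorized
# boolean-mask filter. A minimal sketch, assuming tc.p.clean,
# tc.count_cashtags, and the tc.remove_* helpers behave as they are used
# above; filter_tweets_vectorized is an illustrative alternative, not the
# code the app runs:
def filter_tweets_vectorized(dftw, kywrds, badwords):
    cleaned = dftw['tweet'].apply(tc.p.clean)
    lowered = cleaned.str.lower()
    # Keep tweets that mention a relevant keyword ...
    keep = lowered.apply(lambda t: any(w in t for w in kywrds))
    # ... and contain no offensive words, are not replies, have no cashtags.
    keep &= ~lowered.apply(lambda t: any(w in t for w in badwords))
    keep &= ~cleaned.str.startswith('@')
    keep &= cleaned.apply(tc.count_cashtags) == 0
    # Drop tweets that are empty once mentions/hashtags/cashtags are stripped.
    stripped = (cleaned.apply(tc.remove_mention)
                       .apply(tc.remove_hashtag)
                       .apply(tc.remove_cashtag))
    keep &= stripped.str.split().str.len() > 0
    return dftw[keep].reset_index(drop=True)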
# print(df[df.index.duplicated(keep=False)])
df['senti'] = df['senti'].astype('category')

# --------Clean text-----------
# Delete tweets with more than cvars.cash_thresh cashtags.
cond10 = df['text'].apply(
    lambda x: tc.count_cashtags(x) > cvars.cash_thresh)
df.drop(index=df[cond10].index, inplace=True)
print('\n\tdf.shape = ', df.shape)

df['tidy_text'] = df['text'].apply(tc.clean_emoji_url)
df['tidy_text'] = df['tidy_text'].apply(tc.remove_hashtag)
df['tidy_text'] = df['tidy_text'].apply(tc.remove_cashtag)
df['tidy_text'] = df['tidy_text'].apply(tc.remove_mention)
df['tidy_text'] = df['tidy_text'].apply(tc.replace_chars)
df['tidy_text'] = df['tidy_text'].apply(tc.normalize_doc)

# Drop rows with empty tidy_text. After cleaning, it is possible that all
# the tokens in tidy_text get deleted. For example, the following tweet
# contains zero tokens after cleaning:
# $CUBE $EXR $HOG $KO $LSI $PSA $IRM https://t.co/GFZTPvIifx
cond = df['tidy_text'].apply(lambda x: tc.count_toks(x) == 0)
print('\n\tcond.shape = ', cond.shape)
print('\n\tcond.value_counts = ', cond.value_counts())
df.drop(index=df[cond].index, inplace=True)
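# The six .apply passes above each scan the whole column. They can be fused
# into a single pass by composing the same tc cleaners; tidy_one below is an
# illustrative helper, not part of the app:
from functools import reduce

def tidy_one(text):
    steps = (tc.clean_emoji_url, tc.remove_hashtag, tc.remove_cashtag,
             tc.remove_mention, tc.replace_chars, tc.normalize_doc)
    return reduce(lambda acc, fn: fn(acc), steps, text)

# df['tidy_text'] = df['text'].apply(tidy_one)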
def sentiment_tweets():
    print('\n\t\tEntered sentiment_tweets : ')

    # Deserialize the model.
    with open('data/bayes_fit.pkl', 'rb') as f:
        count_vect, model = pickle.load(f)

    # On entry (the very first time the app is launched), wait until the
    # streaming file exists.
    while True:
        if os.path.exists('data_tweets/streaming_tweets_save.csv'):
            break
        else:
            print('\t\tsentiment_tweets() : Sleeping')
            time.sleep(20)

    dftw = pd.DataFrame(columns=cvars.cols_senti)
    skiprows = 0
    while True:
        # Empty out the dataframe, to be sure.
        dftw.drop(dftw.index, inplace=True)
        # Read in only the latest tweets (within the last nsecs) using skiprows.
        dftw = pd.read_csv('data_tweets/streaming_tweets_save.csv',
                           names=cvars.cols,
                           skiprows=skiprows)
        # Record the row count before any rows are dropped, so skiprows
        # advances past every row read this pass.
        shp = dftw.shape

        cond_cash = dftw['tweet'].apply(
            lambda x: tc.count_cashtags(x) > cvars.cash_thresh)
        dftw.drop(index=dftw[cond_cash].index, inplace=True)
        # print('\n\t^^ skiprows = ', skiprows)
        # print('\n^^ shape = ', dftw.shape)

        if not dftw.empty:
            dftw['tidy_tweet'] = dftw['tweet'].apply(tc.clean_emoji_url)
            dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(tc.remove_hashtag)
            dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(tc.remove_cashtag)
            dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(tc.remove_mention)
            dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(tc.replace_chars)
            dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(tc.normalize_doc)

            cond = dftw['tidy_tweet'].apply(lambda x: tc.count_toks(x) == 0)
            dftw.drop(index=dftw[cond].index, inplace=True)

        if not dftw.empty:
            # iterrows is inefficient; however, the number of rows being
            # processed here is small.
            for indx, row in dftw.iterrows():
                # predict returns an array; take its single element.
                dftw.loc[indx, 'senti'] = model.predict(
                    count_vect.transform([row['tidy_tweet']]))[0]

            dftw['wt_senti'] = dftw.apply(
                lambda x: weighted_senti(
                    x['senti'], x['retweet_count'] + x['favorite_count'],
                    x['verified'],
                    x['followers_count'] + x['friends_count']),
                axis=1)

            dftw[cvars.cols_display].to_csv('data_tweets/senti_tweets.csv',
                                            mode='a',
                                            header=False,
                                            index=False)

        skiprows += shp[0]
        time.sleep(cvars.nsecs)
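# The per-row predict inside sentiment_tweets can also be batched into one
# call: sklearn vectorizers transform an iterable of documents into a single
# sparse matrix, and classifiers predict on the whole matrix at once. A
# minimal sketch with the same count_vect and model as above; predict_batch
# is an illustrative helper, not part of the app:
def predict_batch(dftw, count_vect, model):
    X = count_vect.transform(dftw['tidy_tweet'])  # sparse doc-term matrix
    dftw['senti'] = model.predict(X)              # one label per row
    return dftw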
def test_cashtag1(self):
    checkstr = tc.remove_cashtag(
        'Analysts See $0.31 EPS for CenterPoint Energy Inc. $CNP - https://t.co/NPuObzJS9E'
    )
    outstr = 'Analysts See .31 EPS for CenterPoint Energy Inc. - https://t.co/NPuObzJS9E'
    self.assertEqual(checkstr, outstr)

def test_cashtag2(self):
    checkstr = tc.remove_cashtag(
        'Dividend Champions With 20% Stock Price Potential -> https://t.co/KBFzpewsL6 - $ORI $DOV $NUE $ABM $ITW $GD $NC $PH $SWK $PNR $SEIC'
    )
    outstr = 'Dividend Champions With 20% Stock Price Potential -> https://t.co/KBFzpewsL6 - '
    self.assertEqual(checkstr, outstr)

def test_cashtag3(self):
    checkstr = tc.remove_cashtag(
        '$0.31 EPS Expected for CenterPoint Energy Inc. $CNP https://t.co/hcWeeUsBL4'
    )
    outstr = '.31 EPS Expected for CenterPoint Energy Inc. https://t.co/hcWeeUsBL4'
    self.assertEqual(checkstr, outstr)

def test_cashtag4(self):
    checkstr = tc.remove_cashtag(
        '$SEIC Advance Auto Parts $AAP PT Raised to $154 at RBC Capital; $10+ in EPS Power'
    )
    outstr = ' Advance Auto Parts PT Raised to at RBC Capital; + in EPS Power'
    self.assertEqual(checkstr, outstr)
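# The four tests above pin down remove_cashtag's observable behavior: a '$'
# and the word characters after it are deleted ('$CNP', the '$0' in '$0.31',
# the '$10' in '$10+'), and any doubled spaces left behind are collapsed to
# one, without stripping leading or trailing spaces. A minimal implementation
# consistent with all four cases -- a sketch, not necessarily the one in tc:
import re

def remove_cashtag_sketch(text):
    text = re.sub(r'\$\w+', '', text)   # delete the cashtag itself
    return re.sub(r' {2,}', ' ', text)  # collapse the space runs it leaves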