Esempio n. 1
0
def process_tweets(dftw):
    """Filter unwanted tweets out of ``dftw`` in place, then label the rest.

    A row is dropped when its cleaned text (``tc.p.clean`` of the ``tweet``
    column):

    * contains none of the relevant keywords, or
    * contains any blocked word, or
    * starts with ``@`` (treated as a reply), or
    * contains at least one cashtag according to ``tc.count_cashtags``, or
    * has no words left once mentions, hashtags and cashtags are removed.

    The index is then reset and, if any rows remain, the frame is handed to
    ``predict_label``.

    Args:
        dftw: DataFrame with a ``tweet`` column. It is modified in place.

    Returns:
        The result of ``predict_label(dftw)`` when rows remain, otherwise
        the (now empty) ``dftw`` itself.
    """
    print('\tvik : In process tweets')

    # Even though the API searches by keyword, some returned tweets do not
    # contain the keywords themselves (the match may be in the user name).
    kywrds = ['tsla', 'tesla', 'elon', 'musk']
    # NOTE(review): entries are matched as literal substrings, so the ones
    # containing '*' only match text that contains those asterisks.
    badwords = [
        'n***a', 'nigger', 'gay', 'pussy', 'pussc', 'c**t', 'f**k', 'dick',
        'c**k', 'suck', 'w***e', 'pimp', 'wtf', 'asshole', 'bitch'
    ]

    # Collect the labels to remove and drop them once after the loop, so the
    # frame is never modified while it is being iterated and each tweet is
    # cleaned only once.
    rejected = []
    for index, txt in dftw['tweet'].items():
        ctxt = tc.p.clean(txt)
        lowered = ctxt.lower()

        # An empty cleaned text contains no keyword, so the first test
        # short-circuits before ctxt[0] is evaluated.
        if (not any(wrd in lowered for wrd in kywrds)
                or any(wrd in lowered for wrd in badwords)
                or ctxt[0] == '@'):
            rejected.append(index)
            continue

        # NOTE(review): the original comment said "too many cashtags", but
        # the threshold is 0, so any cashtag rejects the tweet - confirm
        # this is intended.
        if tc.count_cashtags(ctxt) > 0:
            rejected.append(index)
            continue

        # Reject tweets with zero words once the tags are stripped.
        bare = tc.remove_mention(ctxt)
        bare = tc.remove_hashtag(bare)
        bare = tc.remove_cashtag(bare)
        if len(bare.split()) < 1:
            rejected.append(index)

    if rejected:
        dftw.drop(rejected, axis=0, inplace=True)

    nrows = dftw.shape[0]

    dftw.reset_index(inplace=True, drop=True)

    if nrows > 0:
        dftw = predict_label(dftw)

    return dftw
Esempio n. 2
0
    # print(df[df.index.duplicated(keep=False)])

    # Convert the sentiment column to pandas' categorical dtype.
    df['senti'] = df['senti'].astype('category')

    # --------Clean text-----------

    # Delete tweets with more than cvars.cash_thresh cashtags.
    cond10 = df['text'].apply(
        lambda x: tc.count_cashtags(x) > cvars.cash_thresh)
    df.drop(index=df[cond10].index, inplace=True)
    print('\n\tdf.shape = ', df.shape)

    # Build tidy_text from the raw text by applying the tc cleaning helpers
    # one after another; each step works on the previous step's output.
    df['tidy_text'] = df['text'].apply(lambda x: tc.clean_emoji_url(x))
    df['tidy_text'] = df['tidy_text'].apply(lambda x: tc.remove_hashtag(x))
    df['tidy_text'] = df['tidy_text'].apply(lambda x: tc.remove_cashtag(x))
    df['tidy_text'] = df['tidy_text'].apply(lambda x: tc.remove_mention(x))
    df['tidy_text'] = df['tidy_text'].apply(lambda x: tc.replace_chars(x))
    df['tidy_text'] = df['tidy_text'].apply(lambda x: tc.normalize_doc(x))

    # Drop rows with empty tidy_text. After cleaning it is possible that all
    # the tokens in tidy_text get deleted. For example, the following tweet
    # after cleaning contains zero tokens.
    # $CUBE $EXR $HOG $KO $LSI $PSA $IRM https://t.co/GFZTPvIifx

    # Mask of rows whose tidy_text has zero tokens per tc.count_toks.
    cond = df['tidy_text'].apply(lambda x: tc.count_toks(x) == 0)

    # Debug output: mask size and how many rows are flagged for removal.
    print('\n\tcond.shape = ', cond.shape)
    print('\n\tcond.value_counts = ', cond.value_counts())

    df.drop(index=df[cond].index, inplace=True)
Esempio n. 3
0
def sentiment_tweets():
    """Continuously score newly streamed tweets and append them to a CSV.

    Loads ``(count_vect, model)`` from ``data/bayes_fit.pkl``, waits until
    ``data_tweets/streaming_tweets_save.csv`` exists, then loops forever:

    1. read the rows of the streaming file that earlier passes have not
       consumed (``skiprows`` grows by the number of rows read each pass);
    2. drop tweets with more than ``cvars.cash_thresh`` cashtags;
    3. build ``tidy_tweet`` with the ``tc`` cleaning helpers and drop rows
       left with zero tokens;
    4. predict ``senti`` for each row and derive ``wt_senti`` with
       ``weighted_senti``;
    5. append the ``cvars.cols_display`` columns to
       ``data_tweets/senti_tweets.csv``;
    6. sleep for ``cvars.nsecs`` seconds.

    This function does not return.
    """
    print('\n\t\tEntered sentiment_tweets : ')

    # Deserialize the model.
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; this file must come from a trusted source.
    with open('data/bayes_fit.pkl', 'rb') as f:
        count_vect, model = pickle.load(f)

    stream_csv = 'data_tweets/streaming_tweets_save.csv'

    # On entry (very first time the app is launched), wait for the file.
    while not os.path.exists(stream_csv):
        print('\t\tsentiment_tweets() : Sleeping')
        time.sleep(20)

    # Cleaning steps, applied in this order to produce tidy_tweet.
    cleaners = (
        tc.clean_emoji_url,
        tc.remove_hashtag,
        tc.remove_cashtag,
        tc.remove_mention,
        tc.replace_chars,
        tc.normalize_doc,
    )

    skiprows = 0
    while True:
        # Read in only the latest tweets (within last nsecs) using skiprows.
        dftw = pd.read_csv(stream_csv, names=cvars.cols, skiprows=skiprows)

        # Remember how many rows were consumed before any filtering, so the
        # next pass skips exactly the rows already read.
        rows_read = dftw.shape[0]

        # Only build and apply the cashtag mask when there is something to
        # filter; an empty mask is not a usable boolean indexer.
        if rows_read:
            cond_cash = dftw['tweet'].apply(
                lambda x: tc.count_cashtags(x) > cvars.cash_thresh)
            dftw.drop(index=dftw[cond_cash].index, inplace=True)

        if not dftw.empty:
            tidy = dftw['tweet']
            for cleaner in cleaners:
                tidy = tidy.apply(cleaner)
            dftw['tidy_tweet'] = tidy

            # Cleaning can remove every token; such rows cannot be scored.
            cond = dftw['tidy_tweet'].apply(lambda x: tc.count_toks(x) == 0)
            dftw.drop(index=dftw[cond].index, inplace=True)

            if not dftw.empty:
                # iterrows is inefficient. However the number of rows being
                # processed is small.
                for indx, row in dftw.iterrows():
                    dftw.loc[indx, 'senti'] = model.predict(
                        count_vect.transform([row['tidy_tweet']]))
                dftw['wt_senti'] = dftw.apply(
                    lambda x: weighted_senti(
                        x['senti'],
                        x['retweet_count'] + x['favorite_count'],
                        x['verified'],
                        x['followers_count'] + x['friends_count']),
                    axis=1)
                dftw[cvars.cols_display].to_csv(
                    'data_tweets/senti_tweets.csv',
                    mode='a', header=False, index=False)

        skiprows += rows_read
        time.sleep(cvars.nsecs)
Esempio n. 4
0
 def test_cashtag1(self):
     # Cashtag in the middle of a sentence. The expected string shows that
     # the "$0" of the dollar amount "$0.31" is removed as well as "$CNP".
     tweet = 'Analysts See $0.31 EPS for CenterPoint Energy Inc. $CNP - https://t.co/NPuObzJS9E'
     expected = 'Analysts See .31 EPS for CenterPoint Energy Inc.  - https://t.co/NPuObzJS9E'
     self.assertEqual(tc.remove_cashtag(tweet), expected)
Esempio n. 5
0
 def test_cashtag4(self):
     # Leading cashtag plus dollar amounts. The expected string shows that
     # "$154" disappears entirely and "$10+" is reduced to "+".
     tweet = '$SEIC Advance Auto Parts $AAP PT Raised to $154 at RBC Capital; $10+ in EPS Power'
     expected = ' Advance Auto Parts  PT Raised to  at RBC Capital; + in EPS Power'
     self.assertEqual(tc.remove_cashtag(tweet), expected)
Esempio n. 6
0
 def test_cashtag3(self):
     # Text that starts with a dollar amount, followed later by a cashtag
     # just before the URL; the URL itself must be left intact.
     tweet = '$0.31 EPS Expected for CenterPoint Energy Inc. $CNP https://t.co/hcWeeUsBL4'
     expected = '.31 EPS Expected for CenterPoint Energy Inc.  https://t.co/hcWeeUsBL4'
     self.assertEqual(tc.remove_cashtag(tweet), expected)
Esempio n. 7
0
 def test_cashtag2(self):
     # A long run of trailing cashtags: every tag is removed while the
     # separating spaces (and the HTML entity and URL) are preserved.
     tweet = 'Dividend Champions With 20% Stock Price Potential -&gt; https://t.co/KBFzpewsL6 - $ORI $DOV $NUE $ABM $ITW $GD $NC $PH $SWK $PNR $SEIC'
     expected = 'Dividend Champions With 20% Stock Price Potential -&gt; https://t.co/KBFzpewsL6 -           '
     self.assertEqual(tc.remove_cashtag(tweet), expected)