Example #1
import pandas as pd
import pysentiment2 as ps  # LM / HIV4 dictionaries (the older `pysentiment` package exposes the same API)


def calculate_polarity_subjectivity(df):
    """Build a feature DataFrame with LM and Harvard IV-4 polarity/subjectivity
    scores for every document in df['filteredtext']."""
    lm = ps.LM()
    hiv4 = ps.HIV4()

    polarity_array = []
    subjectivity_array = []
    hiv_polarity = []
    hiv_subjectivity = []
    count = 0  # documents where the LM and HIV4 polarities disagree in sign

    for x in range(len(df['filteredtext'])):
        # Loughran-McDonald dictionary scores
        tokens_m = lm.tokenize(df['filteredtext'][x])
        score_m = lm.get_score(tokens_m)
        polarity_array.append(score_m['Polarity'])
        subjectivity_array.append(score_m['Subjectivity'])

        # Harvard IV-4 dictionary scores
        tokens_hiv = hiv4.tokenize(df['filteredtext'][x])
        score_hiv = hiv4.get_score(tokens_hiv)
        hiv_polarity.append(score_hiv['Polarity'])
        hiv_subjectivity.append(score_hiv['Subjectivity'])

        if score_m['Polarity'] * score_hiv['Polarity'] < 0:
            count += 1

    feature_df = pd.DataFrame()
    feature_df['Mcdonald_Polarity'] = polarity_array
    feature_df['Mcdonald_Subjectivity'] = subjectivity_array
    feature_df['HIV_Polarity'] = hiv_polarity
    feature_df['HIV_Subjectivity'] = hiv_subjectivity

    # get_glove_embeddings is defined elsewhere in the project
    feature_df['GloVe_embedding'] = get_glove_embeddings(df)

    return feature_df
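For illustration, a minimal usage sketch for the function above. The sample texts and the get_glove_embeddings stub are assumptions added only so the call runs; it relies on the pandas and pysentiment2 imports already shown.

# Hypothetical stand-in for the project's GloVe helper, used only so the example runs.
def get_glove_embeddings(df):
    return [None] * len(df)

sample = pd.DataFrame({'filteredtext': [
    'revenue grew strongly and the outlook remains positive',
    'the company reported heavy losses and faces litigation risk',
]})
features = calculate_polarity_subjectivity(sample)
print(features[['Mcdonald_Polarity', 'HIV_Polarity']])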
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer  # or from nltk.sentiment.vader


def generate_sentiment(df):
    """Score each headline with four methods: VADER, the Harvard IV-4 dictionary,
    the Loughran-McDonald dictionary, and TextBlob (polarity and subjectivity)."""
    lm = ps.LM()
    hiv4 = ps.HIV4()
    vader = SentimentIntensityAnalyzer()

    # VADER compound score
    df['VADER_compound'] = df['headline'].apply(lambda x: vader.polarity_scores(x)['compound'])

    # Harvard IV-4 dictionary - HIV4 polarity
    df['HIV4_score'] = df['headline'].apply(lambda x: hiv4.get_score(hiv4.tokenize(x))['Polarity'])

    # Loughran-McDonald financial sentiment dictionary - LM polarity
    df['LM_score'] = df['headline'].apply(lambda x: lm.get_score(lm.tokenize(x))['Polarity'])

    # TextBlob polarity - handles negation
    df['TextBlob_polarity'] = df['headline'].apply(lambda x: TextBlob(x).sentiment[0])

    # TextBlob subjectivity
    df['TextBlob_subjectivity'] = df['headline'].apply(lambda x: TextBlob(x).sentiment[1])

    # Drop duplicate headlines and columns that are no longer needed
    df = df.drop_duplicates(subset='headline')
    del df['url']
    del df['type']

    return df
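Similarly, a hedged usage sketch for generate_sentiment, assuming a tiny headline DataFrame that carries the 'url' and 'type' columns the function deletes; the data below is invented for the example.

headlines = pd.DataFrame({
    'headline': ['Shares surge after record quarterly earnings',
                 'Regulator fines bank over reporting failures'],
    'url': ['https://example.com/a', 'https://example.com/b'],
    'type': ['news', 'news'],
})
scored = generate_sentiment(headlines)
print(scored[['headline', 'VADER_compound', 'LM_score', 'TextBlob_polarity']])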
Example #3
# https://journeys.autopilotapp.com/blog/email-spam-trigger-words/
spam = [
    'subscription', '.com', 'subscribe', 'urgent', 'instant', '100%', 'bonus',
    'free', 'bargain', 'prize', 'deal', 'unlimited', 'access', 'boss',
    'cancel', 'cheap', 'certified', 'compare', 'congratulations',
    'cures', 'friend', 'guarantee', 'guaranteed', 'hello', 'offer',
    'opportunity', 'winner', 'winning', 'won', 'amazing', 'billion', 'cash',
    'earn', 'extra', 'home', 'lose', 'income', 'vacation', 'addresses',
    'beneficiary', 'billing', 'casino', 'celebrity', 'hidden', 'investment',
    'junk', 'legal', 'loan', 'lottery', 'medicine', 'miracle', 'money',
    'nigerian', 'offshore', 'passwords', 'refinance', 'request', 'rolex',
    'score', 'spam', 'unsolicited', 'valium', 'viagra', 'vicodin', 'warranty',
    'xanax'
]
# Sentiment: LM score (Polarity, Subjectivity, Positive, Negative) for each email body
lm = ps.LM()
list_scores = []
tokens = [lm.tokenize(email) for email in body]  # `body` holds the raw email texts
for i in range(0, len(df)):
    score = lm.get_score(tokens[i])
    list_scores.append(score)
# Count-based features per email, filled in the loop below
person1_list1 = []
exclusive_list1 = []
negative_list1 = []
action_list1 = []
length_list1 = []
work_list = []
spam_list = []
for i in range(0, len(df)):
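    # NOTE: the loop body is not shown in this excerpt. As an illustration only
    # (an assumption, not the source project's code), the two simplest counts
    # could be filled in like this, with `body[i]` as the raw text of email i:
    words = body[i].lower().split()
    length_list1.append(len(words))                       # message length in tokens
    spam_list.append(sum(1 for w in words if w in spam))  # spam trigger-word hits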
Example #4
# Constructor of a class that keeps one Loughran-McDonald dictionary instance
def __init__(self):
    self.lm = ps.LM()
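This fragment is only the constructor of a larger class. A minimal sketch of what such a wrapper might look like; the class name and the score method are hypothetical, only the ps.LM() line comes from the example.

import pysentiment2 as ps

class LMSentiment:  # hypothetical name
    def __init__(self):
        self.lm = ps.LM()

    def score(self, text):
        # LM dictionary scores: Positive, Negative, Polarity, Subjectivity
        return self.lm.get_score(self.lm.tokenize(text))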