def calculate_polarity_subjectivity(df):
    """Build a sentiment-feature DataFrame from ``df['filteredtext']``.

    Each document is scored for polarity and subjectivity with both the
    Loughran-McDonald (LM) and Harvard IV-4 (HIV4) dictionaries from
    pysentiment, and GloVe embeddings are attached via
    ``get_glove_embeddings``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'filteredtext' column of pre-cleaned text strings.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Mcdonald_Polarity', 'Mcdonald_Subjectivity',
        'HIV_Polarity', 'HIV_Subjectivity', 'GloVe_embedding'.
    """
    lm = ps.LM()
    hiv4 = ps.HIV4()
    lm_polarity, lm_subjectivity = [], []
    hiv_polarity, hiv_subjectivity = [], []
    # Iterate the Series directly: the original used
    # df['filteredtext'][x] for x in range(len(...)), which is a LABEL
    # lookup and breaks whenever the DataFrame has a non-default index
    # (e.g. after a filter/drop without reset_index).
    for text in df['filteredtext']:
        score_m = lm.get_score(lm.tokenize(text))
        lm_polarity.append(score_m['Polarity'])
        lm_subjectivity.append(score_m['Subjectivity'])
        score_hiv = hiv4.get_score(hiv4.tokenize(text))
        hiv_polarity.append(score_hiv['Polarity'])
        hiv_subjectivity.append(score_hiv['Subjectivity'])
    # NOTE: dropped the dead 'count'/'count1' locals — the original
    # tallied LM/HIV4 polarity disagreements but never used or returned
    # the tally.
    feature_df = pd.DataFrame({
        'Mcdonald_Polarity': lm_polarity,
        'Mcdonald_Subjectivity': lm_subjectivity,
        'HIV_Polarity': hiv_polarity,
        'HIV_Subjectivity': hiv_subjectivity,
    })
    feature_df['GloVe_embedding'] = get_glove_embeddings(df)
    return feature_df
def generate_sentiment(df):
    """Score each unique headline with four sentiment measures.

    Adds VADER compound, Harvard IV-4 polarity, Loughran-McDonald
    polarity, and TextBlob polarity/subjectivity columns, then removes
    the 'url' and 'type' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'headline', 'url' and 'type' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not mutated) deduplicated on
        'headline', with the five sentiment columns added and
        'url'/'type' dropped.
    """
    lm = ps.LM()
    hiv4 = ps.HIV4()
    vader = SentimentIntensityAnalyzer()

    # Deduplicate BEFORE scoring: the original ran all five scoring
    # passes over every row and only then dropped duplicate headlines,
    # wasting work on repeats. .copy() also keeps the column
    # assignments off the caller's DataFrame (the original mutated the
    # input but returned a deduped copy — an inconsistent side effect).
    df = df.drop_duplicates(subset='headline').copy()

    # VADER compound score
    df['VADER_compound'] = df['headline'].apply(
        lambda x: vader.polarity_scores(x)['compound'])
    # Harvard IV-4 dictionary - HIV4 score
    df['HIV4_score'] = df['headline'].apply(
        lambda x: hiv4.get_score(hiv4.tokenize(x))['Polarity'])
    # Loughran and McDonald Financial Sentiment Dictionary - LM score
    df['LM_score'] = df['headline'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Polarity'])
    # TextBlob polarity - identifies negation
    df['TextBlob_polarity'] = df['headline'].apply(
        lambda x: TextBlob(x).sentiment[0])
    # TextBlob subjectivity
    df['TextBlob_subjectivity'] = df['headline'].apply(
        lambda x: TextBlob(x).sentiment[1])

    del df['url']
    del df['type']
    return df
# https://journeys.autopilotapp.com/blog/email-spam-trigger-words/ spam = [ 'subscription', '.com', 'subsribe', 'urgent', 'instant', '100%', 'bonus', 'free', 'bargain', 'prize', 'deal', 'unlimited', 'access', 'boss', 'cancel', 'cheap', 'certified', 'cheap', 'compare', 'congratulations', 'cures', 'friend', 'guarantee', 'guaranteed', 'hello', 'offer', 'opportunity', 'winner', 'winning', 'won', 'amazing', 'billion', 'cash', 'earn', 'extra', 'home', 'lose', 'income', 'vacation', 'addresses', 'beneficiary', 'billiing', 'casino', 'celebrity', 'hidden', 'investment', 'junk', 'legal', 'loan', 'lottery', 'medicine', 'miracle', 'money', 'nigerian', 'offshore', 'passwords', 'refinance', 'request', 'rolex', 'score', 'spam', 'unsolicited', 'valium', 'viagra', 'vivodin', 'warranty', 'xanax' ] # Sentiment lm = ps.LM() list_scores = [] tokens = [lm.tokenize(email) for email in body] for i in range(0, len(df)): #tokens = lm.tokenize(body[i]) score = lm.get_score(tokens[i]) list_scores.append(score) # Count person1_list1 = [] exclusive_list1 = [] negative_list1 = [] action_list1 = [] length_list1 = [] work_list = [] spam_list = [] for i in range(0, len(df)):
def __init__(self):
    """Initialize the instance with a Loughran-McDonald sentiment model.

    The ``ps.LM()`` dictionary is constructed once here and reused by
    the instance's scoring methods via ``self.lm``.
    """
    self.lm = ps.LM()