def vader_analysis(self, conn_, c, date_, delta_day, index):
    """Method that performs sentiment analysis on news headlines.

    For a given day, the analysis runs from 9:30 am until 9:30 am (EST) the
    next day. For example, February 25th 2021 in the table would be the
    analysis of news from 9:30 am that morning until 9:29 am February 26th 2021.
    """
    isValid = self.check_size(conn_, c, date_, delta_day, index)

    # If the sample size is >= `self.min_sample`, perform sentiment analysis using VADER
    if isValid:
        list_results = []
        c.execute(
            f"SELECT {self.news_header[2]} from {self.ticker} where {self.news_header[1]} >= "
            f"{self.start_debut_tempo} and {self.news_header[1]} < {self.end_date_tempo}"
        )
        rows = c.fetchall()
        for row in rows:
            list_results.append(sia().polarity_scores(row[0])['compound'])

        # To make the join easier, the sentiment score from the previous day
        # sits on the same row as the current day's return (index + 1).
        self.pd_data.loc[index + 1, self.sentiment_name] = mean(list_results)

    return self.pd_data
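# The 9:30 am -> 9:30 am window described in the docstring can be sketched
# directly; a minimal, hypothetical helper (the real bounds come from
# self.start_debut_tempo / self.end_date_tempo, which are set elsewhere in the class):
import pandas as pd

def trading_day_window(date_):
    """Hypothetical sketch: bounds of one 9:30 am EST news day."""
    start = pd.Timestamp(date_).replace(hour=9, minute=30)
    end = start + pd.Timedelta(days=1)  # up to, but excluding, 9:30 am the next day
    return start, end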
def on_status(self, status):
    """Called when the client receives a tweet.

    :param status: the tweet status
    """
    try:
        if status._json["user"]["name"] not in company_username:
            # Prefer the full text of extended (>140-character) tweets
            if "extended_tweet" in status._json and "full_text" in status._json["extended_tweet"]:
                status._json["text"] = status._json["extended_tweet"]["full_text"]
            text = status._json["text"]

            ps = sia().polarity_scores(text)
            score = ps["compound"]
            logging.info(f"tweet : {score}")
            status._json['score'] = score

            # Find the company the tweet is about; "user_mentions" is a list
            # of dicts, so check each mentioned screen name individually
            status._json["company"] = "Unknown"
            for mention in status._json["entities"]["user_mentions"]:
                if mention["screen_name"] in company_username:
                    status._json["company"] = dict_username[mention["screen_name"]]
                    break
            if status._json["company"] == "Unknown":
                for company in TARGET:
                    if company in text:
                        status._json["company"] = company

            self.producer.send(
                self.topic,
                value=status._json,
            )
    except StopIteration:
        self.producer.close()
        running = False
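# `self.producer` is not created in this snippet. Given producer.send(topic,
# value=<dict>), a kafka-python producer with a JSON value serializer fits;
# a minimal sketch, broker address assumed:
from kafka import KafkaProducer
import json

producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)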
def calculate_sentiment():
    # Accumulators for the company-wide and product-specific scores
    all_sentiment, positive_tweets, negative_tweets, product_sentiment = [], [], [], []

    with open("jpmorgan.json") as json_file:
        for line in json_file:
            tweet = json.loads(line)
            if tweet['lang'] == 'en':
                tweet = tweet['full_text'].lower()
                current_sentiment = sia().polarity_scores(tweet)
                all_sentiment.append(current_sentiment['compound'])
                if current_sentiment['compound'] < 0:
                    negative_tweets.append(tweet)
                elif current_sentiment['compound'] > 0:
                    positive_tweets.append(tweet)
                if re.search('jpmcoin', tweet, flags=re.IGNORECASE):
                    product_sentiment.append(current_sentiment['compound'])

    company_sentiment_mean = statistics.mean(all_sentiment)
    product_sentiment_mean = statistics.mean(product_sentiment)
    print("\nJP Morgan's mean sentiment score: ", company_sentiment_mean)
    print("JPM Coin's mean sentiment score: ", product_sentiment_mean)
    if product_sentiment_mean > company_sentiment_mean:
        print(
            "As you can see, the product's mean sentiment score is higher than the company's score.\n"
        )
    else:
        print(
            "As you can see, the company's mean sentiment score is higher than the product's score.\n"
        )
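# jpmorgan.json is read as JSON Lines: one tweet object per line, with at
# least 'lang' and 'full_text' fields. A made-up example record:
# {"lang": "en", "full_text": "JPM Coin settlement pilot looks promising"}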
def on_status(self, status):
    try:
        if status.coordinates is not None and status.place.country_code == 'US' and not status.retweeted:
            id_str = status.id_str          # ID given to this specific tweet
            name = status.user.screen_name  # Name of the tweeter
            created = status.created_at     # When the tweet was sent
            text = status.text              # The tweet text
            coords = status.coordinates     # Coordinates the tweet was sent from
            # Dump the coordinates from JSON to a string so they can be stored in SQL
            coords = json.dumps(coords)

            # Sentiment analysis: the compound score is kept as "polarity"
            sid = sia()
            polarity = sid.polarity_scores(text)["compound"]

            if polarity != 0.0:
                # Store the tweet in the SQLite "tweets" table
                table = db["tweets"]
                table.insert(
                    dict(
                        tweet_id=id_str,
                        user=name,
                        tweet_datetime=created,
                        text=text,
                        sentiment_score=polarity,
                        coords=coords,
                    ))
                print("Tweet Added", text)
    except TypeError:
        print('e')
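# `db` above is not defined in this snippet; given db["tweets"].insert(...),
# it is presumably a `dataset` connection. A minimal sketch, file path assumed:
import dataset

db = dataset.connect("sqlite:///tweets.db")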
def sentiment_processing(DataSet, senti_shape):
    # Split every plot into sentences on sentence-ending punctuation
    split_article_content = []
    for element in DataSet['plot']:
        split_article_content.append(re.split(r"(?<=[.!?])\s+", element))

    sid = sia()
    senti_list = []
    for sentences in split_article_content:
        sentiment_com, sentiment_pos, sentiment_neg, sentiment_neu = [], [], [], []
        script = []
        for sentence in sentences:
            ss = sid.polarity_scores(sentence)
            sentiment_com.append(ss['compound'])
            sentiment_pos.append(ss['pos'])
            sentiment_neg.append(ss['neg'])
            sentiment_neu.append(ss['neu'])
            script.append(sentence)
        percentile_list = pd.DataFrame({
            'sentiment_sc': sentiment_com,
            'sentiment_pos': sentiment_pos,
            'sentiment_neg': sentiment_neg,
            'sentiment_neu': sentiment_neu,
            'script': script
        })
        senti_list.append(percentile_list)

    # Collect the compound scores of each article
    sentiment_sc__ = []
    for frame in senti_list:
        sentiment_sc__.append(list(frame["sentiment_sc"]))

    def pad(l, content, width):
        # Left-pad the list with `content` until it reaches `width`
        zero_ = [content] * (width - len(l))
        zero_.extend(l)
        return zero_

    padding_ = []
    for scores in sentiment_sc__:
        padding_.append(pad(scores, 0, senti_shape))

    # Sanity check: every padded row must have exactly `senti_shape` entries
    for row in padding_:
        if len(row) != senti_shape:
            print(len(row))

    second_x = np.array(padding_)
    sentiment = second_x.reshape(len(padding_), senti_shape)
    Sentiment_DataSet = {'sentiment': sentiment}
    return Sentiment_DataSet
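# Quick shape check; an illustrative sketch with made-up data, assuming the
# imports used above (re, pandas as pd, numpy as np, VADER as sia) are in scope:
demo = sentiment_processing({'plot': ["Great start. Sad end!"]}, 4)
print(demo['sentiment'].shape)  # (1, 4): one article, left-padded with zeros to 4 scores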
def sentiment(in_doc):
    """Compute the VADER compound sentiment score for the input text.

    in_doc -- cleaned text
    """
    sent_ana = sia()
    ps = sent_ana.polarity_scores(in_doc)['compound']
    return ps
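# Minimal usage sketch; assumes nltk.download('vader_lexicon') has been run
# and `sia` is nltk.sentiment.vader.SentimentIntensityAnalyzer:
print(sentiment("The results were surprisingly good"))  # positive compound score
print(sentiment("The service was terrible"))            # negative compound score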
def __init__(self):
    self.dv = Doc2Vec.load("./models/doc2vec_model")
    self.tf = pickle.load(open("models/tfidf_model.pkl", "rb"))
    self.svd = pickle.load(open("models/svd_model.pkl", "rb"))
    self.svd_feature_matrix = pickle.load(
        open("models/lsa_embeddings.pkl", "rb"))
    self.doctovec_feature_matrix = pickle.load(
        open("models/doctovec_embeddings.pkl", "rb"))
    self.df = pd.read_pickle("perfume_data.pkl")
    self.hal = sia()
def featurizer(texts, company_name='apple'):
    '''
    :param texts: news texts in the form of a list of strings
    :param company_name: company name as a string
    :return: featurized dictionary
    '''
    text = []
    for news in texts:
        text += word_tokenize(news)
    company_name = company_name.lower()

    increase = {
        'increase', 'up', 'rise', 'jump', 'rose', 'high', 'beating',
        'positive', 'gained', 'climbed', 'jumped', 'surged', 'rising',
        'increased', 'soared', 'surging', 'skyrocketed', 'climb', 'climbing',
        'gains', 'surge', 'grew', 'jumping'
    }
    decrease = {
        'decrease', 'down', 'fall', 'plunge', 'low', 'negative', 'fell',
        'lost', 'dropped', 'declined', 'tumbled', 'slipped', 'slumped',
        'dipped', 'plunged', 'falling', 'slid', 'plummeted', 'sank',
        'decline', 'dropping', 'tumbling'
    }

    # Features: increase/decrease words mentioned in specific sentences or the entire text
    feature = {
        'mentioned vs not mentioned': 0,
        'company mentioned polarity scores': 0,
        'total polarity scores': 0,
        'word related to increase': 0,
        'word related to decrease': 0,
        'number of news': 0
    }

    sid = sia()
    mentioned_sentences = keyword_mentioned_sentence(text, company_name)
    # Polarity score list for the sentences that mention the company
    mentioned_sentence_scores = [
        float(sid.polarity_scores(sent)['compound'])
        for sent in mentioned_sentences
    ]

    txt = [word.lower() for word in text]
    fd = nltk.FreqDist(txt)
    company_mentioned = fd[company_name]
    company_not_mentioned = len(sent_tokenize(
        ' '.join(txt))) - company_mentioned
    increase_related_freq = [fd[word] / fd.N() for word in increase]
    decrease_related_freq = [fd[word] / fd.N() for word in decrease]

    # Placeholder: the ratio company_mentioned / company_not_mentioned is disabled
    feature['mentioned vs not mentioned'] = 1
    feature['company mentioned polarity scores'] = sum(mentioned_sentence_scores) if len(mentioned_sentences) == 0 \
        else sum(mentioned_sentence_scores) / len(mentioned_sentences)
    feature['total polarity scores'] = float(
        sid.polarity_scores(' '.join(text))['compound'])
    feature['word related to increase'] = sum(increase_related_freq)
    feature['word related to decrease'] = sum(decrease_related_freq)
    return feature
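# `keyword_mentioned_sentence` is called above but not defined in this snippet.
# A plausible sketch, assuming it rebuilds sentences from the token list and
# keeps those that mention the company (hypothetical reconstruction):
from nltk.tokenize import sent_tokenize

def keyword_mentioned_sentence(tokens, keyword):
    """Hypothetical helper: return the sentences containing `keyword`."""
    sentences = sent_tokenize(' '.join(tokens))
    return [sent for sent in sentences if keyword in sent.lower()]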
def SA(text):
    score = sia().polarity_scores(text)
    print(score)
    neg = score['neg']
    pos = score['pos']
    if neg > pos:
        print("Negative sentiment")
    elif pos > neg:
        print("Positive sentiment")
    else:
        print("Neutral vibe")
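# Example call; a sketch assuming the VADER lexicon is available locally:
# SA("I really enjoyed this movie")
# prints the full score dict ({'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...})
# and then "Positive sentiment".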
def adj_intensifier(tagged_text, adj_candid_index):
    intensifier_list = []
    pairlist = []
    sid = sia()
    # `sign` is assumed to be numpy.sign (or an equivalent helper defined elsewhere)
    for i in adj_candid_index:
        intensifier = tagged_text[i][0].strip('[\'\"]')
        word = tagged_text[i + 1][0].strip('[\'\"]')
        # The adjective precedes the word it modifies, so score the pair in that order
        pair = intensifier + ' ' + word
        word_intensity = sid.polarity_scores(word)['compound']
        pair_intensity = sid.polarity_scores(pair)['compound']
        # If the condition below holds, the adjective in the current loop
        # iteration acts as an intensifier.
        if abs(word_intensity) > 0 and abs(word_intensity) < abs(pair_intensity) and \
                sign(word_intensity) == sign(pair_intensity):
            if tagged_text[i + 1][1] == 'NOUN':
                intensifier_list.append(intensifier.lower())
                pairlist.append((intensifier.lower(), word.lower()))
    return intensifier_list, pairlist
def adv_intensifier(tagged_text, adv_candid_index):
    intensifier_list = []
    pairlist = []
    sid = sia()
    for i in adv_candid_index:
        intensifier = tagged_text[i][0].strip('[\'\"]')
        word_two = tagged_text[i + 1][0].strip('[\'\"]')
        pair_two = intensifier + ' ' + word_two
        word_intensity = sid.polarity_scores(word_two)['compound']
        pair_intensity = sid.polarity_scores(pair_two)['compound']
        # If the condition below holds, the adverb in the current loop
        # iteration acts as an intensifier.
        if abs(word_intensity) > 0 and abs(word_intensity) < abs(pair_intensity) and \
                sign(word_intensity) == sign(pair_intensity):
            if tagged_text[i + 1][1] == 'VERB' or tagged_text[i + 1][1] == 'ADJ':
                intensifier_list.append(intensifier.lower())
                pairlist.append((intensifier.lower(), word_two.lower()))
    return intensifier_list, pairlist
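# The criterion used by both functions above can be checked directly with
# VADER; an illustrative sketch (made-up phrases):
_sid = sia()
base = _sid.polarity_scores('good')['compound']              # bare-word intensity
boosted = _sid.polarity_scores('extremely good')['compound']
# 'extremely' behaves as an intensifier: same sign, larger magnitude
print(abs(base) > 0 and abs(base) < abs(boosted))  # True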
import re
from glob import glob
from string import punctuation as p

import nltk
import numpy
from nltk.probability import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia


def clean(words):
    """Clean out the metadata, tags, and headers."""
    # Regex to find text within <p> tags only; header text is not taken
    clean_re = r'<p>(.*)'
    clean_text = re.findall(clean_re, words)
    # findall returns a list of strings, so join them back together
    join_clean = ' '.join(clean_text)
    return join_clean


hal = sia()

with open('Mini-CORE/1+IN+EN+IN-IN-IN-IN+EN-EN-EN-EN+WIKI+9992596.txt', 'r') as my_file:
    text = my_file.read().lower()

clean_text = clean(text)
sentences = nltk.sent_tokenize(clean_text)
tokens = nltk.word_tokenize(clean_text)
tokens_fd = FreqDist(tokens)
# print(tokens[0:5])
# print(tokens_fd)
# print(sentences[2])

# Average word length across all tokens
word_len = []
for word in tokens:
    word_len.append(len(word))
average_length = numpy.mean(word_len)
# (sourceFileName, destinationFileName, polarPecent and the zeroed counters
#  count, countPos, countNeg, countNeu are assumed to be defined earlier)
with open(sourceFileName, encoding='utf8') as csvFile, \
        open(destinationFileName, 'w', encoding='utf8') as reviewFile:
    line = csvFile.readline()  # skip the header row
    line = csvFile.readline().replace('\n', '')
    header = 'tweets \n'
    reviewFile.write(header)
    while line != '':
        try:
            lineList = line.split(',')
            tweets = lineList[10]  # the tweet text lives in column 10
            outputline = tweets + '\n'
            reviewFile.write(outputline)
            line = csvFile.readline().replace('\n', '')
            comment = tweets
            sid = sia()
            sentimentScores = sid.polarity_scores(comment)
            count += 1
        except SyntaxError:
            print('No review found')
        if sentimentScores['compound'] > 0:
            countPos += 1
        elif sentimentScores['compound'] < 0:
            countNeg += 1
        elif sentimentScores['compound'] == 0:
            countNeu += 1

print('count =', count)
print('Positive = ', countPos)
print('Negative = ', countNeg)
print('Neutral = ', countNeu)
posPer = polarPecent(countPos, count)
# (this block runs inside a loop over the topic indices i, begun earlier)
label.append('Topic {}'.format(i))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=cc, cmap=cm, marker='o', s=100)
h1, = plt.plot(1, 1, color=colors[i], linewidth=3)
h.append(h1)

plt.legend(h, label, loc="upper left")
plt.show()

model = models.LdaModel(corpus, id2word=dictionary, num_topics=4)
model.print_topics(4)

### ACCUMULATE FEELINGS
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia

sentim = sia()

# Score every document once and keep the full score dicts
cc0 = []
for sentence in documents:
    cc0.append(sentim.polarity_scores(sentence))

# Print each document's scores; note that `neg` collects every score value
# and `neu` every key name, not just the negative/neutral components
neu = []
neg = []
for sentence in documents:
    ss = sentim.polarity_scores(sentence)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
        neg.append(ss[k])
        neu.append(k)
    print()
print('\n')
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 6 10:02:20 2018

@author: Ben
"""
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia

analyzer = sia()
files = glob('C:\\Users\\Ben\\Desktop\\F18_DIGHT360\\final-corpus\\*')
pscore = []
stars = []
rank = ['1', '2', '3', '4', '5']
xs = range(len(rank))

for name in files:
    with open(name, encoding='utf8') as reviews:
        for review in reviews:
            star = review[0]  # the star rating is the first character of each line
            if star in rank:
                stars.append(star)
                polarity = analyzer.polarity_scores(review)
                pscore.append(polarity['compound'])
                # print('.', end=' ', flush=True)
            else:
                continue

index = np.arange(5)
print('done processing files')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia
import MySQLdb
import sys

# Tweets containing these tokens (place names and French function words) are skipped
exclude = [
    'Basilica', 'basilica', 'France', 'Paris', 'Montreal', ' de ', ' dans ',
    ' le '
]

analyzer = sia(
    lexicon_file='/var/www/html/cse30246/twinsight/project/vader_lexicon.txt')

db = MySQLdb.connect("localhost", "mstaines", "#1Smarty", "twinsight")

# Instantiate one cursor for reading and one for writing
c = db.cursor()
w = db.cursor()

# Run the query and walk the results one row at a time
c.execute("""SELECT tweetID, text FROM tweets2""")
count = 0
row = c.fetchone()
while row:
    sys.stdout.write("Row " + str(count) + " of less than 140000\r")
    text = row[1]
    if any(ex in text for ex in exclude):
        pass
    else:
        sent = analyzer.polarity_scores(text)
        w.execute("""UPDATE tweets2 SET sentiment = %s WHERE tweetID = %s""", (
            sent['compound'],
            row[0],
        ))
    # The snippet was truncated here; the loop must advance to terminate
    # (and the UPDATEs need a db.commit() once the loop finishes).
    count += 1
    row = c.fetchone()
# These bits of data will be in this format: ['sentiment score of paragraph', paragraph number]
data_2018 = []
data_2017 = []
current_sentiment = []
all_sentiment = []

# Reset this variable every time a letter is done
paragraph_count = 1

# Loop through the paragraphs in 2017
for paragraph in letter2017.split("\n"):
    if paragraph == '':
        continue
    all_sentiment = []
    # Loop through the words in the paragraph
    for word in paragraph.split():
        current_sentiment = sia().polarity_scores(word)
        all_sentiment.append(current_sentiment['compound'])
    # Add that paragraph's data to the 2017 data set
    data_2017.append([statistics.mean(all_sentiment), paragraph_count])
    # This keeps track of where you are in the letter
    paragraph_count += 1
    print(paragraph)

# Create a dataframe so we can plot the data later on
df_2017 = pd.DataFrame(data_2017, columns=['Sentiment', 'Paragraph'])

paragraph_count = 1

# Loop through the paragraphs in 2018
for paragraph in letter2018.split("\n"):
    if paragraph == '':
        continue
    all_sentiment = []
text

chars_to_remove = ["\t", "\n"]
sc = set(chars_to_remove)
text = ''.join([c for c in text if c not in sc])
text

sentences = sent_tokenize(text)
sentences2 = sentences
sentences2

tokens = word_tokenize(text)
tokens

from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia

sentim = sia()
cc = []
for sentence in sentences2:
    cc.append(sentim.polarity_scores(sentence))

len(cc)
len(sentences2)
cc[0]

neu = []
neg = []
for sentence in sentences2:
    ss = sentim.polarity_scores(sentence)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
        neg.append(ss[k])