def _analyze_sentiment(self):
    """Compute the mean sentiment of the Message's sentences."""
    # TODO: If user mentions a known restaurant by name, treat that message as a Review.
    self._tokenize()  # populate the sentences array
    sentiments_sum = 0  # sum of vaderSentiment SentimentIntensityAnalyzer "compound" scores
    analyzer = vs.SentimentIntensityAnalyzer()
    for sentence in self._sentences:
        sentence_sentiment = analyzer.polarity_scores(sentence)["compound"]
        sentiments_sum += sentence_sentiment
        for word in sentence.split():
            # TODO: time complexity needlessly bad; VSA already made one pass. Subclass VSA
            # into a custom MessageAnalyzer and change just the one method by making it also
            # check for the keywords.
            # TODO: implement binary search
            word = word.lower().strip()
            # TODO: refactor this preprocessing to a helper
            while word and not word[-1].isalpha():  # strip ending punctuation; guard against all-punctuation tokens
                word = word[:-1]
            if not word:
                continue
            # TODO: should we search character-wise to deal with plurals? Or always strip "s" from plurals?
            if self._bsearch_taste_keywords(word):
                self.sender.update_tastes(taste=word, strength=sentence_sentiment)
                # TODO: improve the business logic. Right now, this merely treats the sentiment of
                # the sentence in which the word appeared as the user's sentiment toward that taste.
                # TODO: update the sender user object literal's tastes data. MessageModelInterface
                # is responsible for writing the changes to both the message data and the user data.
    self._sentiment_avg = round(sentiments_sum / len(self._sentences),
                                SENTIMENT_DECIMAL_PLACES)
    return self._sentiment_avg
    # TODO: consider an ABC for the text-based objects--reviews and messages. Lots of reusable operations.
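The binary search the todo above asks for could look like the following sketch. The helper signature matches the call site, but _TASTE_KEYWORDS and its contents are assumptions for illustration; the list must be kept sorted for bisect to work.

import bisect

_TASTE_KEYWORDS = ["bitter", "salty", "savory", "sour", "spicy", "sweet"]  # hypothetical keyword list, must stay sorted

def _bsearch_taste_keywords(self, word):
    """Return True if word is a known taste keyword, via O(log n) binary search."""
    i = bisect.bisect_left(_TASTE_KEYWORDS, word)
    return i < len(_TASTE_KEYWORDS) and _TASTE_KEYWORDS[i] == word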
def func():
    st = request.form["review"]
    if st == '':
        return render_template('index.html')
    english = spacy.load("en_core_web_sm")
    result = english(st)
    sentences = [str(s) for s in result.sents]
    analyzer = vaderSentiment.SentimentIntensityAnalyzer()
    sentiment = [analyzer.polarity_scores(str(s)) for s in sentences]
    if sentiment[0]['compound'] >= 0.05:
        sent = "Positive "
        emoji = 128512
        address = 'https://st.depositphotos.com/1016482/2236/i/950/depositphotos_22362437-stock-photo-background-with-heap-of-yellow.jpg'
    elif sentiment[0]['compound'] <= -0.05:
        sent = "Negative "
        emoji = 128577
        address = 'https://www.ecopetit.cat/wpic/mpic/270-2706765_sad-emoji-cover-photo-for-fb.jpg'
    else:
        sent = "Neutral "
        emoji = 128528
        address = 'https://atlas-content-cdn.pixelsquid.com/stock-images/neutral-face-facial-expression-L63Mrq1-600.jpg'
    return render_template('output.html', sentence=st, sent=sent, emoji=emoji, address=address)
class VaderAnalyzer(Analyzer):
    analyzer = vader.SentimentIntensityAnalyzer()

    def __init__(self):
        pass

    def get_value(self, word):
        """Get the value of a single word."""
        wordlist = self.validate(word)
        if len(wordlist) > 1:
            raise ValueError("get_value takes only a single word!")
        return self.analyzer.polarity_scores(wordlist[0])['compound']

    def get_values(self, textlist):
        """Get values of a list of strings.

        textlist (list): list of strings, input sentences

        Returns:
            vals (list): list of floats, sentiment for each input sentence
        """
        vals = []
        for text in textlist:
            vals.append(self.analyzer.polarity_scores(text)['compound'])
        return vals
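A minimal usage sketch, assuming the inherited Analyzer base class supplies the validate() tokenizer used above:

analyzer = VaderAnalyzer()
print(analyzer.get_value("wonderful"))                    # compound score of one word, in [-1, 1]
print(analyzer.get_values(["I love it.", "I hate it."]))  # one compound score per sentence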
def vader_analyze(twitter_input):
    analyzer = vS.SentimentIntensityAnalyzer()
    pos = []
    neg = []
    neu = []
    com = []
    # VADER analysis
    for tweet in twitter_input:
        analyzed_tweet = analyzer.polarity_scores(tweet)
        pos.append(analyzed_tweet['pos'])
        neg.append(analyzed_tweet['neg'])
        neu.append(analyzed_tweet['neu'])
        com.append(analyzed_tweet['compound'])
    # Average calculation
    ave_pos = sum(pos) / float(len(pos))
    ave_neu = sum(neu) / float(len(neu))
    ave_neg = sum(neg) / float(len(neg))
    # Setting up Plot.ly graphing
    labels = ['Positive', 'Neutral', 'Negative']
    values = [ave_pos, ave_neu, ave_neg]
    trace = Pie(labels=labels, values=values)
    data = [Histogram(x=com)]
    # Creates the Plot.ly graphs and stores them in a list of strings; each graph is an HTML div.
    divs = [
        plotly.offline.plot([trace], include_plotlyjs=False, output_type='div'),  # pie chart
        plotly.offline.plot(data, include_plotlyjs=False, output_type='div')      # histogram
    ]
    return divs
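A brief usage sketch: because include_plotlyjs=False, the page that embeds the returned divs must load plotly.js itself (the sample tweets here are made up).

pie_div, hist_div = vader_analyze(["great launch!", "terrible service", "it was okay"])
html = ('<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>\n'
        + pie_div + hist_div)  # embed both divs under one plotly.js include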
def getVaderscores(self):
    '''Gets tweets from movie_tweets, calculates VADER scores and stores them back in movie_tweets.'''
    logging.info("----Calculating VADER Sentiment Scores----")
    tweets = self.movie_tweets['vaderTweet']
    vader = vaderSentiment.SentimentIntensityAnalyzer()
    sentScores_neg = []
    sentScores_neu = []
    sentScores_pos = []
    sentScores_cpd = []
    for tweet in tqdm(tweets):
        doc = self.nlp(tweet)
        sentScore = vader.polarity_scores(str(doc))
        sentScores_neg.append(sentScore['neg'])
        sentScores_neu.append(sentScore['neu'])
        sentScores_pos.append(sentScore['pos'])
        sentScores_cpd.append(sentScore['compound'])
    scores_df = pd.DataFrame({
        'neg': sentScores_neg,
        'neu': sentScores_neu,
        'pos': sentScores_pos,
        'cpd': sentScores_cpd
    })
    self.movie_tweets = pd.concat([self.movie_tweets, scores_df], axis=1)
    logging.info("----Scores Appended to Processed Dataframe----")
def main(args):
    spark = sql.SparkSession.builder.appName('update-mutator').getOrCreate()

    msg_struct = types.StructType([
        types.StructField('text', types.StringType(), True),
        types.StructField('user_id', types.StringType(), True),
        types.StructField('update_id', types.StringType(), True)
    ])

    sentiments_struct = types.ArrayType(
        types.MapType(types.StringType(), types.FloatType(), False))

    analyzer = vader.SentimentIntensityAnalyzer()
    analyzer_bcast = spark.sparkContext.broadcast(analyzer)

    def sentiment_generator_impl(text):
        # Score each sentence of the update with the broadcast analyzer.
        va = analyzer_bcast.value
        english = SpacyMagic.get('en_core_web_sm')
        result = english(text)
        sents = [str(sent) for sent in result.sents]
        sentiment = [va.polarity_scores(str(s)) for s in sents]
        return sentiment

    sentiment_generator = functions.udf(sentiment_generator_impl, sentiments_struct)

    def json_converter_impl(user_id, update_id, text, sentiments):
        obj = dict(user_id=user_id,
                   update_id=update_id,
                   text=text,
                   sentiments=sentiments)
        return json.dumps(obj)

    json_converter = functions.udf(json_converter_impl, types.StringType())

    # Read updates from Kafka, attach per-sentence sentiments, and write back to Kafka.
    records = (spark
               .readStream.format('kafka')
               .option('kafka.bootstrap.servers', args.brokers)
               .option('subscribe', args.intopic)
               .load()
               .select(functions.column('value').cast(types.StringType()).alias('value'))
               .select(functions.from_json(functions.column('value'), msg_struct).alias('json'))
               .select(functions.column('json.user_id'),
                       functions.column('json.update_id'),
                       functions.column('json.text'),
                       sentiment_generator(functions.column('json.text')).alias('sentiments'))
               .select(json_converter(functions.column('user_id'),
                                      functions.column('update_id'),
                                      functions.column('text'),
                                      functions.column('sentiments')).alias('value'))
               .writeStream.format('kafka')
               .option('kafka.bootstrap.servers', args.brokers)
               .option('topic', args.outtopic)
               .option('checkpointLocation', '/tmp')
               .start())

    records.awaitTermination()
def _analyze_sentiment(self) -> float:
    self._tokenize()
    analyzer = vs.SentimentIntensityAnalyzer()
    sentiments_sum = 0  # sum of VSA "compound" scores
    for sentence in self._sentences:
        sentiments_sum += analyzer.polarity_scores(sentence)["compound"]
    sentiments_mean = round(sentiments_sum / len(self._sentences),
                            SENTIMENT_DECIMAL_PLACES)  # the VSA demo rounds to 4 decimal places
    self._sentiment = sentiments_mean
    return self._sentiment
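The same mean-of-compound-scores idea as a standalone sketch; SENTIMENT_DECIMAL_PLACES = 4 is an assumption based on the comment above, and the empty-input guard is an addition:

from vaderSentiment import vaderSentiment as vs

SENTIMENT_DECIMAL_PLACES = 4  # assumed, matching the VSA demo

def mean_compound(sentences):
    """Mean VADER compound score across a list of sentences (0.0 for empty input)."""
    analyzer = vs.SentimentIntensityAnalyzer()
    scores = [analyzer.polarity_scores(s)["compound"] for s in sentences]
    return round(sum(scores) / len(scores), SENTIMENT_DECIMAL_PLACES) if scores else 0.0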
def getNewsSentiment(source):
    analyzer = vaderSentiment.SentimentIntensityAnalyzer()
    news_list = news2.getNews(source)
    sentiment = 0
    for each in news_list:
        scores = analyzer.polarity_scores(each)
        sentiment += (scores['pos'] - scores['neg'])
    return sentiment, news_list
def get_vader(video_id):
    comments = get_comments(video_id)
    analyzer = vaderSentiment.SentimentIntensityAnalyzer()
    vaders = []
    for comment in comments:
        # pair each comment's text with its polarity scores
        vs = (comment['message'], analyzer.polarity_scores(comment['message']))
        vaders.append(vs)
    return vaders
def sentiment(data):
    sentiment = []
    analyzer = vd.SentimentIntensityAnalyzer()
    for i in data:
        result = analyzer.polarity_scores(i)["compound"]
        sentiment.append(result)
    if len(sentiment) == 0:
        return 0
    else:
        return sum(sentiment) / len(sentiment)
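A quick usage sketch; vd is assumed to be the vaderSentiment module alias used above:

from vaderSentiment import vaderSentiment as vd  # assumed alias

print(sentiment(["The food was amazing.", "The wait was painful."]))  # mean compound score
print(sentiment([]))  # 0, thanks to the empty-input guard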
def main(args):
    spark = sql.SparkSession.builder.appName('update-analyzer').getOrCreate()

    msg_struct = types.StructType([
        types.StructField('text', types.StringType(), True),
        types.StructField('user_id', types.StringType(), True),
        types.StructField('update_id', types.StringType(), True)
    ])

    analyzer = vader.SentimentIntensityAnalyzer()
    analyzer_bcast = spark.sparkContext.broadcast(analyzer)
    vhost_bcast = args.vhost
    vport_bcast = args.vport

    def sentiment_generator_impl(text, user_id, update_id):
        va = analyzer_bcast.value
        english = SpacyMagic.get('en_core_web_sm')
        result = english(text)
        sents = [str(sent) for sent in result.sents]
        sentiments = [va.polarity_scores(str(s)) for s in sents]
        obj = dict(user_id=user_id,
                   update_id=update_id,
                   text=text,
                   sentiments=sentiments)
        try:
            con = httplib.HTTPConnection(host=vhost_bcast, port=vport_bcast)
            con.request('POST', '/', body=json.dumps(obj))
            con.close()
        except Exception as e:
            logging.warn('unable to POST to visualizer, error:')
            logging.warn(e.message)

    sentiment_generator = functions.udf(sentiment_generator_impl, types.NullType())

    records = (spark
               .readStream.format('kafka')
               .option('kafka.bootstrap.servers', args.brokers)
               .option('subscribe', args.topic)
               .load()
               .select(functions.column('value').cast(types.StringType()).alias('value'))
               .select(functions.from_json(functions.column('value'), msg_struct).alias('json'))
               .select(functions.column('json.user_id'),
                       functions.column('json.update_id'),
                       functions.column('json.text'),
                       sentiment_generator(functions.column('json.text'),
                                           functions.column('json.user_id'),
                                           functions.column('json.update_id')))
               .writeStream.format("console")
               .start())

    records.awaitTermination()
def sent_analyze(sentence):
    """Analyze a given sentence/block of text."""
    english = spacy.load("en_core_web_sm")  # load the spaCy language model
    # nlp = en_core_web_sm.load()
    result = english(sentence)
    sentences = [str(s) for s in result.sents]  # split into sentences
    # create an analyzer using vaderSentiment
    analyzer = vaderSentiment.SentimentIntensityAnalyzer()
    # calculate a sentiment rating per sentence
    sentiment = [analyzer.polarity_scores(str(s)) for s in sentences]
    return sentiment
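A short usage sketch (assumes the en_core_web_sm spaCy model has been downloaded):

scores = sent_analyze("The plot dragged. The acting, though, was superb.")
for s in scores:
    print(s)  # one dict per sentence: {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}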
def analyzer_cols(row):
    # INITIALIZE VADER
    analyzer = vd.SentimentIntensityAnalyzer()
    # RETRIEVE SCORES
    scores = analyzer.polarity_scores(row['QUOTE'])
    row['neg'] = scores['neg']
    row['neu'] = scores['neu']
    row['pos'] = scores['pos']
    row['compound'] = scores['compound']
    return row
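A usage sketch with pandas; df is a hypothetical DataFrame with a QUOTE column. Hoisting the analyzer out of the row function avoids rebuilding it for every row:

import pandas as pd
from vaderSentiment import vaderSentiment as vd  # assumed alias

analyzer = vd.SentimentIntensityAnalyzer()  # built once, reused for every row

def analyzer_cols_fast(row):
    scores = analyzer.polarity_scores(row['QUOTE'])
    row['neg'], row['neu'], row['pos'], row['compound'] = (
        scores['neg'], scores['neu'], scores['pos'], scores['compound'])
    return row

df = pd.DataFrame({'QUOTE': ["Loved it.", "Never again."]})
df = df.apply(analyzer_cols_fast, axis=1)  # adds neg/neu/pos/compound columns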
def sentiment(data):
    analyzer = vaderSentiment.SentimentIntensityAnalyzer()
    compound = []
    positive = []
    neutral = []
    negative = []
    for t in data['tweet']:
        vs = analyzer.polarity_scores(str(t))
        compound.append(vs['compound'])
        positive.append(vs['pos'])
        neutral.append(vs['neu'])
        negative.append(vs['neg'])
    data['compound'] = compound
    data['positive'] = positive
    data['neutral'] = neutral
    data['negative'] = negative
    return data
def is_threat(text, lat, lon):
    """
    Checks whether the sentiment is threatening, related to Merck, and within range of a facility.

    :param: raw text
    :param: latitude
    :param: longitude
    :return: response body if nonthreat or null if it is a threat
    """
    if not filterText(text):
        return buildResponse(body=json.dumps(
            {"message": "Not related to Merck or its interests"}), status=200)

    analyzer = vaderSentiment.SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(text)

    neg = False
    if sentiment['neg'] > 0.5:
        neg = True
    elif sentiment['pos'] < 0.1 and sentiment['neg'] > 0.3:
        neg = True

    # if sentiment is neutral or positive we don't want to waste
    # finite db space
    if not neg:
        return buildResponse(body=json.dumps(
            {"message": "Nonnegative sentiment"}), status=200)

    # check to see if the threat is in the range of a Merck facility
    if not inRangeOfMerckFacility(lat, lon):
        return buildResponse(body=json.dumps(
            {"message": "Not in range of a facility"}), status=200)

    return None
def sentiment_return(request):
    data = json.loads(request.body)
    print(data)
    q = data["query"]
    month = "01"
    day = "01"
    year = "2020"
    count = "2"

    # function that pulls tweets
    def get_tweets():
        # twitter dev credentials here:
        consumer_key = secrets.ck1
        consumer_secret = secrets.cs1
        access_token = secrets.at1
        access_token_secret = secrets.ats1
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
        # compile tweets to be analyzed
        tweet_array = []
        for tweet in tweepy.Cursor(api.search, q=f"{q}", count=f"{count}", lang="en",
                                   since=f"{year}-{month}-{day}").items():
            tweet_array.append({"created": tweet.created_at,
                                "body": tweet.text.encode('utf-8')})
        return tweet_array

    # call get_tweets()
    tweet_list = get_tweets()

    # sentiment analysis setup
    analyzer = vader.SentimentIntensityAnalyzer()
    english = spacy.load("en_core_web_sm")

    # define get_sentiments function to process tweets
    def get_sentiments(text_list):
        text = "\n".join([str(tweet["body"]) for tweet in text_list])
        result = english(text)
        sentences = [str(sent) for sent in result.sents]
        sentiments = [analyzer.polarity_scores(str(s)) for s in sentences]
        return sentiments

    # define analyze_tweets function
    def analyze_tweets(tweet_list):
        sentiments = get_sentiments(tweet_list)
        # open/create a file to append data
        with open('sentiment.csv', 'a') as csvFile:
            fieldnames = ["neg", "neu", "pos", "compound"]
            csvWriter = csv.DictWriter(csvFile, fieldnames=fieldnames)
            csvWriter.writeheader()
            for sent in sentiments:
                csvWriter.writerow(sent)

    analyze_tweets(tweet_list)

    def find_sent_mean():
        df = pd.read_csv("sentiment.csv")
        mean = df.mean()
        mean = mean.drop(["neu", "compound"])
        neg = float(mean[0]) * 100
        pos = float(mean[1]) * 100
        neg = format(neg, '.1f')
        pos = format(pos, '.1f')
        print("\n----------- Tweet Sentiment -----------\n")
        print(f"Negative: {neg}\nPositive: {pos}")
        os.remove("sentiment.csv")
        return neg, pos

    # call find_sent_mean function
    neg, pos = find_sent_mean()

    # define generate_wordcloud function
    def generate_wordcloud(tweet_list):
        # define now for naming wordcloud.png
        now = datetime.now()
        # create wordcloud from tweet_list; remove stopwords & irrelevant phrases
        stop_words = set(nltk.corpus.stopwords.words("english"))
        words = " ".join(
            r for _d in tweet_list
            for r in _d['body'].decode('utf-8')
                               .replace('https', "").replace('photo', '')
                               .replace('RT', '').replace('co', '').split()
            if r not in stop_words)
        WordCloud(width=800, height=400, background_color="white", max_words=5000,
                  contour_width=3, contour_color="steelblue")\
            .generate_from_text(words).to_file("static/wordcloud.png")

    generate_wordcloud(tweet_list)

    return JsonResponse({
        "neg": neg,
        "pos": pos,
        "q": q,
    })
def main():
    emojiPattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0001F100-\U0001F1FF"
        u"\U0001F780-\U0001F999"
        u"\u2000-\u206F"
        u"\u2701-\u27BF"
        "]+",
        flags=re.UNICODE)

    fileName = "cleanedTweets.json"
    outputFileName = "finalTweets.json"
    file = open(fileName, "r")
    languages = getLanguages("Languages.txt")
    print(languages)
    # cleanTweets(fileName, outputFileName)
    translator = Translator()
    # with open("dictFile.json") as jsonData:
    #     regionsToSentiments = json.load(jsonData)
    regionsToSentiments = {}
    workingPath = "C:\\Users\\Taylor\\Desktop\\TEST\\Countries"
    analyzer = vaderSentiment.SentimentIntensityAnalyzer()
    usa = "US_EN.json"
    # for root, directories, files in os.walk(workingPath):
    #     for file in files:
    tweet = 0
    realFile = open(usa, "r")
    for line in realFile:
        tweet += 1
        print(tweet)
        tweetObject = json.loads(line)
        originalTweet = tweetObject["text"]
        # originalTweet = emojiPattern.sub(r'', originalTweet)
        '''
        if tweetObject["lang"] != "en":
            if tweetObject["lang"] in languages:
                # Translates the tweet from its native language to English.
                translatedTweet = translator.translate(originalTweet, dest="en",
                                                       src=tweetObject["lang"]).text
            else:
                transObject = translator.translate(originalTweet, dest="en")
                translatedTweet = transObject.text
        else:
            translatedTweet = tweetObject["text"]
        '''
        sentiment = analyzer.polarity_scores(originalTweet)
        coordinates = (
            tweetObject["place"]["bounding_box"]["coordinates"][0][0][0],
            tweetObject["place"]["bounding_box"]["coordinates"][0][0][1])
        # Bucket the (sentiment, coordinates) pair by country code and by the
        # hour extracted from the created_at timestamp.
        countryCode = tweetObject["place"]["country_code"]
        hour = tweetObject["created_at"].split()[3][0:2]
        if countryCode not in regionsToSentiments:
            regionsToSentiments[countryCode] = {}
        if hour not in regionsToSentiments[countryCode]:
            regionsToSentiments[countryCode][hour] = [(sentiment, coordinates)]
        else:
            regionsToSentiments[countryCode][hour].append((sentiment, coordinates))
    print()
    print(regionsToSentiments)
    saveDict(regionsToSentiments, "dictFile.json")
messages.show(2)
+--------------------+------+
|                Text|Target|
+--------------------+------+
|I have bought sev...|     1|
|"Product arrived ...|     0|
+--------------------+------+
only showing top 2 rows

In [42]: messages.count()
Out[42]: 525814

Sentiment Analysis using the VaderSentiment library

In [6]: sentiment = messages.rdd.map(
            lambda x: [x[1], vaderSentiment.SentimentIntensityAnalyzer().polarity_scores(x[0][0:140])])

In [7]: sentiment.cache()
        sentiment.take(2)
Out[7]: [[u'1', {'compound': 0.7902, 'neg': 0.0, 'neu': 0.734, 'pos': 0.266}],
         [u'0', {'compound': -0.1027, 'neg': 0.104, 'neu': 0.808, 'pos': 0.088}]]

In [9]: sentiment_DF = sentiment.map(
            lambda x: (x[0], x[1]['compound'], x[1]['neg'], x[1]['neu'], x[1]['pos'])).toDF()

In [ ]: sentiment_pandas = sentiment_DF.toPandas()

In [83]: # importing the data
         sentiment_pandas = pd.read_csv('D:/Divya/Fall/6330 Harvesting Big Data/FinalProject/Reviews.csv')
         # Sampling
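The map in In [6] constructs a new SentimentIntensityAnalyzer for every record. A sketch of the same scoring that builds one analyzer per partition instead:

def score_partition(rows):
    analyzer = vaderSentiment.SentimentIntensityAnalyzer()  # built once per partition
    for x in rows:
        yield [x[1], analyzer.polarity_scores(x[0][0:140])]

sentiment = messages.rdd.mapPartitions(score_partition)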
def __init__(self):
    self.analyser = vaderSentiment.SentimentIntensityAnalyzer()
def google_businfo(self, return_hours=False):
    """
    Parse the responses from get_details() and get_distance_time() into a
    dictionary for each Google Maps Place ID.
    """
    # TODO: Find a simpler way to return hours of operation data.
    dict_businfo = {
        "place_id": self.place_ids,
        "phone_formatted": [],
        "phone": [],
        "site": [],
        "distance": [],
        "trip_time": [],
        "goog_sent_pos": [],
        "goog_sent_neu": [],
        "goog_sent_neg": [],
        "goog_sent_comp": [],
        "goog_rev_avgrating": []
    }
    sent_scorer = vs.SentimentIntensityAnalyzer()
    for _id in self.place_ids:
        details = self.google_busdetails(place_id=_id)
        dist_time = self.distance_time(place_id=_id)
        dict_businfo["phone_formatted"].append(
            details["result"]["formatted_phone_number"]
            if "formatted_phone_number" in details["result"].keys() else None)
        # Note: this previously tested for "formatted_phone_number" before reading
        # "international_phone_number"; the key check now matches the key read.
        dict_businfo["phone"].append(
            details["result"]["international_phone_number"]
            if "international_phone_number" in details["result"].keys() else None)
        dict_businfo["phone"] = [
            re.sub(pattern=r"[^\d\+]", repl="", string=x) if x is not None else None
            for x in dict_businfo["phone"]
        ]
        dict_businfo["site"].append(
            details["result"]["website"]
            if "website" in details["result"].keys() else None)
        dict_businfo["distance"].append(dist_time[0])
        dict_businfo["trip_time"].append(dist_time[1])
        if "reviews" not in details["result"].keys():
            for k in ["goog_sent_pos", "goog_sent_neu", "goog_sent_neg",
                      "goog_sent_comp", "goog_rev_avgrating"]:
                dict_businfo[k].append(None)
        else:
            reviews = details["result"]["reviews"]
            sent_scores = [sent_scorer.polarity_scores(r["text"]) for r in reviews]
            dict_businfo["goog_sent_pos"].append(
                sum(s["pos"] for s in sent_scores) / len(sent_scores))
            dict_businfo["goog_sent_neu"].append(
                sum(s["neu"] for s in sent_scores) / len(sent_scores))
            dict_businfo["goog_sent_neg"].append(
                sum(s["neg"] for s in sent_scores) / len(sent_scores))
            dict_businfo["goog_sent_comp"].append(
                sum(s["compound"] for s in sent_scores) / len(sent_scores))
            dict_businfo["goog_rev_avgrating"].append(
                sum(r["rating"] for r in reviews) / len(reviews))

    if return_hours:
        day_idx = {"sun": 0, "mon": 1, "tue": 2, "wed": 3,
                   "thur": 4, "fri": 5, "sat": 6}
        opens = [[] for _ in day_idx]
        closes = [[] for _ in day_idx]
        for day in day_idx.keys():
            try:
                opens[day_idx[day]].append(
                    details["result"]["opening_hours"]["periods"][day_idx[day]]["open"]["time"])
            except Exception:
                opens[day_idx[day]].append(None)
            try:
                closes[day_idx[day]].append(
                    details["result"]["opening_hours"]["periods"][day_idx[day]]["close"]["time"])
            except Exception:
                closes[day_idx[day]].append(None)
        for day in day_idx.keys():
            dict_businfo["opens_" + day] = opens[day_idx[day]]
            dict_businfo["closes_" + day] = closes[day_idx[day]]

    return dict_businfo
# -*- coding: utf-8 -*-
# encoding=utf8
# NOTE: this script targets Python 2 (reload(sys), setdefaultencoding, print statement).
import sys, json, re
import os
import math

import numpy as np
import scipy
import sklearn.ensemble, sklearn.metrics  # , sklearn.cross_validation
from sklearn.metrics import mean_squared_error, r2_score
from vaderSentiment import vaderSentiment as vs

analyzer = vs.SentimentIntensityAnalyzer()

reload(sys)
sys.setdefaultencoding('utf8')

affect_list = ["anger", "fear", "joy", "sadness"]

cwd = os.getcwd()
print cwd

with open(os.path.join(cwd, 'dataset', 'task1', 'train',
                       'dataset_json_task_1.txt')) as data_file:
    train_data = json.load(data_file)

with open(os.path.join(cwd, 'dataset', 'task1', 'development',
                       'dataset_json_development.txt')) as data_file:
    development_data = json.load(data_file)

with open(os.path.join(cwd, 'dataset', 'task1', 'test',
                       'dataset_json_test.txt')) as data_file:
    test_data = json.load(data_file)

test_data = development_data  # evaluate on the development split
def yelp_sentiment(self, id_match_list):
    dict_sentiment = {
        "place_id": [],
        "yelp_id": [],
        "yelp_sent_pos": [],
        "yelp_sent_neu": [],
        "yelp_sent_neg": [],
        "yelp_sent_comp": [],
        "yelp_rev_avgrating": []
    }
    sent_scorer = vs.SentimentIntensityAnalyzer()
    for _ids in id_match_list:
        if "yelp_id" in _ids.keys():
            endpoint_reviews = "/".join(
                [self.endpoint_base, _ids["yelp_id"], "reviews"])
            getrequest = requests.get(url=endpoint_reviews, headers=self.headers)
            try:
                getresp = getrequest.json()["reviews"]
                # Yelp review text snippets end in a trailing ellipsis, hence the [:-3] slice.
                texts = [r["text"][:-3] for r in getresp]
                # Score each snippet once and average per component.
                sent_scores = [sent_scorer.polarity_scores(t) for t in texts]
                dict_sentiment["place_id"].append(_ids["place_id"])
                dict_sentiment["yelp_id"].append(_ids["yelp_id"])
                dict_sentiment["yelp_sent_pos"].append(
                    sum(s["pos"] for s in sent_scores) / len(sent_scores))
                dict_sentiment["yelp_sent_neu"].append(
                    sum(s["neu"] for s in sent_scores) / len(sent_scores))
                dict_sentiment["yelp_sent_neg"].append(
                    sum(s["neg"] for s in sent_scores) / len(sent_scores))
                dict_sentiment["yelp_sent_comp"].append(
                    sum(s["compound"] for s in sent_scores) / len(sent_scores))
                dict_sentiment["yelp_rev_avgrating"].append(
                    sum(r["rating"] for r in getresp) / len(getresp))
            except Exception:
                dict_sentiment["place_id"].append(_ids["place_id"])
                dict_sentiment["yelp_id"].append(_ids["yelp_id"])
                for key in list(dict_sentiment.keys())[2:]:
                    dict_sentiment[key].append(None)
        else:
            dict_sentiment["place_id"].append(_ids["place_id"])
            for key in list(dict_sentiment.keys())[1:]:
                dict_sentiment[key].append(None)
    return dict_sentiment
import nltk, glob, csv, re
from vaderSentiment import vaderSentiment

lexicons = "data/sentiment_analysis/lexicons/*.tsv"
dataset = "data/sentiment_analysis/tweets/*.tsv"
ps = nltk.stem.PorterStemmer()
stop_words = set(nltk.corpus.stopwords.words('english'))
vader_analyzer = vaderSentiment.SentimentIntensityAnalyzer()


def pretprocess(tweet):
    tweet = re.sub(r'@\w+', '', tweet)  # remove mentions
    tweet = re.sub(r'@(\s+)\w+', '', tweet)
    tweet = re.sub(r'http\S+', '', tweet)  # remove links
    tweet = re.sub(r'\w*\\.\w*', '', tweet)
    tweet = re.sub(r'/\w*', '', tweet)
    tweet = re.sub(r'([^\s\w]|_)+', '', tweet)  # only alphanumeric and space
    tweet = re.sub(r'\W*\b\w{18,60}\b', '', tweet)  # remove big words
    tokenize_tweet = nltk.word_tokenize(tweet)
    tweet = [word for word in tokenize_tweet if word not in stop_words]  # drop stop words
    tweet = [ps.stem(word) for word in tweet]  # stem
    tweet = [word for word in tweet if len(word) > 2]  # drop small words
    return tweet


def check_sentiment_per_lexicon(words, lexicon_path):
    sent = 0
    with open(lexicon_path, 'r') as csvfile:
        rows = [row for row in csv.reader(csvfile, delimiter='\t')]
        for word in words:
            # The original snippet breaks off here. A minimal completion, assuming
            # each lexicon row is laid out as (term, score): add the score of
            # every word found in the lexicon.
            for row in rows:
                if row and row[0] == word:
                    sent += float(row[1])
    return sent
def readData(filename, rnn_filename):
    data = []
    input = []
    output = []
    ids = []
    with open(filename, 'r', encoding="utf-8") as csvfile:
        csvdata = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(csvdata)  # skip header
        analyzer = vaderSentiment.SentimentIntensityAnalyzer()
        for line in csvdata:
            d = []
            i = []
            ids.append(line[0])
            # VADER neg/neu/pos features for columns 1-6
            for col in range(1, 7):
                i.append(line[col])
                sentiment = analyzer.polarity_scores(line[col])
                d.append(sentiment['neg'])
                d.append(sentiment['neu'])
                d.append(sentiment['pos'])
                # d.append(TextBlob(line[col]).sentiment)
            input.append(i)
            data.append(d)
            # length of the two candidate endings
            e1 = line[5].split(' ')
            d.append(len(e1))
            e2 = line[6].split(' ')
            d.append(len(e2))
            if filename != "test.csv":
                output.append(line[7])
    # add RNN features: two consecutive rows of the RNN file belong to one example
    with open(rnn_filename, 'r', encoding="utf-8") as f:
        csvdata = csv.reader(f, delimiter=',', quotechar='"')
        count = 0
        index = 0
        for line in csvdata:
            col = 0 if rnn_filename == "predtest.csv" else 2
            data[index].append(float(line[col]))
            if count % 2 != 0:
                index += 1
            count += 1
    return data, output, ids
# Set up AWS database for storage
HOST = "hedgedb.c288vca6ravj.us-east-2.rds.amazonaws.com"
PORT = 3306
DB_NAME = "scores_timeseries"
DB_USER = "******"
DB_PW = "******"
AWS_RDS = dataset.connect("mysql+pymysql://{}:{}@{}/{}".format(
    DB_USER, DB_PW, HOST, DB_NAME))

db = dataset.connect("sqlite:///tweetbase.db")  # connect Dataset to Tweetbase
db2 = dataset.connect("sqlite:///scorebase.db")
printer = pprint.PrettyPrinter()  # printer object
SIA = sia.SentimentIntensityAnalyzer()  # VADER sentiment object

# Twitter keys (redacted, following the masking convention used for DB_USER above)
CONSUMER_KEY = '******'
CONSUMER_SECRET = '******'
AXS_TOKEN_KEY = '******'
AXS_TOKEN_SECRET = '******'

# Twython API object
TWY = Twython(app_key=CONSUMER_KEY, app_secret=CONSUMER_SECRET,
              oauth_token=AXS_TOKEN_KEY, oauth_token_secret=AXS_TOKEN_SECRET)

# tweepy object
auth = tweepy.OAuthHandler(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET)
auth.set_access_token(key=AXS_TOKEN_KEY, secret=AXS_TOKEN_SECRET)
from __future__ import print_function
import json
import time
import boto3
import vaderSentiment.vaderSentiment as vader

print('Loading function')
analyzer = vader.SentimentIntensityAnalyzer()


# This method calls lambda_handler2
def call_lambda_handler2(data):
    client = boto3.client("lambda")
    response = client.invoke(
        FunctionName='lambda_handler2',
        InvocationType='RequestResponse',
        Payload=json.dumps(data),
    )
    res = response['Payload']
    return json.load(res)


# This method takes a tweet as argument and returns its sentiment
def get_sentiment(tweet):
    scores = analyzer.polarity_scores(tweet['text'])
    compound = scores['compound']
    if compound < -0.05:
        sentiment = "Negative"
        score = scores["neg"]
    # The original snippet is cut off here; the branches below are a completion
    # assumed from the conventional VADER compound thresholds.
    elif compound >= 0.05:
        sentiment = "Positive"
        score = scores["pos"]
    else:
        sentiment = "Neutral"
        score = scores["neu"]
    return sentiment, score  # assumed return shape
# import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mtd
# from newsapi import newsapi_client
import vaderSentiment.vaderSentiment as sia
import IEX_API_Client as IEX_Client

# set up IEX_API_Client
IEX = IEX_Client.IEX_API_Client()

# set up NewsAPI (key redacted)
# NEWS_API = newsapi_client.NewsApiClient(api_key='******')

# set up VADER sentiment analyzer
SIA = sia.SentimentIntensityAnalyzer()


###-------------------- IEX Methods --------------------###

def iex_format_data(symbol, data):
    """
    INPUT decoded JSON data from the IEX news archive; output of "get_news_data".
    RETURN {symbol: [(article text, polarity score, source, datetime)]} for a SINGLE stock symbol
    """
    results_dict = {}
    results_dict[symbol] = []
    for article_dict in data:
        # combine headline and summary, get polarity score
        # (the original snippet is cut off here; the lines below are a completion
        # assumed from the docstring and the IEX news field names)
        text = article_dict["headline"] + " " + article_dict["summary"]
        score = SIA.polarity_scores(text)["compound"]
        results_dict[symbol].append(
            (text, score, article_dict["source"], article_dict["datetime"]))
    return results_dict
consumer_secret = lines[1].rstrip()

# set up the tweepy API object
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(auth)

# lat, long and radius of each country, measured with Google Maps
geoList = {
    "UK_GEO": "54.364115800619615,-3.7233340937396093,505km",
    "USA_GEO": "54.19653024080003,-98.03399875931424,2500km",
    "AUS_GEO": "-27.606338814377246,135.2637427077579,2000km",
    "JAMAICA_GEO": "18.13788732831686,-77.24297002881919,128km",
    "NEWZEALAND_GEO": "-41.876952864666166,173.64569158277476,854km",
    "INDIA_GEO": "20.09904976266362,79.35616263260019,1300km",
    "NIGERIA_GEO": "8.574482569303768,7.542770727835754,500km"
}

analyzer = vader.SentimentIntensityAnalyzer()  # analyzer object from VADER


def starter(input, country1, country2):
    country_1_tweets = tweepy.Cursor(api.search, q=str(input),
                                     geocode=geoList[country1], lang='en',
                                     tweet_mode='extended').items(200)
    country_2_tweets = tweepy.Cursor(api.search, q=str(input),
                                     geocode=geoList[country2], lang='en',
                                     tweet_mode='extended').items(200)
    country_1_tweets = list(map(FullTextHandler, country_1_tweets))
def readData(filename, rnn_filename):
    data = []
    input = []
    output = []
    ids = []
    with open(filename, 'r', encoding="utf-8") as csvfile:
        csvdata = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(csvdata)  # skip header
        analyzer = vaderSentiment.SentimentIntensityAnalyzer()
        for line in csvdata:
            d = []
            i = []
            ids.append(line[0])
            # Sentiment features for the story context (columns 1-4) were tried
            # and are left disabled; only the two candidate endings are scored:
            # for col in range(1, 5):
            #     i.append(line[col])
            #     sentiment = analyzer.polarity_scores(line[col])
            #     d.append(sentiment['neg']); d.append(sentiment['neu']); d.append(sentiment['pos'])
            for col in (5, 6):
                i.append(line[col])
                sentiment = analyzer.polarity_scores(line[col])
                d.append(sentiment['neg'])
                d.append(sentiment['neu'])
                d.append(sentiment['pos'])
            # average word-embedding features
            # d.append(diffAvgWE(line[1] + " " + line[2] + " " + line[3] + " " + line[4], line[5]))
            # d.append(diffAvgWE(line[1] + " " + line[2] + " " + line[3] + " " + line[4], line[6]))
            for e in diffAvgWE(line[5]):
                d.append(e)
            for e in diffAvgWE(line[6]):
                d.append(e)
            # n-gram features
            # d.append(getBigramCount(line[5], line[6]))
            # length of the endings
            e1 = line[5].split(' ')
            d.append(len(e1))
            e2 = line[6].split(' ')
            d.append(len(e2))
            # cosine and Jaccard similarity between the context and each ending
            context = line[1] + line[2] + line[3] + line[4]
            d.append(cosineSim(context, line[5]))
            d.append(cosineSim(context, line[6]))
            d.append(get_jaccard_sim(context, line[5]))
            d.append(get_jaccard_sim(context, line[6]))
            data.append(d)
            input.append(i)
            if filename != "test.csv":
                output.append(line[7])
    # add RNN features: two consecutive rows of the RNN file belong to one example
    with open(rnn_filename, 'r', encoding="utf-8") as f:
        csvdata = csv.reader(f, delimiter=',', quotechar='"')
        count = 0
        index = 0
        for line in csvdata:
            col = 0 if rnn_filename == "predtest.csv" else 2
            data[index].append(float(line[col]))
            if count % 2 != 0:
                index += 1
            count += 1
    print(data[10])
    return data, output, ids