def get_score(candidate, sent_weight, twitter_positions, fb_positions,
              official_positions, nyt_positions):
    """
    Get the position similarity score between a candidate and the user,
    computed using cosine similarity over VADER polarity scores.

    Args:
        candidate: candidate name
        sent_weight: weight given to the sentiment score (the relevance
            score gets 1 - sent_weight)
        twitter_positions: Elasticsearch results from the twitter index
        fb_positions: Elasticsearch results from the fb index
        official_positions: Elasticsearch results from the official index
        nyt_positions: Elasticsearch results from the NYT index

    Returns:
        Weighted similarity score between the candidate and the user.

    Author: Kersing Huang <*****@*****.**>, Matthew Garber
    """
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    text = []
    by_candidate = lambda x: (x['candidate'] == candidate
                              and len(x['result']['hits']['hits']) > 0)
    # wrap filter() in list() so the results can be indexed and re-used (Python 3)
    t = list(filter(by_candidate, twitter_positions))
    if len(t) > 0:
        text.append(t[0]['result']['hits']['hits'][0]['_source']['sm_text'])
    fb = list(filter(by_candidate, fb_positions))
    if len(fb) > 0:
        text.append(fb[0]['result']['hits']['hits'][0]['_source']['sm_text'])

    sia = SentimentIntensityAnalyzer()
    # get candidate polarity scores
    candidate_scores = sia.polarity_scores("".join(text))
    # get user polarity scores
    user_input = json.loads(web.data())
    user_scores = sia.polarity_scores(user_input['position'])
    # compute cosine similarity of these polarity scores
    u_len = vector_length(user_scores)
    c_len = vector_length(candidate_scores)
    sentiment_score = vector_dot(candidate_scores, user_scores) / (u_len * c_len)

    official = list(filter(by_candidate, official_positions))
    nyt = list(filter(by_candidate, nyt_positions))
    relevance_score_sum = 0
    for source in [t, fb, official, nyt]:
        if source:
            relevance_score_sum += source[0]['score']
    relevance_score = relevance_score_sum / 4

    weighted_score = (sent_weight * sentiment_score) + ((1 - sent_weight) * relevance_score)
    return weighted_score
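# A minimal sketch of the vector_length/vector_dot helpers that get_score
# relies on; they are not defined in this snippet, so the behavior below is
# an assumption: the VADER score dict ({'neg', 'neu', 'pos', 'compound'})
# is treated as a vector for the cosine similarity above.
import math

def vector_length(scores):
    # Euclidean norm over the dict's values
    return math.sqrt(sum(v * v for v in scores.values()))

def vector_dot(a, b):
    # dot product over the keys the two score dicts share
    return sum(a[k] * b[k] for k in a if k in b)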
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def nltk_sentiment(tweets):
    sentiment = []
    sid = SentimentIntensityAnalyzer()
    for tweet in tweets:
        st = sid.polarity_scores(tweet)
        sentiment.append(st['compound'])
    return sentiment
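# Example usage of nltk_sentiment (the sample tweets are illustrative only);
# it returns one compound score in [-1, 1] per input string.
sample_tweets = ["I love this!", "This is terrible."]
print(nltk_sentiment(sample_tweets))  # first score positive, second negative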
def analyze_sentiment_vader_lexicon(review, threshold=0.1, verbose=False):
    # pre-process text
    review = normalize_accented_characters(review)
    review = html_parser.unescape(review)
    review = strip_html(review)
    # analyze the sentiment for the review
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(review)
    # get aggregate score and final sentiment
    agg_score = scores['compound']
    final_sentiment = 'positive' if agg_score >= threshold else 'negative'
    if verbose:
        # display detailed sentiment statistics
        positive = str(round(scores['pos'], 2) * 100) + '%'
        final = round(agg_score, 2)
        negative = str(round(scores['neg'], 2) * 100) + '%'
        neutral = str(round(scores['neu'], 2) * 100) + '%'
        sentiment_frame = pd.DataFrame(
            [[final_sentiment, final, positive, negative, neutral]],
            columns=pd.MultiIndex(
                levels=[['SENTIMENT STATS:'],
                        ['Predicted Sentiment', 'Polarity Score', 'Positive',
                         'Negative', 'Neutral']],
                labels=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]]))
        print(sentiment_frame)
    return final_sentiment
def add_sentiment(self):
    print('Adding sentiment...', end=' ')
    sia = SentimentIntensityAnalyzer()
    for sentiment in ('pos', 'neg', 'neu', 'compound'):
        # score only the first 200 characters of each story body
        sentify = lambda s: sia.polarity_scores(s[:200])[sentiment]
        self.df['sentiment_' + sentiment] = self.df['story body'].apply(sentify)
    print('done')
def main():
    parser = argparse.ArgumentParser(
        description="Reads in output from downloadGroupmeMessages and runs "
                    "a sentiment analysis")
    parser.add_argument("inFile", help="The file containing the stored messages")
    parser.add_argument("--outFile", default="out.txt", help="Results go here")
    args = parser.parse_args()
    print("\nThis program prints the most negative and positive users of the "
          "chat, ranked according to their average score from the VADER "
          "sentiment intensity analyzer in the NLTK. Not super accurate, but "
          "it's a fun conversation starter.")
    print("The program takes a few seconds to run, and requires that you have "
          "some of the NLTK corpora installed.")
    with open(args.inFile, 'r') as infile:
        infile.readline()  # skip the header line
        analyzer = SentimentIntensityAnalyzer()
        negList = []
        positiveList = []
        counter = PostSentimentCounter()
        for line in infile:
            line = line.split('\t')
            message = line[3]
            id = line[0]
            name = line[1]
            sentDict = analyzer.polarity_scores(message)
            counter.countPost(id, name, sentDict)
        counter.printSentimentLeaderboards()
def analyze(posts):
    post_json = setup_json()
    # for post, replies in posts.items()
    sid = SentimentIntensityAnalyzer()
    for key, value in posts.items():
        nustring = ' '.join(value[0]).replace("u'", "")
        ss = sid.polarity_scores(nustring)
        for k in sorted(ss):
            if k == "compound":  # '==', not 'is': compare values, not identity
                entry = {}
                entry['name'] = int(ss[k] * len(nustring))
                entry['size'] = len(nustring)
                if ss[k] == 0.0:
                    post_json['children'][1]['children'].append(entry)
                elif ss[k] < -0.8:
                    post_json['children'][2]['children'][2]['children'].append(entry)
                elif ss[k] < -0.4:
                    post_json['children'][2]['children'][1]['children'].append(entry)
                elif ss[k] < -0.0:
                    post_json['children'][2]['children'][0]['children'].append(entry)
                elif ss[k] < 0.4:
                    post_json['children'][0]['children'][0]['children'].append(entry)
                elif ss[k] < 0.8:
                    post_json['children'][0]['children'][1]['children'].append(entry)
                else:
                    post_json['children'][0]['children'][2]['children'].append(entry)
    return post_json
def sentiment_analytis_text(self, text_insert):
    text = text_insert
    token_text = tokenize.sent_tokenize(text)
    sid = SentimentIntensityAnalyzer()
    over_all_sentiment = 0
    count = 0
    for sentence in token_text:
        score = sid.polarity_scores(sentence)
        # Accumulate the overall sentiment score
        over_all_sentiment += score.get("compound")
        # If the sentence is not neutral, add it to the sentence count
        # used for the average
        if abs(score.get("compound")) > 0.1:
            count += 1
    # Calculate the average sentiment
    if count > 0:
        average_sentiment = over_all_sentiment / count
    else:
        average_sentiment = over_all_sentiment
    return average_sentiment
def add_sentiment_to_comments():
    sia = SentimentIntensityAnalyzer()
    for story_comment_list in comments.values():
        for comment in story_comment_list:
            if "text" in comment:
                comment["sentiment"] = sia.polarity_scores(comment["text"])
                print(comment)  # here's where to add sentiment using nltk to text
def on_data(self, raw_data):
    tweet = loads(raw_data)
    try:
        text = tweet['text']
        if tweet.get('retweeted_status') is None and 'RT @' not in text:
            if tweet.get('coordinates') is None:
                # TODO: Check for rate limit. If rate limited, then perform
                # location inference
                nouns = self._get_nouns(tweet_text=text)
                # bf = BilateralFriends(user_id=tweet['user']['id'],
                #                       twitter_api=self.api)
                # loc_occurrence_count = bf.get_location_occurrence()
                tweet_nouns = defaultdict(int)
                for noun in nouns:
                    tweet_nouns[noun] += 1
                self.corpus[tweet['user']['id']] = {
                    'id': tweet['user']['id'],
                    'location': tweet['user']['location'],
                    # 'bilateral_friends_location_occurrences': loc_occurrence_count,
                    'text_nouns': tweet_nouns
                }
                loc_inf = LocationInference(user=self.corpus[tweet['user']['id']],
                                            local_words=self.local_words,
                                            geo_words=self.geo_words)
                inferred_location = loc_inf.get_location()
                print(inferred_location)
                print('Predicted location:', inferred_location[0])
                tweet['coordinates'] = {
                    'type': 'Point',
                    'coordinates': [LOCATIONS[inferred_location[0]][1],
                                    LOCATIONS[inferred_location[0]][0]]
                }
                print(tweet['coordinates'])
            sentiment_analyzer = SentimentIntensityAnalyzer()
            sentiment_score = sentiment_analyzer.polarity_scores(text=text)['compound']
            tweet['sentiment'] = sentiment_score
            current_time_ms = int(round(time() * 1000))
            tweet['time_inserted'] = current_time_ms
            print(text, ': ', str(sentiment_score))
            STREAM_BUFFER.insert(tweet)
    except KeyError as v:
        print('KeyError: ', v)
def get_tweets(q, today):
    r = api.request(
        "search/tweets",
        {"q": "%s since:%s" % (q, today),
         "count": "100",
         "result_type": "recent",
         "lang": "en"}
    )
    data = (json.loads(r.text))["statuses"]
    sid = SentimentIntensityAnalyzer()
    all_tweets = []
    for i in range(0, len(data)):
        text = data[i]["text"].encode("ascii", "ignore").decode("ascii")
        RT = "RT" in text
        others = text.count("@")
        sent = TextBlob(text)
        valance = sent.sentiment.polarity
        NLTK = sid.polarity_scores(text)
        tweet_data = {
            "tweetID": data[i]["id"],
            "created_at": data[i]["created_at"],
            "text": text,
            "textblob": valance,
            "NLTK": NLTK["compound"],
            "RT": RT,
            "others": others,
        }
        # print(data[i])
        all_tweets.append(tweet_data)
    return all_tweets
def sentiment_by_subreddit():
    phrase = urllib.parse.quote(request.form["text"])
    year = urllib.parse.quote(request.form["year"])
    sid = SentimentIntensityAnalyzer()
    year_str = str(year)
    if int(year) > 2014:
        year_str += "_01"
    query = '''SELECT subreddit, body, score
               FROM (SELECT subreddit, body, score, RAND() AS r1
                     FROM [fh-bigquery:reddit_comments.''' + year_str + ''']
                     WHERE REGEXP_MATCH(body, r'(?i:''' + phrase + ''')')
                     AND subreddit IN
                         (SELECT subreddit
                          FROM (SELECT subreddit, count(*) AS c1
                                FROM [fh-bigquery:reddit_comments.''' + year_str + ''']
                                WHERE REGEXP_MATCH(body, r'(?i:''' + phrase + ''')')
                                AND score > 1
                                GROUP BY subreddit
                                ORDER BY c1 DESC
                                LIMIT 10))
                     ORDER BY r1
                     LIMIT 5000)'''
    bigquery_service = build('bigquery', 'v2', credentials=credentials)
    try:
        query_request = bigquery_service.jobs()
        query_data = {
            'query': query,
            'timeoutMs': 30000
        }
        query_response = query_request.query(
            projectId=bigquery_pid, body=query_data).execute()
    except HttpError as err:
        print('Error: {}'.format(err.content))
        raise err
    subreddit_sentiments = defaultdict(list)
    subreddit_total = defaultdict(int)
    if 'rows' in query_response:
        rows = query_response['rows']
        for row in rows:
            subreddit = row['f'][0]['v']
            body = row['f'][1]['v']
            score = int(row['f'][2]['v'])
            sentiment_values = []
            lines_list = tokenize.sent_tokenize(body)
            for sentence in lines_list:
                if phrase.upper() in sentence.upper():  # (regex.search(sentence))
                    s = sid.polarity_scores(sentence)
                    sentiment_values.append(s['compound'])
            if not sentiment_values:
                continue  # no sentence mentioned the phrase; avoid dividing by zero
            comment_sentiment = float(sum(sentiment_values)) / len(sentiment_values)
            subreddit_sentiments[subreddit].append((comment_sentiment, score))
            subreddit_total[subreddit] += int(score)
    subreddit_sentiments = {
        subreddit: 1 + float(sum(float(pair[0]) * float(pair[1])
                                 for pair in sentiment_list)) / subreddit_total[subreddit]
        for subreddit, sentiment_list in subreddit_sentiments.items()
    }
    result = sorted(subreddit_sentiments.items(), key=lambda kv: (-kv[1], kv[0]))
    return json.dumps(result)
def get_unique_tweets(self, data_dict):
    # TODO: Implement filter to check if Tweet text starts with 'RT'
    """
    :param data_dict:
    :return:
    """
    flag = False
    try:
        # decode back to str so the regex and substring checks work in Python 3
        text = data_dict['text'].encode('ascii', 'ignore').decode('ascii').lower()
        # Check for 'retweeted_status' in metadata field to determine
        # if tweet is a retweet (1st check)
        if 'retweeted_status' not in data_dict:
            url_match = URL.match(text)
            # Check if link contains url
            if url_match:
                match_group = url_match.group()
                if len(self.key_list) > 0:
                    if any(match_group in item for item in self.key_list):
                        flag = True
                if flag is False:
                    data_dict['text'] = match_group
                    print("Inserted text: " + data_dict['text'] + '\n')
                    self.key_list.append(match_group)
                    sid = SentimentIntensityAnalyzer()
                    ss = sid.polarity_scores(text)
                    print(ss['compound'])
                    score = ss['compound']
                    if score < 0:
                        score += (3 * score)
                    for w in GOOGLE:
                        if w in text and self.google_price >= 0:
                            self.google_price = score
                            self.google_text = text
                    for w in MICROSOFT:
                        if w in text and self.microsoft_price >= 0:
                            self.microsoft_price = score
                            self.microsoft_text = text
                    for w in FACEBOOK:
                        if w in text and self.facebook_price >= 0:
                            self.facebook_price = score
                            self.facebook_text = text
                    p.trigger('test_channel', 'my_event',
                              {'google': self.google_price,
                               'microsoft': self.microsoft_price,
                               'facebook': self.facebook_price})
                    p.trigger('tweet_channel', 'my_event',
                              {'google_text': self.google_text,
                               'microsoft_text': self.microsoft_text,
                               'facebook_text': self.facebook_text})
                    self.google_price = 0
                    self.microsoft_price = 0
                    self.facebook_price = 0
                else:
                    self.key_list.append(url_match.group())
    except TypeError as e:
        print(e, file=sys.stderr)
        self.log_error(str(e))
def avg_message_sentiment_helper(self, message):
    sentences = tokenize.sent_tokenize(message)
    sid = SentimentIntensityAnalyzer()
    sentence_sentiments = []
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        sentence_sentiments.append(ss['compound'])
    return np.mean(sentence_sentiments)
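# A standalone sketch of the same sentence-level averaging, with a guard for
# empty messages (np.mean of an empty list returns nan with a warning); the
# function name is illustrative, not part of the original class.
from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np

def avg_message_sentiment(message):
    sid = SentimentIntensityAnalyzer()
    scores = [sid.polarity_scores(s)['compound']
              for s in tokenize.sent_tokenize(message)]
    return float(np.mean(scores)) if scores else 0.0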
def vader(self):
    sid = SentimentIntensityAnalyzer()
    results = {'neg': 0.0, 'pos': 0.0, 'neu': 0.0, 'compound': 0.0}
    ss = sid.polarity_scores(self.text)
    for k in sorted(ss):
        results[k] += ss[k]
    return results
from nltk.sentiment.vader import SentimentIntensityAnalyzer


class Vader:
    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()

    def __call__(self, text):
        scores = self.analyzer.polarity_scores(text)
        return scores['pos'] - scores['neg']
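# Example usage of the callable Vader class above; the score is the positive
# proportion minus the negative proportion, so it falls in [-1, 1].
vader = Vader()
print(vader("What a wonderful day"))  # > 0 for positive text
print(vader("What an awful day"))     # < 0 for negative text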
def sentimentScore(sentences):
    analyzer = SentimentIntensityAnalyzer()
    results = []
    for sentence in sentences:
        vs = analyzer.polarity_scores(sentence)
        print("vs: " + str(vs))
        results.append(vs)
    return results
def get_mean_sentiment(self, exclude_neutral=True):
    sid = SentimentIntensityAnalyzer()
    tot_score = 0
    if exclude_neutral:
        message_count = 0
        for sentence in self.get_message_lst():
            ss = sid.polarity_scores(sentence)
            if ss['compound'] != 0:
                tot_score += ss['compound']
                message_count += 1
    else:
        for sentence in self.get_message_lst():
            ss = sid.polarity_scores(sentence)
            tot_score += ss['compound']
        message_count = len(self.get_message_lst())
    if message_count == 0:
        return 0  # avoid dividing by zero when every message is neutral
    return tot_score / message_count
def computeVaderScore(self, sentence):
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    retList = []
    for k in sorted(ss):
        retList.append(ss[k])
    return retList
def negative_msg_count(self):
    sid = SentimentIntensityAnalyzer()
    msg_count = 0
    for sentence in self.get_message_lst():
        ss = sid.polarity_scores(sentence)
        if ss['compound'] < 0:
            msg_count += 1
    return msg_count
def get_sentiment_score(sentence):
    score_dict = {}
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    for k in sorted(ss):
        # print('{0}: {1}, '.format(k, ss[k]), end='')
        score_dict[k] = ss[k]
    return score_dict
def sentiment(sentence):
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    if float(ss['neg']) > float(ss['pos']):
        return -1 * float(ss['neg'])
    elif float(ss['neg']) < float(ss['pos']):
        return float(ss['pos'])
    else:
        return 0
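# For comparison: several snippets in this collection classify on the
# compound score instead, using the conventional +/-0.05 neutral band from
# the VADER documentation. A minimal sketch of that convention:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def compound_label(sentence):
    compound = SentimentIntensityAnalyzer().polarity_scores(sentence)['compound']
    if compound >= 0.05:
        return 'positive'
    elif compound <= -0.05:
        return 'negative'
    return 'neutral'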
def process(self, tup):
    # extract the sentence
    sentence = tup.values[0]
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    tuple_result = (str(ss['neg']), str(ss['pos']), str(ss['neu']))
    self.emit(tuple_result)
def getSentiments(sentences):
    from nltk import tokenize
    sid = SentimentIntensityAnalyzer()
    sentiments = [None] * len(sentences)
    i = 0
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        sentiments[i] = ss["compound"]
        i = i + 1
    return sentiments
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def vader_sentiment_scores(text_array):
    sid = SentimentIntensityAnalyzer()
    assert all(isinstance(t, str) for t in text_array)
    vs_dict = {'neg': [], 'neu': [], 'pos': [], 'compound': []}
    for i, text in enumerate(text_array):
        if i % 10000 == 0:
            print(i)  # progress indicator for large corpora
        vs = sid.polarity_scores(text)
        for key, value in vs.items():
            vs_dict[key].append(value)
    return vs_dict
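# Example: the dict of parallel lists returned above drops straight into a
# pandas DataFrame, one column per VADER field (the pandas step is an
# assumption, not part of the original snippet).
import pandas as pd

texts = ["Great product", "Awful service", "It arrived on Tuesday."]
scores_df = pd.DataFrame(vader_sentiment_scores(texts))
print(scores_df)  # columns: neg, neu, pos, compound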
def personAvgSentiment(person, month=None, year=None):
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    comp = 0
    sid = SentimentIntensityAnalyzer()
    if month:
        msgLst = fullMessageListMonth(person, month, year)
    else:
        msgLst = fullMessageList(person)
    for message in msgLst:
        sentimentDict = sid.polarity_scores(message)
        comp += sentimentDict['compound']
    return comp / len(msgLst) if len(msgLst) != 0 else 0
def sentiAnalyze():
    analyzer = SentimentIntensityAnalyzer()
    for candidate in tweets:
        for week in tweets[candidate]:
            for tweet in tweets[candidate][week]:
                if candidate not in result['sum']:
                    result['sum'][candidate] = {}
                if 'total' not in result['sum'][candidate]:
                    result['sum'][candidate]['total'] = 0
                if week not in result['sum'][candidate]:
                    result['sum'][candidate][week] = 0
                result['sum'][candidate][week] += 1
                result['sum'][candidate]['total'] += 1
                score = analyzer.polarity_scores(tweet[2])
                if (score['pos'] - score['neg']) > DISTINCT_THRESHOLD:
                    if candidate not in pos_tweets:
                        pos_tweets[candidate] = {}
                    if week not in pos_tweets[candidate]:
                        pos_tweets[candidate][week] = []
                    pos_tweets[candidate][week].append(tweet)
                    if candidate not in result['pos']:
                        result['pos'][candidate] = {}
                    if 'total' not in result['pos'][candidate]:
                        result['pos'][candidate]['total'] = 0
                    if week not in result['pos'][candidate]:
                        result['pos'][candidate][week] = 0
                    result['pos'][candidate][week] += 1
                    result['pos'][candidate]['total'] += 1
                if (score['neg'] - score['pos']) > DISTINCT_THRESHOLD:
                    if candidate not in neg_tweets:
                        neg_tweets[candidate] = {}
                    if week not in neg_tweets[candidate]:
                        neg_tweets[candidate][week] = []
                    neg_tweets[candidate][week].append(tweet)
                    if candidate not in result['neg']:
                        result['neg'][candidate] = {}
                    if 'total' not in result['neg'][candidate]:
                        result['neg'][candidate]['total'] = 0
                    if week not in result['neg'][candidate]:
                        result['neg'][candidate][week] = 0
                    result['neg'][candidate][week] += 1
                    result['neg'][candidate]['total'] += 1
    return
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def vader_sentiment(df):
    sith = SentimentIntensityAnalyzer()
    sentiment = []
    for sentence in df.Message:
        sent = sith.polarity_scores(sentence)
        # sent_total = sent['pos'] - sent['neg']
        sentiment.append(sent['compound'])
    df['sentiment'] = sentiment
    return df
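# Example usage on a toy DataFrame; the 'Message' column name comes from the
# function above, and the sample rows are illustrative.
import pandas as pd

chat = pd.DataFrame({'Message': ["Thanks, that helped a lot!",
                                 "This is the worst."]})
print(vader_sentiment(chat)[['Message', 'sentiment']])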
def ProcessReviews(df, ptype):
    parse_type = ptype
    # Divide reviews into individual sentences
    sentences = df['text'].apply(tokenizetext)
    # Stick the sentences back into the dataframe
    df['sentlist'] = sentences
    d1, d2, d3 = [], [], []
    d4, d5, d6 = [], [], []
    # Initialize the VADER sentiment analyzer
    sid = SentimentIntensityAnalyzer()
    # Loop over sentences and process them
    for i in range(0, df.shape[0]):
        sent_list = df['sentlist'][i]
        for sentence in sent_list:
            sent_raw = ''.join(sentence)
            sent_pro = strip_punctuation(sent_raw)
            sent_pro = rmstopwords(sent_pro)
            sent_pro = lemmatize(sent_pro)
            sentiment = sid.polarity_scores(sent_raw)['compound']
            if parse_type[0] == 'ngram':
                pos = ngrams(sent_pro, ptype[1])
            elif parse_type == 'chunk':
                pos = extract_candidate_chunks(sent_pro)
            elif parse_type == 'rake':
                pos = rake_object.run(sent_raw)
                pos = ['_'.join(word[0].split()) for word in pos]
            for j in pos:
                d1.append(df['date'][i])
                d2.append(df['location'][i])
                d3.append(df['rating'][i])
                d4.append(j)
                d5.append(sentiment)
                d6.append(sent_raw)
    # Put everything in a dataframe
    processed_df = pd.DataFrame()
    processed_df['date'] = d1
    processed_df['location'] = d2
    processed_df['rating'] = d3
    processed_df['aspects'] = d4
    processed_df['sentiment'] = d5
    processed_df['context'] = d6
    # Remove any entry where the sentence was determined to be neutral
    processed_df = processed_df[(processed_df['sentiment'] != 0)]
    return processed_df
def get_news_sentiment(self, team):
    print('------------------------------')
    print('Scanning sentiment for: ', team)
    print('------------------------------')
    story_limit = 5
    neg = 0
    pos = 0
    score = 0  # net polarity across stories; was never initialised originally
    visible_text = ''
    base_url = ('https://www.google.co.uk/search?hl=en&gl=uk&tbm=nws&authuser=0'
                '&q=football+european+championships' + team.replace(' ', '%20'))
    req = urllib.request.Request(base_url, headers={'User-Agent': 'Mozilla/5.0'})
    print(base_url)
    page = urllib.request.urlopen(req)
    soup = BeautifulSoup(page)
    # remove scripts and tags
    for script in soup(["script", "style"]):
        script.extract()  # rip it out
    # find the links
    sentence_count = 0
    story_count = 0
    for a in soup.findAll('a'):
        if story_count < story_limit:
            # ignore internal google links and remove the tracking guff
            # from the end of the URL
            if ('google' not in a.attrs['href']
                    and '/search?q=football+european+championships' not in a.attrs['href']
                    and 'http://' in a.attrs['href']):
                print(a.attrs['href'].replace('/url?q=', '').split('&')[0])
                story_req = urllib.request.Request(
                    a.attrs['href'].replace('/url?q=', '').split('&')[0],
                    headers={'User-Agent': 'Mozilla/5.0'})
                try:
                    story_page = urllib.request.urlopen(story_req, timeout=5)
                    story_soup = BeautifulSoup(story_page)
                    # analyse text
                    visible_text = story_soup.getText()
                    neg = 0
                    pos = 0
                    sentences = tokenize.sent_tokenize(visible_text)
                    sid = SentimentIntensityAnalyzer()
                    # for each sentence in the story, get the sentiment/polarity
                    for sentence in sentences:
                        ss = sid.polarity_scores(sentence)
                        neg = neg + ss['neg']
                        pos = pos + ss['pos']
                    sentence_count = sentence_count + len(sentences)
                    score = score + pos - neg
                    # print out a story-by-story sentiment for logging
                    print('Sentiment: ', (pos - neg) / len(sentences))
                except socket.timeout as e:
                    print(type(e))
                except urllib.error.HTTPError:
                    print('failed on url: ', a.attrs['href'])
                story_count = story_count + 1
    # return the average net polarity/sentiment
    return (pos - neg) / sentence_count
def clean_df(user_df):
    user_df['simple_text'] = user_df['text'].apply(lambda x: remove_links(x))
    # A warning will appear if Twython is not installed
    sid = SentimentIntensityAnalyzer()
    user_df['sentiment'] = user_df['text'].apply(lambda x: sid.polarity_scores(x))
    # keep only fairly positive tweets
    pos_user_tweets = user_df[user_df['sentiment'].apply(lambda x: x['neg']) < .45]
    pos_user_tweets['stemmed_text'] = pos_user_tweets['simple_text'].apply(
        lambda tweet: remove_punctuation(tweet))
    pos_user_tweets['stripped_text'] = pos_user_tweets['stemmed_text'].apply(
        lambda tweet: remove_irrelevant_terms(tweet))
    return pos_user_tweets
blanks = []  # start with an empty list

for i, lb, rv in df.itertuples():  # iterate over the DataFrame
    if type(rv) == str:            # avoid NaN values
        if rv.isspace():           # test 'review' for whitespace
            blanks.append(i)       # add matching index numbers to the list

df.drop(blanks, inplace=True)
df['label'].value_counts()

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['comp_score'] = df['compound'].apply(lambda c: 'pos' if c >= 0 else 'neg')
df.head()

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy_score(df['label'], df['comp_score'])
print(classification_report(df['label'], df['comp_score']))
print(confusion_matrix(df['label'], df['comp_score']))
len(dnc_doclist)

# make sure all tweets are unique
rnc_array = np.array(rnc_doclist)
len(np.unique(rnc_array))
dnc_array = np.array(dnc_doclist)
len(np.unique(dnc_array))

print(rnc_doclist[:1])

dnc_sentiment_list = list()
rnc_sentiment_list = list()
sid = SentimentIntensityAnalyzer()

for tweet in rnc_doclist:
    ss = sid.polarity_scores(tweet)
    rnc_sentiment_list.append(ss['compound'])

len(rnc_sentiment_list)
rnc_sentiment_list[:10]

rnc_avg_sentiment = np.mean(np.array(rnc_sentiment_list))    # 0.1260588
rnc_stddev_sentiment = np.std(np.array(rnc_sentiment_list))  # 0.43161457012311344

plt.hist(rnc_sentiment_list, bins=15, color='red')
plt.title("#RNCConvention2020 Tweet Sentiment Scores")
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()

for tweet in dnc_doclist:
    ss = sid.polarity_scores(tweet)
    dnc_sentiment_list.append(ss['compound'])
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create SentimentIntensityAnalyzer instance
sid = SentimentIntensityAnalyzer()

# Let's try it on one of our phone calls
call_2_text = transcribe_audio('call_2.wav')

# Display text and sentiment polarity scores
print(call_2_text)
print(sid.polarity_scores(call_2_text))

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create SentimentIntensityAnalyzer instance
sid = SentimentIntensityAnalyzer()

# Transcribe customer channel of call 2
call_2_channel_2_text = transcribe_audio('call_2_channel_2.wav')

# Display text and sentiment polarity scores
print(call_2_channel_2_text)
print(sid.polarity_scores(call_2_channel_2_text))

# Import sent_tokenize from nltk
from nltk import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create SentimentIntensityAnalyzer instance
sid = SentimentIntensityAnalyzer()
for chapter in chapters[1:]:
    print("Chapter :", chapter[:2])
    # SentimentIntensityAnalyzer() is used for the sentiment analysis
    # implementation, using NLTK to find the positive/negative meaning
    # of the chapter.
    sid = SentimentIntensityAnalyzer()
    # If stopwords are found in the chapter, replace them with a blank so
    # they do not count toward the positive/negative values.
    for w in chapter.split(" "):
        if w in stopwords.words('english'):
            chapter = chapter.replace(w, " ", 1)
    # Calculate positive, negative and neutral values using polarity_scores().
    ss = sid.polarity_scores(chapter)
    pos = ss["pos"]
    neg = ss["neg"]
    neu = ss["neu"]
    print("Positive ", pos, "Negative ", neg, "Neutral", neu)
    # Append the printed values to out, which stores the list of positive,
    # negative and neutral values.
    out.append([pos, neg, neu, line_no])
    line_no += 1

N = len(out)
# Considering only positive and negative values.
p = [i[0] for i in out]
n = [i[1] for i in out]
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.downloader.download('vader_lexicon')

file = 'My_Posts.xlsx'
xl = pd.ExcelFile(file)
dfs = xl.parse(xl.sheet_names[0])
dfs = list(dfs['Your Posts'])
print(dfs)

sid = SentimentIntensityAnalyzer()

str1 = "AM"
str2 = "PM"
for data in dfs:
    a = data.find(str1)
    b = data.find(str2)
    # 'and', not '&': the bitwise operator has different precedence here
    # and gives the wrong result
    if a == -1 and b == -1:
        ss = sid.polarity_scores(data)
        print(data)
        for k in ss:
            print(k, ss[k])
class nltkWrapperClass:  ## initiate class
    def __init__(self, out_dir):
        nltk.download('stopwords')
        nltk.download('vader_lexicon')
        self.__out_dir = out_dir  ## private attribute
        self.sid = SentimentIntensityAnalyzer()  ## public attribute

    def __repr__(self):
        return repr(f'The output directory of this class is {self.__out_dir}')

    ## Public method to count word frequency
    def wordFreqCount(self, token_list):
        freq = nltk.FreqDist(token_list)
        return freq

    ## Public method to print out word frequency
    def wordFreqPrint(self, freq):
        for key, val in freq.items():
            print(f'key is: {key}, value is: {val}')

    ## Public method to plot out word frequency
    def wordFreqPlot(self, freq):
        freq.plot(20, cumulative=False)
        matplotlib.pyplot.show()

    ## Public method to clean up English stop words
    def stopwordClean(self, token_list):
        clean_token = []
        for token in token_list:
            if token not in stopwords.words('english'):
                clean_token.append(token)
        return clean_token

    ## Public method that scores a tweet list and returns a dictionary keyed
    ## by tweet id; each value is another dictionary containing the text of
    ## the tweet and its sentiment score. Also takes a verbose flag.
    def sentiScore(self, tweet_list, Verbose):
        Tweet_dict = {}
        for tweet in tweet_list:
            individual_tweet_dict = {}
            individual_tweet_dict['Text'] = tweet.text
            # Calculating sentiment scores
            individual_tweet_dict['Sentiment_score'] = self.sid.polarity_scores(tweet.text)
            Tweet_dict[tweet.id] = individual_tweet_dict
            if Verbose:
                print(f'Tweet is : {tweet.text}\nSentiment score is :'
                      f'{self.sid.polarity_scores(tweet.text)}\n\n')
        return Tweet_dict

    ## Public method to filter positive tweets
    def posTweets(self, Tweet_dict, Verbose):
        Pos_Tweet_dict = {}
        for tweet in Tweet_dict:
            if Tweet_dict[tweet]['Sentiment_score']['compound'] > 0.05:
                Pos_Tweet_dict[tweet] = Tweet_dict[tweet]
                if Verbose:
                    print(f"Positive tweet is : {Tweet_dict[tweet]['Text']}\n"
                          f"Sentiment score is :{Tweet_dict[tweet]['Sentiment_score']}\n\n")
        return Pos_Tweet_dict

    ## Public method to filter negative tweets
    def negTweets(self, Tweet_dict, Verbose):
        Neg_Tweet_dict = {}
        for tweet in Tweet_dict:
            if Tweet_dict[tweet]['Sentiment_score']['compound'] < -0.05:
                Neg_Tweet_dict[tweet] = Tweet_dict[tweet]
                if Verbose:
                    print(f"Negative tweet is : {Tweet_dict[tweet]['Text']}\n"
                          f"Sentiment score is :{Tweet_dict[tweet]['Sentiment_score']}\n\n")
        return Neg_Tweet_dict

    ## Public method to save positive and negative tweet results
    def CataResultSave(self, Pos_Tweet_dict, Neg_Tweet_dict):
        pos_result = open(self.__out_dir + "Positive_Tweets_Results.txt", 'w')
        for tweet in Pos_Tweet_dict:
            pos_result.write(f"Positive tweet is : {Pos_Tweet_dict[tweet]['Text']}\n"
                             f"Sentiment score is :{Pos_Tweet_dict[tweet]['Sentiment_score']}\n\n")
        pos_result.close()
        neg_result = open(self.__out_dir + "Negative_Tweets_Results.txt", 'w')
        for tweet in Neg_Tweet_dict:
            neg_result.write(f"Negative tweet is : {Neg_Tweet_dict[tweet]['Text']}\n"
                             f"Sentiment score is :{Neg_Tweet_dict[tweet]['Sentiment_score']}\n\n")
        neg_result.close()
def load_and_clean(self, data_path):
    # read data set
    df = pd.read_csv(data_path, sep='\t', header=None)
    df.columns = ['polarity', 'aspect_cat', 'target', 'offsets', 'sentence']
    # create label dictionaries to create a column with the category number
    aspect_lab = {
        "AMBIENCE#GENERAL": 0,
        "DRINKS#PRICES": 1,
        "DRINKS#QUALITY": 2,
        "DRINKS#STYLE_OPTIONS": 3,
        "FOOD#PRICES": 4,
        "FOOD#QUALITY": 5,
        "FOOD#STYLE_OPTIONS": 6,
        "LOCATION#GENERAL": 7,
        "RESTAURANT#GENERAL": 8,
        "RESTAURANT#MISCELLANEOUS": 9,
        "RESTAURANT#PRICES": 10,
        "SERVICE#GENERAL": 11
    }
    # do the same thing for the polarity label
    pol_lab = {"positive": 0, "neutral": 1, "negative": 2}
    # create aspect category number column
    df.insert(loc=2, column='cat_num', value=df['aspect_cat'].map(aspect_lab))
    # create polarity category number column
    df.insert(loc=1, column='pol_num', value=df['polarity'].map(pol_lab))
    # create start and end columns for indices of target words in the sentence
    df.insert(loc=6, column='start_end',
              value=[df.loc[i, "offsets"].split(':') for i in range(np.shape(df)[0])])
    df["start"] = [int(df.loc[i, "start_end"][0]) for i in range(np.shape(df)[0])]
    df["end"] = [int(df.loc[i, "start_end"][1]) for i in range(np.shape(df)[0])]
    # split aspect categories and create a column for each aspect category type
    df["aspect_cat"] = [df.loc[i, "aspect_cat"].split("#")
                        for i in range(np.shape(df)[0])]  # we split the 2 words
    df["cat1"] = [df.loc[i, "aspect_cat"][0] for i in range(np.shape(df)[0])]
    aspect_lab1 = {
        "AMBIENCE": 0,
        "DRINKS": 1,
        "FOOD": 2,
        "LOCATION": 3,
        "RESTAURANT": 4,
        "SERVICE": 5
    }
    # we fill 'cat1' with the numbers corresponding to each category
    df['cat1'] = df['cat1'].map(aspect_lab1)
    df["cat2"] = [df.loc[i, "aspect_cat"][1] for i in range(np.shape(df)[0])]
    aspect_lab2 = {
        "GENERAL": 0,
        "PRICES": 1,
        "QUALITY": 2,
        "STYLE_OPTIONS": 3,
        "MISCELLANEOUS": 4
    }
    # we fill 'cat2' with the numbers corresponding to each category
    df['cat2'] = df['cat2'].map(aspect_lab2)
    # chop up sentences
    cut = ["but"]
    df["sentence_cut"] = df["sentence"]
    for i in range(np.shape(df)[0]):  # for every row of the dataset
        for c in cut:
            # replace all the cut words by 'but'
            df.loc[i, "sentence_cut"] = df.loc[i, "sentence_cut"].replace(c, 'but')
    # split each sentence at the 'but'
    df["sentence_cut"] = [df.loc[i, "sentence_cut"].split("but")
                          for i in range(np.shape(df)[0])]
    list_sent = []  # create a list of all sentences
    for i in range(np.shape(df)[0]):  # for every row of the dataset
        list_sent.append([])
        # keep the clauses of the sentence that contain the target
        for j in range(len(df.loc[i, "sentence_cut"])):
            if df.loc[i, "target"] in df.loc[i, "sentence_cut"][j]:
                list_sent[i].append(df.loc[i, "sentence_cut"][j])
    for i in range(np.shape(df)[0]):
        list_sent[i] = " ".join(list_sent[i])
    # list_sent = sum(list_sent, [])
    df["list_sent"] = list_sent
    ### Tokenization: we will generate BOW and ngrams for the whole sentences,
    ### then for the dependencies, and finally for the words in the window of
    ### size 5 (distance from the target).
    ### We will also assign sentiment scores to each of these representations
    ### of the text.
    ### We first delete stop words but will test a model keeping them.
    ### In addition to word representations, we will generate POS variables
    # load SpaCy for English
    nlp = spacy.load('en_core_web_sm')
    # positive-word lexicon, used to count positive words
    pos_words = pd.read_excel('/Users/philippehayat/Desktop/pos_words.xlsx')
    pos_words = list(pos_words.iloc[:, 0])
    # negative-word lexicon, used to count negative words
    neg_words = pd.read_excel('/Users/philippehayat/Desktop/neg_words.xlsx')
    neg_words = list(neg_words.iloc[:, 0])
    # create a lemmatized list of words from the whole sentence
    lemma_list = []
    for i in range(np.shape(df)[0]):
        lemma_list.append([])
        for token in nlp(df.loc[i, 'list_sent']):
            if (token.is_punct == False) & (token.is_alpha == True):
                # each word of every sentence of 'list_sent' is lemmatized
                lemma_list[i].append(token.lemma_)
    # number of positive words among all words in the sentence
    pos_score2 = []
    for i in range(np.shape(df)[0]):
        pos_score2.append([])
        for j in lemma_list[i]:
            if j in pos_words:
                pos_score2[i].append(1)
    # we count the number of positive words in each sentence
    pos_score2 = [sum(pos_score2[i]) for i in range(len(pos_score2))]
    # number of negative words among all words in the sentence
    neg_score2 = []
    for i in range(np.shape(df)[0]):
        neg_score2.append([])
        for j in lemma_list[i]:
            if j in neg_words:
                neg_score2[i].append(1)
    # we count the number of negative words in each sentence
    neg_score2 = [sum(neg_score2[i]) for i in range(len(neg_score2))]
    # create vocab set and POS set for 3 methods of selecting words
    vocab_all = list(set(sum(lemma_list, [])))
    # pos_cat_all = list(set(sum(pos_list, [])))
    # assign to columns
    df['lemma_list'] = lemma_list
    # df['pos_list'] = pos_list
    df['sentence2'] = [" ".join(df['lemma_list'][i]) for i in range(np.shape(df)[0])]
    # df['pos_list2'] = [" ".join(df['pos_list'][i]) for i in range(np.shape(df)[0])]
    df['pos_score2'] = pos_score2
    df['neg_score2'] = neg_score2
    # BOW for all
    for i in vocab_all:
        df['_' + i] = df.sentence2.str.count(i)
    # for i in pos_cat_all:
    #     df['_pos_' + i] = df.pos_list2.str.count(i)
    # VADER polarity variable: vader gives a score between -1 and 1 to each
    # sentence (that is why we add 1)
    vader = SentimentIntensityAnalyzer()
    vader_all = [vader.polarity_scores(df['sentence2'][i])['compound'] + 1
                 for i in range(df.shape[0])]
    df['vader_all'] = vader_all
    return df
sqlContext = SQLContext(sc)
review = sqlContext.read.json("./dataset/review.json")
#review = review.withColumn("user_id", review.user_id.cast('string')).withColumn("business_id", review.business_id.cast('string')).withColumn("stars", review.stars.cast('float'))
#sentiment = coalesce((col("stars") >= 3.0).cast("int"), lit(1))
#review = review.withColumn("sentiment", sentiment)
#review = review.filter(review.user_id.isNotNull()).filter(review.business_id.isNotNull())

# register the DataFrame so the SQL query below can refer to it as 'review'
review.createOrReplaceTempView("review")
sentiments = sqlContext.sql(
    'SELECT *, case when stars <= 2.5 then 0 when stars >= 3.5 then 1 end as sentiment '
    'from review where stars <= 2.5 or stars >= 3.5'
)
df1 = sentiments[['review_id', 'text', 'sentiment']]
#df1.show(1)
#df1.count()
#print(df1)
df2 = df1.rdd
#df2.take(1)

sid = SentimentIntensityAnalyzer()
# Calculate the compound score for the review text and use it to derive the
# predicted sentiment, then compare it with the actual sentiment to
# calculate accuracy.
df3 = df2.map(lambda x: (x[0], sid.polarity_scores(x[1])['compound'], x[2])) \
         .map(lambda y: (y[0], 1, y[2]) if y[1] > 0 else (y[0], 0, y[2])) \
         .map(lambda z: 1 if (z[1] == z[2]) else 0)
#df3.count()
#df3.take(2)
total_count = df3.count()
accuracy = (df3.filter(lambda a: a == 1).count() * 1.0) / total_count
print(accuracy)
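# For clarity, the same predict-then-compare logic as the three chained RDD
# maps above, written for a single (review_id, text, sentiment) record;
# purely illustrative, not part of the Spark job.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def correct_prediction(record):
    review_id, text, actual = record
    sid = SentimentIntensityAnalyzer()
    predicted = 1 if sid.polarity_scores(text)['compound'] > 0 else 0
    return 1 if predicted == actual else 0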
# ------------------------
# Getting scores of sentiments
# ------------------------

# Scores of YouTube data
data = pd.read_csv('youtube_data2_for_.csv')
compound = []
pos = []
neg = []
neu = []
for i in data.reviews_youtube:
    i = i.replace("\'", '')
    i = i.strip()
    # print(get_textBlob_score(i))
    ss = sid.polarity_scores(i)
    compound.append(ss['compound'])
    pos.append(ss['pos'])
    neg.append(ss['neg'])
    neu.append(ss['neu'])
    # print(ss)
data['compound'] = compound
data['pos'] = pos
data['neg'] = neg
data['neu'] = neu
# get_vader_score('bad')
maxValuesObj = data[['pos', 'neg']].idxmax(axis=1)
data['sentiments'] = maxValuesObj
data.to_csv('/Users/anilvyas/Desktop/Audace Labs/Rated_data/youtube_data.csv')
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# contains titles and release years associated with each ID
movie_titles = pd.read_csv('movie_titles.txt', names=['col'], engine='python',
                           sep='\t')
movie_titles = pd.DataFrame(movie_titles.col.str.split(',', 2).tolist(),
                            columns=['ID', 'Year', 'Name'])

# assign a sentiment score (-1 to +1) for each movie title
sid = SentimentIntensityAnalyzer()
movie_titles["Sentiment"] = ""
i = 0
for index, row in movie_titles.iterrows():
    title = row["Name"]
    score = sid.polarity_scores(title)['compound']
    # write through .at: mutating the row yielded by iterrows() would not
    # modify the underlying DataFrame
    movie_titles.at[index, "Sentiment"] = score
    print(i)
    i = i + 1

# export to csv
movie_titles.to_csv('temp.csv')
accs_model2 = cross_val_score(logistic_regression, Xt, Y, scoring="accuracy", cv=5)
print("Accuracy of classifier two is " + str(round(np.mean(accs_model2), 3)))

# -----------------------------------------------------------------------------------------------------
# Model Three: use SentimentIntensityAnalyzer and a DecisionTree to analyze the review text
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# use the Sentiment Intensity Analyzer to get polarity scores for each review text
reviews = df["Review Text"]
sentiment = []
s = SentimentIntensityAnalyzer()
for x in reviews:
    score = s.polarity_scores(x)
    sentiment.append(score)
df_s = pd.DataFrame(sentiment)

# add the polarity scores to the original data frame
df["pos_score"] = df_s["pos"]
df["neu_score"] = df_s["neu"]
df["neg_score"] = df_s["neg"]
df["comp_score"] = df_s["compound"]

predictors = ["pos_score", "neu_score", "neg_score", "comp_score"]
target = "Rating_fresh"
cleaned_df = df.dropna()
# build the classifier
    sources='bbc-news,the-verge',
    domains='bbc.co.uk,techcrunch.com',
    # from_parameter=two_years_ago,
    # to=current_date,
    language='en',
    sort_by='relevancy',
    page=2)

articles = newsdata['articles']
articles_arr = []
for article in articles:
    # create a fresh dict per article; reusing one dict across iterations
    # would make every list entry point at the same object
    article_dic = {}
    headline = article['title']
    article_dic['headline'] = headline
    # format of score is {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
    article_dic['headline_score'] = sia.polarity_scores(headline)
    description = article['description']
    article_dic['description'] = description
    article_dic['description_score'] = sia.polarity_scores(description)
    # format of date is 2018-04-13T00:46:59Z (UTC format)
    article_dic['publishedAt'] = article['publishedAt']
    article_dic['source'] = article['source']['name']
    comp_symb = company_symb[company].replace(".", "_").replace("-", "_")
    article_dic['stock_price_change'] = calculate_stock_price_change(
        comp_symb, article_dic['publishedAt'])
    # append article dict to array
    articles_arr.append(article_dic)
def move_NASDAQ():
    test_start_date = datetime.datetime.now() - datetime.timedelta(days=5)
    test_end_date = datetime.datetime.now()  # - datetime.timedelta(days=1)
    df_stocks = pd.read_pickle('pickled_para_NDAQ.pkl')
    df_stocks['prices'] = df_stocks['close'].apply(np.int64)
    # selecting the prices and articles
    df_stocks = df_stocks[['prices', 'articles']]
    df_stocks['articles'] = df_stocks['articles'].map(lambda x: x.lstrip('.-'))
    df = df_stocks[['prices']].copy()
    # Adding new columns to the data frame
    df["compound"] = ''
    df["neg"] = ''
    df["neu"] = ''
    df["pos"] = ''
    sid = SentimentIntensityAnalyzer()
    for date, row in df_stocks.T.items():
        try:
            sentence = unicodedata.normalize('NFKD', df_stocks.loc[date, 'articles'])
            ss = sid.polarity_scores(sentence)
            # .at instead of the deprecated DataFrame.set_value
            df.at[date, 'compound'] = ss['compound']
            df.at[date, 'neg'] = ss['neg']
            df.at[date, 'neu'] = ss['neu']
            df.at[date, 'pos'] = ss['pos']
        except TypeError:
            print(df_stocks.loc[date, 'articles'])
            print(date)
    #test_start_date = '2018-03-09'
    #test_end_date = '2018-03-10'
    # .loc instead of the deprecated .ix
    test = df.loc[test_start_date:test_end_date]
    # Calculating the sentiment score
    sentiment_score_list = []
    for date, row in test.T.items():
        sentiment_score = np.asarray([df.loc[date, 'compound'],
                                      df.loc[date, 'neg'],
                                      df.loc[date, 'neu'],
                                      df.loc[date, 'pos']])
        # sentiment_score = np.asarray([df.loc[date, 'neg'], df.loc[date, 'pos']])
        sentiment_score_list.append(sentiment_score)
    numpy_df_test = np.asarray(sentiment_score_list)
    filename = 'finalized_model_NDAQ.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    # loaded_model.fit(numpy_df_test, test['prices'])
    result = loaded_model.predict(numpy_df_test)
    difference = result[1] - result[0]
    if difference > 0:
        sentence = "Stock Price will rise"
    else:
        sentence = "Stock Price will fall"
    param = {
        'q': ".IXIC",        # Stock symbol (ex: "AAPL")
        'i': "86400",        # Interval size in seconds ("86400" = 1 day intervals)
        'x': "INDEXNASDAQ",  # Stock exchange symbol on which stock is traded (ex: "NASD")
        'p': "1M"            # Period (Ex: "1Y" = 1 year)
    }
    df = get_price_data(param)
    df.to_csv('C:/Users/ansha/Anaconda3/FlaskApp/nasd1.csv')
    line = pd.read_csv("nasd1.csv", index_col=False)
    df = pd.DataFrame(data=line)
    dates = df['Unnamed: 0']
    o = df['Open']
    h = df['High']
    l = df['Low']
    c = df['Close']
    line_chart = pygal.Line(x_label_rotation=20, x_labels_major_every=3,
                            show_minor_x_labels=False, human_readable=True)
    line_chart.title = 'NASDAQ'
    line_chart.x_labels = map(str, dates)
    line_chart.add('Open', o)
    line_chart.add('High', h)
    line_chart.add('Low', l)
    line_chart.add('Close', c)
    graph_data = line_chart.render_data_uri()
    return render_template("an1.html", data=sentence, graph_data=graph_data,
                           data1=difference)
while True:
    start_time = time.time()
    docs = src_coll.find({"nlp_flag": 1}).limit(1000)
    print("start processing 1000 items")
    for doc in docs:
        print("processing... {}".format(doc['search_word']))
        db.usa_tweets_collection.update_one({'_id': doc['_id']},
                                            {'$set': {'nlp_flag': 2}})
        ss = sid.polarity_scores(doc['mention'])
        neu_score = ss['neu']
        neg_score = ss['neg']
        pos_score = ss['pos']
        update_q = {
            '$inc': {
                'neu': neu_score,
                'pos': pos_score,
                'neg': neg_score
            }
        }
        # best_score = max(pos_score, neg_score, neu_score)
        # if neu_score == best_score:
        #     update_q = {'$inc': {'neu': 1}}
def tweet_data():
    try:
        # Open/create a file to append data to.
        # If the file exists, read the existing data from the CSV file.
        file_name = "tourism_" + datetime.now().strftime("%d-%b-%Y") + "_data.csv"
        COLS = ['created_at', 'id', 'send_by', 'tweet_url', 'original_text',
                'trans', 'process', 'priority', 'type']
        if os.path.exists(file_name):
            df = pd.read_csv(file_name, header=0)
            pre_id = max(df["id"])
            print(pre_id)
        else:
            pre_id = 0
            df = pd.DataFrame(columns=COLS)
            print(pre_id)
        hndlr_lst = twitter_credential.handler_list
        for name in hndlr_lst:
            for tweet in tweepy.Cursor(
                    api.search,
                    q=name,
                    count=100,
                    # lang="en",
                    since=datetime.now().strftime("%Y-%m-%d"),
                    since_id=pre_id,
                    # max_id=pre_id
                    # until=datetime.now().strftime("%Y-%m-%d")
            ).items():
                # tweet URL
                tweet_url = ("https://twitter.com/" + tweet.user.screen_name
                             + "/status/" + str(tweet.id))
                # Google translator
                translator = Translator()
                trans = translator.translate(tweet.text).text
                # cleaning data
                process = p.clean(trans)
                process = re.sub(r':', '', process)
                process = re.sub(r'…', '', process)
                # VADER
                sen_analyser = SentimentIntensityAnalyzer()
                polarity_scores = sen_analyser.polarity_scores(process)
                print(tweet.id)
                compnd = polarity_scores['compound']
                if compnd >= 0.05:
                    polarity = polarity_scores['pos']
                    polarity_type = "positive"
                elif compnd <= -0.05:
                    polarity = polarity_scores['neg']
                    polarity_type = "negative"
                else:
                    polarity = polarity_scores['neu']
                    polarity_type = "neutral"
                new_entry = [tweet.created_at, tweet.id, tweet.user.screen_name,
                             tweet_url, tweet.text, trans, process, polarity,
                             polarity_type]
                # print(new_entry)
                single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
                df_final = df.append(single_tweet_df, ignore_index=True)
                df = pd.DataFrame(data=df_final, columns=COLS)
                df.to_csv(file_name)
        # print("Got all the tweets.")
    except tweepy.TweepError as e:
        print(str(e))
        print("Something went wrong.")
def features(self, source, reply):
    tokenizer = PreprocessTwitter()
    hc = []
    if "text" not in source.keys():
        s_text_raw = source["full_text"].lower()
    else:
        s_text_raw = source["text"].lower()
    if "text" not in reply.keys():
        r_text_raw = reply["full_text"].lower()
    else:
        r_text_raw = reply["text"].lower()
    s_text = tokenizer.tokenize(s_text_raw)
    r_text = tokenizer.tokenize(r_text_raw)
    s_id = source["id"]
    r_id = reply["id"]
    s_words = s_text.split(" ")
    r_words = r_text.split(" ")
    s_vector = self.tweet_vector(s_words)
    r_vector = self.tweet_vector(r_words)

    ### similarity between source and reply
    sourceSim = cosine(s_vector, r_vector)
    hc.append(sourceSim)

    ### has url?
    url = 0.
    if "<url>" in r_text:
        url = 1.
    hc.append(url)

    ### ends with question mark
    ewqm = 0.
    if r_text[-1] == "?":
        ewqm = 1.
    hc.append(ewqm)

    ### is reply?
    is_reply = 0.
    if s_id != r_id:
        is_reply = 1.
    hc.append(is_reply)

    ### supporting similarity
    sup_vector = self.tweet_vector(self.support_terms)
    supSim = cosine(r_vector, sup_vector)
    hc.append(supSim)

    ### contains wh-question
    wh = 0.
    if "who" in r_words or "where" in r_words or "why" in r_words or "what" in r_words:
        wh = 1.
    hc.append(wh)

    ### "don't you"
    dontyou = 0.
    if "don't you" in r_text:
        dontyou = 1.
    hc.append(dontyou)

    ### "aren't you"
    arentyou = 0.
    if "aren't you" in r_text:
        arentyou = 1.
    hc.append(arentyou)

    ### has replies
    has_replies = 0.
    if reply["in_reply_to_status_id"] != "null":
        has_replies = 1.
    hc.append(has_replies)

    ### sentiment features (VADER) for the reply and the source
    sentAnalyser = SentimentIntensityAnalyzer()
    scores = sentAnalyser.polarity_scores(r_text_raw)
    hc.append(scores["neg"])
    hc.append(scores["pos"])
    hc.append(scores["neu"])
    hc.append(scores["compound"])
    scores = sentAnalyser.polarity_scores(s_text_raw)
    hc.append(scores["neg"])
    hc.append(scores["pos"])
    hc.append(scores["neu"])
    hc.append(scores["compound"])

    ### TODO: check negation features -- Stanford
    ### has negation
    ### average negation

    ### has slang/curse word, Google bad word, acronyms; average word length
    hasVulgar = 0.
    hasGoogleBadWords = 0.
    hasAcro = 0.
    wordCount = 0.
    wordLen = 0.
    avWL = 0.
    for token in r_words:
        if token[0] != "<" and token[-1] != ">":
            wordLen = wordLen + float(len(token))
            wordCount = wordCount + 1.
            if token in self.acronyms:
                hasAcro = 1.
            if token in self.vulgarWords:
                hasVulgar = 1.
            if token in self.googleBadWords:
                hasGoogleBadWords = 1.
    if wordCount != 0.:
        avWL = wordLen / wordCount
    hc.append(avWL)
    hc.append(hasAcro)
    hc.append(hasVulgar)
    hc.append(hasGoogleBadWords)

    ### surprise score
    surprise_vector = self.tweet_vector(self.surpriseWords)
    surprise_score = cosine(r_vector, surprise_vector)
    hc.append(surprise_score)

    ### doubt score
    doubt_vector = self.tweet_vector(self.doubtWords)
    doubt_score = cosine(r_vector, doubt_vector)
    hc.append(doubt_score)

    ### no-doubt score
    no_doubt_vector = self.tweet_vector(self.noDoubtWords)
    no_doubt_score = cosine(r_vector, no_doubt_vector)
    hc.append(no_doubt_score)

    ### has / number of question marks
    numberQM = r_text_raw.count("?")
    hasQM = 0.
    if numberQM > 0:
        hasQM = 1.
    hc.append(hasQM)
    hc.append(float(numberQM))

    ### has / number of exclamation marks
    numberEM = r_text_raw.count("!")
    hasEM = 0.
    if numberEM > 0:
        hasEM = 1.
    hc.append(hasEM)
    hc.append(float(numberEM))

    ### has / number of "..."
    numberDDD = r_text_raw.count("...")
    hasDDD = 0.
    if numberDDD > 0:
        hasDDD = 1.
    hc.append(hasDDD)
    hc.append(float(numberDDD))

    ### originality (tweet count)
    tweet_count = float(reply["user"]["statuses_count"])
    hc.append(tweet_count)

    ### is verified
    isVerified = 0.
    if reply["user"]["verified"] != "false":
        isVerified = 1.
    hc.append(isVerified)

    ### number of followers
    followers = float(reply["user"]["friends_count"])
    hc.append(followers)

    ### role
    followees = float(reply["user"]["followers_count"])
    role = 0.
    if followees != 0.:
        role = followers / followees
    hc.append(role)

    # TODO: understand engagement features
    # import time
    # res = time.strptime(reply["user"]["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
    # print(res)
    ### engagement
    ### engagement favorite

    ### public list membership count
    public_list = float(reply["user"]["listed_count"])
    hc.append(public_list)

    ### has geo enabled
    geo = 0.
    if reply["user"]["geo_enabled"] != "false":
        geo = 1.
    hc.append(geo)

    ### has description, and length of description
    description = reply["user"]["description"]
    hasDesc = 0.
    len_desc = 0.
    if (description is not None) and (description.strip() != ""):
        hasDesc = 1.
        len_desc = float(len(description.split(" ")))
    hc.append(hasDesc)
    hc.append(len_desc)

    ### patterns 1-10
    hc.append(Heuristics.pattern1(r_text_raw))
    hc.append(Heuristics.pattern2(r_text_raw))
    hc.append(Heuristics.pattern3(r_text_raw))
    hc.append(Heuristics.pattern4(r_text_raw))
    hc.append(Heuristics.pattern5(r_text_raw))
    hc.append(Heuristics.pattern6(r_text_raw))
    hc.append(Heuristics.pattern7(r_text_raw))
    hc.append(Heuristics.pattern8(r_text_raw))
    hc.append(Heuristics.pattern9(r_text_raw))
    hc.append(Heuristics.pattern10(r_text_raw))

    ### emoticons
    r_text_noURL = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>",
                          r_text_raw, flags=re.MULTILINE | re.DOTALL)
    for emo in self.emoticons.keys():
        if emo in r_text_noURL:
            self.emoticons_cat[self.emoticons[emo]] = 1.
    for emo_cat in self.emoticons_cat:
        hc.append(emo_cat)

    hc_vector = np.array(hc)
    aux_vector = np.append(s_vector, r_vector)
    final_vector = np.append(aux_vector, hc_vector)
    final_vector_scaler = self.scaler.transform(final_vector.reshape(1, -1))
    return final_vector_scaler
# SIA is assumed to be the standard alias import for the VADER analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA


def sentiment_analysis(message):
    sia = SIA()
    p_score = sia.polarity_scores(message)
    p = p_score['compound']
    speedometer(p)
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def get_sentiments(text):
    analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(text)
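# Example call; the full VADER dict is returned, so callers can pick the
# field they need.
print(get_sentiments("The service was quick and friendly."))
# -> {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}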
    lambda x: " ".join(word for word in x.split() if word not in n_req))

## subjectivity & polarity of each review row ##
from textblob import TextBlob

dataset['polarity'] = dataset['review'].apply(
    lambda x: TextBlob(x).sentiment.polarity)
dataset['subjectivity'] = dataset['review'].apply(
    lambda x: TextBlob(x).sentiment.subjectivity)

## Finding sentiment with the VADER sentiment analyzer ##
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
sid.polarity_scores(dataset.iloc[4]['review'])
dataset['vad_scores'] = dataset['review'].apply(
    lambda review: sid.polarity_scores(review))
dataset['vad_compound'] = dataset['vad_scores'].apply(lambda d: d['compound'])

### Finding correlation in the data ###
corrmat = dataset.corr()
print(corrmat)

## Finding the most commonly occurring words in the corpus ##
review_str = " ".join(dataset.review)
text = review_str.split()
from collections import Counter
counter = Counter(text)
def main():
    # Initialize objects
    gen = document_generator()
    nlp = spacy.load('en', disable=['tagger', 'parser', 'textcat'])
    dic = pyphen.Pyphen(lang='en')
    tok = RegexpTokenizer(r'\w+')
    sid = SentimentIntensityAnalyzer()
    genre_result = []

    # Tagsets
    POS_tags = {"''", '(', ')', ',', '--', '.', ':', 'CC', 'CD', 'DT', 'EX',
                'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP',
                'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
                'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
                'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``', '$', '#'}
    entity_tags = {'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC',
                   'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE',
                   'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL',
                   'CARDINAL', 'FAC'}

    # Generate features
    # TODO When the generator yields 0 save the progress
    for file in gen:
        # Save the results once all the documents from a genre are generated
        if 'genre' == file[0]:
            genre_result = np.array(genre_result)
            genre_result = pd.DataFrame(data=genre_result)
            genre_result.to_csv('/datastore/10814418/preprocessed_'
                                + str(file[1]) + '.csv', index=False)
            genre_result = []
            continue

        # Check that the file is non-empty and name variables
        text = file[0]
        info = np.array([file[2], file[1]])
        index = file[3]
        if len(text) < min_len:
            continue

        # POS-tags
        empty_counter = {key: 0 for key in POS_tags}
        tags = nltk.pos_tag(nltk.word_tokenize(text))
        tags_counter = Counter(tag for w, tag in tags)
        final_dict = {**empty_counter, **dict(tags_counter)}
        sorted_items = sorted(final_dict.items())
        keys = [item[0] for item in sorted_items]
        tag_count = np.array([item[1] for item in sorted_items])

        # Entities
        empty_counter = {key: 0 for key in entity_tags}
        doc = nlp(text)
        entity_counter = Counter(ent.label_ for ent in doc.ents)
        final_dict = {**empty_counter, **dict(entity_counter)}
        sorted_items = sorted(final_dict.items())
        ent_count = np.array([item[1] for item in sorted_items])

        # Sentence, word and syllable count
        n_sent = len(nltk.sent_tokenize(text))
        words = tok.tokenize(text)
        n_word = len(words)
        syllables = [dic.inserted(word) for word in words]
        syllable_list = [len(re.findall('-', word)) + 1 for word in syllables]
        n_syl = sum(syllable_list)
        syntax_count = np.array([n_sent, n_word, n_syl])

        # Readability scores (Flesch reading ease and Flesch-Kincaid grade)
        flesch = 206.835 - 1.015 * (n_word / n_sent) - 84.6 * (n_syl / n_word)
        flesch_kincaid = 0.39 * (n_word / n_sent) + 11.8 * (n_syl / n_word) - 15.59
        readability_score = np.array([flesch, flesch_kincaid])

        # Sentiment
        score_dic = sid.polarity_scores(text)
        sentiment = np.array([score_dic['neg'], score_dic['neu'],
                              score_dic['pos'], score_dic['compound']])

        # Concatenate all features
        instance_result = np.concatenate([tag_count, ent_count, syntax_count,
                                          readability_score, sentiment, info])
        genre_result.append(instance_result)
data['label'].value_counts()

data.dropna(inplace=True)

empty = []
for i, lb, rv in data.itertuples():
    if type(rv) == str:
        if rv.isspace():
            empty.append(i)

data.drop(empty, inplace=True)
data['label'].value_counts()

sid.polarity_scores(data.loc[0]['review'])

data['scores'] = data['review'].apply(
    lambda review: sid.polarity_scores(review))
data.head()

data['compound'] = data['scores'].apply(
    lambda score_dict: score_dict['compound'])
data.head()

data['comp_score'] = data['compound'].apply(lambda c: 'pos' if c >= 0 else 'neg')
data.head()

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
def getSentiment():
    with open("data/sample_twitter_data_2020-06-20_to_2020-08-06.pk", 'rb') as f:
        conora19_content = pickle.load(f)
    # convert the list entries to strings
    conora19_content = [str(doc) for doc in conora19_content]
    print(type(conora19_content))
    # remove "https://" URLs
    pattern2 = re.compile(r"\b(https?:\/\/)?([\w.]+){1,2}(\.[\w]{2,4}){1,2}(.*)")
    clean_conora19_content = [pattern2.sub("", doc) for doc in conora19_content]
    # remove "\n"
    pattern1 = re.compile("\n")
    clean_conora19_content = [pattern1.sub("", doc) for doc in clean_conora19_content]
    # remove special characters, e.g. ?, *, !, ...
    pattern4 = re.compile("[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\\=\(\'\"]")
    clean_conora19_content = [pattern4.sub("", doc) for doc in clean_conora19_content]
    # English stop words
    stops = set(stopwords.words("english"))
    # also add custom words to filter out (words that are common here but unwanted)
    stops.add('The')
    stops.add('said')
    stops.add('people')
    stops.add('also')
    stops.add('would')
    stops.add('\n')
    stops.add('ANALYSIS/OPINION:')
    clean_conora19_content = [word for word in clean_conora19_content
                              if word not in stops and len(word) > 1]
    # convert upper case to lower case
    clean_conora19 = []
    for text in clean_conora19_content:
        clean_conora19.append(text.lower())
    print(clean_conora19[1])
    # sentiment classification
    nltk.download('vader_lexicon')  # fetch the sentiment lexicon
    sid = SentimentIntensityAnalyzer()  # create the NLTK sentiment analyzer
    # apply it to a single article's contents first
    sentiment = sid.polarity_scores(clean_conora19[1])
    print("neg sum:{}, neu sum:{}, pos sum:{}".format(
        sentiment['neg'], sentiment['neu'], sentiment['pos']))
    # apply sentiment analysis to all articles and store the results
    total_sentiment = []
    for content in clean_conora19:
        total_sentiment.append(sid.polarity_scores(content))
    # sum the sentiment over all articles
    total_neg = 0.0
    total_neu = 0.0
    total_pos = 0.0
    print("len:", len(total_sentiment))
    for sentiment in total_sentiment:
        total_neg = total_neg + float(sentiment['neg'])
        total_neu = total_neu + float(sentiment['neu'])
        total_pos = total_pos + float(sentiment['pos'])
    print("neg sum:{}, neu sum:{}, pos sum:{}".format(
        total_neg / len(total_sentiment),
        total_neu / len(total_sentiment),
        total_pos / len(total_sentiment)))
    return [len(total_sentiment),
            total_neg / len(total_sentiment),
            total_neu / len(total_sentiment),
            total_pos / len(total_sentiment)]

# if __name__ == '__main__':
#     sentiment = getSentiment()
time_main = []
time_sub = []
for order in range(count, len(test)):
    # add the time so that when there is no comment, the program won't break
    time_main.append(covert_to_unixtime(test.iloc[order].creat_time))
    time_sub.append(covert_to_unixtime(test.iloc[order].creat_time))
    # deal with the main comment
    if (test.iloc[order].title == disgussion.title
            and test.iloc[order].creat_time == disgussion.creat_time
            and test.iloc[order]['main_comment'] != 0):
        # this print is used for debugging, to understand where the bug is;
        # the same goes for the prints below
        print('round one', order)
        # get the score of the comment
        sentiment_mm.append(
            sid.polarity_scores(test.iloc[order]['main_comment'])['compound'])
        # add the words to the mimic box
        row_mimic.append(test.iloc[order]['sub_comment'])
        main = main + ' ' + test.iloc[order].main_comment
        if test.iloc[order]['main_creat'] != 0:
            time_main.append(covert_to_unixtime(test.iloc[order].main_creat))
        else:
            time_main.append(covert_to_unixtime(test.iloc[order].creat_time))
    # deal with the sub comment
    if (test.iloc[order].title == disgussion.title
            and test.iloc[order].creat_time == disgussion.creat_time
            and test.iloc[order]['sub_comment'] != 0):
        print('round two', order)
        # the score of the sub comment
def get_sentiments(text):
    nltk.download('vader_lexicon')
    analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(text)
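# The helper above re-downloads the lexicon and rebuilds the analyzer on every call,
# both of which are comparatively slow. A minimal sketch with the one-time setup
# hoisted to import time; this is a suggested refactor, not the original code.
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon', quiet=True)  # fetch the lexicon once
_ANALYZER = SentimentIntensityAnalyzer()    # reuse a single analyzer instance

def get_sentiments_cached(text):
    return _ANALYZER.polarity_scores(text)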
    print(clean)
    clean_review.append(clean)

with open('Review.txt', "w+", encoding='utf-8') as filehandle:
    filehandle.writelines("%s\n" % review for review in clean_review)

import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
df = pd.DataFrame(clean_review, columns=['Reviews'])
reviews = df
reviews['neg'] = reviews['Reviews'].apply(
    lambda x: sia.polarity_scores(x)['neg'])
reviews['neu'] = reviews['Reviews'].apply(
    lambda x: sia.polarity_scores(x)['neu'])
reviews['pos'] = reviews['Reviews'].apply(
    lambda x: sia.polarity_scores(x)['pos'])
reviews['compound'] = reviews['Reviews'].apply(
    lambda x: sia.polarity_scores(x)['compound'])

star5 = [
    j for i, j in enumerate(reviews['Reviews'])
    if 1 >= reviews['compound'][i] > 0.6
]
star4 = [
    j for i, j in enumerate(reviews['Reviews'])
    if 0.6 >= reviews['compound'][i] > 0.2
]
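# Each .apply() above re-runs VADER on every review, so the full scoring pass runs
# four times. A minimal single-pass sketch, assuming the same `reviews` frame and
# `sia` analyzer from above; this is a suggested alternative, not the original code.
scores = reviews['Reviews'].apply(sia.polarity_scores)          # one VADER call per review
score_df = pd.DataFrame(scores.tolist(), index=reviews.index)   # expand dicts into columns
reviews[['neg', 'neu', 'pos', 'compound']] = score_df[['neg', 'neu', 'pos', 'compound']]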
def store(tags):
    try:
        tso = TwitterSearchOrder()
        tso.set_keywords(tags, or_operator=True)
        sid = SentimentIntensityAnalyzer()
        # Provide the wrapper with the credentials needed for making the calls and retrieving the data
        ts = TwitterSearch(consumer_key=key,
                           consumer_secret=secret,
                           access_token=token_key,
                           access_token_secret=token_secret)
        # Cache tweet ids instead of calling Twitter search again, to make it more time efficient
        tweet_id_array = []
        count = 0
        for tweet in ts.search_tweets_iterable(tso):
            count += 1
            if tweet['user']['location'][0:8].lower() == 'edmonton':
                tweet_id_array.append(tweet['id'])
                ss = sid.polarity_scores(tweet['text'])
                u = mod.User(tweet['user']['id'], tweet['user']['screen_name'],
                             tweet['user']['name'], tweet['user']['followers_count'],
                             tweet['user']['favourites_count'], tweet['user']['friends_count'],
                             tweet['user']['created_at'], timezone.now(),
                             tweet['user']['statuses_count'])
                mod.User.insert_user(tweet['user']['id'], tweet['user']['screen_name'],
                                     tweet['user']['name'], tweet['user']['followers_count'],
                                     tweet['user']['favourites_count'], tweet['user']['friends_count'],
                                     tweet['user']['created_at'], tweet['user']['statuses_count'])
                mod.Tweet.insert_tweet(
                    tweet['id'], tweet['text'], tweet['created_at'],
                    tweet['favorite_count'], tweet['retweet_count'],
                    tweet['in_reply_to_status_id'], tweet['lang'], u,
                    ss['compound'], ss['pos'], ss['neg'], ss['neu'],
                    get_sentiment_string(ss['compound']),
                    'retweeted_status' in tweet)
                hashtags_list = tweet['entities']['hashtags']
                # Add the hashtags; duplicates are not added
                for hashtag in hashtags_list:
                    mod.Hashtag.insert_hashtag(tweet['id'], hashtag['text'].lower())
        # Count and save the reply count after all the tweet data is saved and updated in the database
        for tweetid in tweet_id_array:
            rp_count = Tweet.objects.filter(tid_parent=tweetid).count()
            mod.Tweet.insert_replycount(tweetid, rp_count)
    except (TwitterSearchException, ConnectionError) as e:
        # handle any errors the search may raise
        print('Exception:', e)
    print(count, 'tweets received')
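# get_sentiment_string() is called above but not defined in this snippet. A plausible
# sketch is below, using the conventional VADER compound cut-offs of +/-0.05; the
# exact thresholds and labels used by the original code are an assumption.
def get_sentiment_string(compound):
    """Hypothetical helper: map a compound score to a sentiment label."""
    if compound >= 0.05:
        return 'positive'
    elif compound <= -0.05:
        return 'negative'
    return 'neutral'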
title = row.a.text
date_data = row.td.text.split(' ')
if len(date_data) == 1:
    time = date_data[0]
else:
    date = date_data[0]
    time = date_data[1]
parsed_data.append([ticker, date, time, title])

# print(parsed_data)
df = pd.DataFrame(parsed_data, columns=['ticker', 'date', 'time', 'title'])
vader = SentimentIntensityAnalyzer()
f = lambda title: vader.polarity_scores(title)['compound']
df['compound'] = df['title'].apply(f)
df['date'] = pd.to_datetime(df.date).dt.date

plt.figure(figsize=(10, 8))
mean_df = df.groupby(['ticker', 'date']).mean()
mean_df = mean_df.unstack()
mean_df = mean_df.xs('compound', axis="columns").transpose()
mean_df.plot(kind='bar')
plt.show()
# print(mean_df)
# print(vader.polarity_scores("I don't think Apple is a good company. I think they will do poorly this quarter."))
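# The groupby/unstack/xs/transpose chain above can be expressed in one step. A
# minimal equivalent sketch, assuming the same df with 'ticker', 'date' and
# 'compound' columns:
mean_df = df.pivot_table(values='compound', index='date', columns='ticker', aggfunc='mean')
mean_df.plot(kind='bar')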
def main():
    demo = 'd286f23fd3d3c4fbd6cc5768c2a6388d'
    # data = read_csv('/Users/alenshaju/Downloads/SP500_tickers_100.csv')
    # companies = data['Ticker'].to_list()[:10]
    consumer_companies = [
        'TJX', 'NKE', 'TGT', 'HD', 'LOW', 'PG', 'WMT', 'COST', 'MDLZ', 'EL',
        'KO', 'PEP', 'PM', 'MO', 'BKNG', 'MCD', 'SBUX'
    ]
    energy_companies = ['NEE', 'XOM', 'CVX']
    fig_companies = [
        'BLK', 'AXP', 'V', 'MA', 'PYPL', 'FIS', 'JPM', 'BAC', 'WFC', 'USB',
        'SPGI', 'MS', 'SCHW', 'GS', 'BRK.B', 'AMT'
    ]  # C
    healthcare_companies = [
        'ABBV', 'AMGN', 'GILD', 'ABT', 'DHR', 'MDT', 'SYK', 'ISRG', 'CVS',
        'CI', 'TMO', 'UNH', 'ANTM', 'JNJ', 'PFE', 'LLY', 'BMY'
    ]
    industrials_companies = [
        'BA', 'RTX', 'LMT', 'DE', 'UPS', 'TSLA', 'GM', 'CAT', 'HON', 'GE',
        'MMM', 'LIN', 'UNP'
    ]
    tech_companies = [
        'ADBE', 'CRM', 'INTU', 'GOOG', 'GOOG.L', 'FB', 'AMZN', 'ACN', 'IBM',
        'AMAT', 'LRCX', 'NVDA', 'INTC', 'AVGO', 'TXN', 'QCOM', 'MU', 'AMD',
        'MSFT', 'ORCL', 'NOW', 'AAPL'
    ]
    mt_companies = ['CMCS.A', 'CHTR', 'CSCO', 'VZ', 'T', 'DIS', 'NFLX']
    companies = ['UAL']

    past_call_dict = {}
    yec = YahooEarningsCalendar()
    for company in companies:
        print("Ticker:", company)
        past_calls_df = get_past_earnings_call(yec, company)
        past_call_dict[company] = past_calls_df

    df_returns_scores = pd.DataFrame(columns=['Return', 'Score'])
    sia = SentimentIntensityAnalyzer()
    # Load the Loughran-McDonald finance lexicon and merge it into VADER's lexicon
    d = {}
    with open(
            "/Users/alenshaju/Downloads/LoughranMcDonald_MasterDictionary_2018.txt"
    ) as f:
        for line in f:
            (key, val) = line.split()
            d[key] = float(val)
    sia.lexicon.update(d)

    excel_df = pd.DataFrame(
        columns=['Ticker', 'Quarter', 'Sentiment Score', 'Returns'])
    for company in companies:
        print("For company: ", company)
        for i, row in past_call_dict[company].iterrows():
            date = datetime.datetime.strptime(row['startdatetime'],
                                              '%Y-%m-%dT%H:%M:%S.%fZ')
            quarter = pd.Timestamp(date).quarter
            year = date.year
            if year <= datetime.datetime.now().year:
                # skip quarters that have not finished reporting yet
                if year == datetime.datetime.now().year:
                    if quarter >= pd.Timestamp(datetime.datetime.now()).quarter:
                        continue
                transcript = requests.get(
                    f'https://financialmodelingprep.com/api/v3/earning_call_transcript/{company}?quarter={quarter}&year={year}&apikey={demo}'
                ).json()
                if len(transcript) == 0:
                    continue
                transcript = transcript[0]['content'].split('\n')
                # shift to the previous business day if the call date is not one
                if not bool(len(pd.bdate_range(date, date))):
                    date = date - BDay(1)
                if (date + BDay(1)) in get_trading_close_holidays(year):
                    end_date = date + BDay(1)
                else:
                    end_date = date
                stock = yf.download(company, start=date,
                                    end=end_date + BDay(1) + datetime.timedelta(1),
                                    progress=False)
                price_change_rate = (stock['Adj Close'][1] / stock['Adj Close'][0]) - 1
                price_change_percent = price_change_rate * 100
                sentiment_score = (sia.polarity_scores(transcript[0])['pos']
                                   - sia.polarity_scores(transcript[0])['neg'])
                print(transcript)
                print('score: ', sia.polarity_scores(transcript[0]))
                print("price change: ", price_change_rate)
                df_returns_scores = df_returns_scores.append(
                    {'Return': price_change_rate, 'Score': sentiment_score},
                    ignore_index=True)
                excel_df = excel_df.append(
                    {'Ticker': company, "Date": date, 'Quarter': quarter,
                     'Sentiment Score': sentiment_score,
                     'Returns': price_change_rate},
                    ignore_index=True)
                if i > 8:  # keep roughly the last 10 quarterly calls
                    break
    excel_df.to_excel("/Users/alenshaju/Downloads/mt_excel_file_v1.xlsx")

    x = df_returns_scores.Score.values.reshape(-1, 1)
    y = df_returns_scores.Return.values.reshape(-1, 1)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.4,
                                                        random_state=42)
    support_vector_reg_model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
    support_vector_reg_model.fit(x_train, y_train)
    y_pred = support_vector_reg_model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2_data = r2_score(y_test, y_pred)
    print("Root mean square error: ", rmse)
    print("R^2 score: ", r2_data)

    train_test_label = ['Training Data', 'Testing Data']
    model_color = ['m', 'c', 'g']
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 10), sharey=True)
    # ----- Training data panel -----
    axes[0].plot(x_test, y_pred, color=model_color[0], lw=2,
                 label='{} model'.format(train_test_label[0]))
    axes[0].scatter(x_train[np.setdiff1d(np.arange(len(x_train)),
                                         support_vector_reg_model.support_)],
                    y_train[np.setdiff1d(np.arange(len(x_train)),
                                         support_vector_reg_model.support_)],
                    facecolor="none", edgecolor=model_color[0], s=50,
                    label='Training data')
    axes[0].legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=1,
                   fancybox=True, shadow=True)
    # ----- Testing data panel -----
    axes[1].plot(x_test, y_pred, color=model_color[1], lw=2,
                 label='{} model'.format(train_test_label[1]))
    axes[1].scatter(x_test[np.setdiff1d(np.arange(len(x_test)),
                                        support_vector_reg_model.support_)],
                    y_pred[np.setdiff1d(np.arange(len(x_test)),
                                        support_vector_reg_model.support_)],
                    facecolor="none", edgecolor=model_color[1], s=50,
                    label='Testing data')
    axes[1].legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=1,
                   fancybox=True, shadow=True)
    fig.text(0.5, 0.04, 'data', ha='center', va='center')
    fig.text(0.06, 0.5, 'target', ha='center', va='center', rotation='vertical')
    fig.suptitle("Support Vector Regression", fontsize=14)
    plt.show()
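# RBF-kernel SVR is sensitive to feature scale, and the sentiment scores and returns
# above live on small, different ranges. A minimal sketch of the same fit with
# standardization folded into a pipeline, assuming the x_train/y_train from the
# script above; this is a suggested variant, not the original author's setup.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

model = make_pipeline(StandardScaler(),
                      SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1))
model.fit(x_train, y_train.ravel())  # ravel() avoids the column-vector warning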
def initial():
    spotify = spotipy.Spotify(auth=token)
    spotify.current_user_recently_played = types.MethodType(
        current_user_recently_played, spotify)
    # fetch the 10 most recently played tracks (creates the .json file)
    recentsongs = spotify.current_user_recently_played(limit=10)
    track_details = []
    # collect the name and artist of each track
    for i in recentsongs['items']:
        temp = {'name': '', 'artist': ''}
        temp['name'] = i['track']['name']
        temp['artist'] = i['track']['artists'][0]['name']
        track_details.append(temp)
    lyrics = {}
    text = []
    compoundscore = []
    sid = SentimentIntensityAnalyzer()
    # de-duplicate the track list
    track_details = {frozenset(item.items()): item for item in track_details}.values()
    print(track_details)
    for i in track_details:
        song = genius.search_song(i['name'], i['artist'])
        songlyrics = song.lyrics.replace("\n", " ").replace("\\'", "\'")
        lyrics[i['name']] = songlyrics
        songlyrics = songlyrics.replace('(', '').replace(')', '')
        songlyrics = re.sub("[\\[].*?[\\]]", "", songlyrics)
        text.append(songlyrics)
        scores = sid.polarity_scores(songlyrics)
        compoundscore.append(scores['compound'])
    text = ' '.join(map(str, text))
    print(text.encode("utf-8"))
    stpwords = set(stopwords.words('english'))
    stpwords.update(["br", "href", "la", "yeah", "yuh", "wan", "i'm"])
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    words_no_punc = []
    for w in words:
        if w.isalpha():
            words_no_punc.append(w.lower())
    ps = PorterStemmer()
    clean_words = []
    for w in words_no_punc:
        if w not in stpwords:
            clean_words.append(ps.stem(w))
    fdist = FreqDist(clean_words)
    print(fdist.most_common(10))
    fdist.plot(10)
    words_string = ' '.join(map(str, clean_words))
    wordcloud = WordCloud(stopwords=stpwords).generate(words_string)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()  # word cloud
    print(compoundscore)
    plt.plot(compoundscore)
    plt.show()  # compound score per track
    pos_count = 0
    neg_count = 0
    for num in compoundscore:
        if num >= 0:
            pos_count += 1
        else:
            neg_count += 1
    plt.pie([pos_count, neg_count],
            labels=["Positive Songs", "Negative Songs"])
    plt.show()  # positive/negative pie chart
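# The positive/negative split above treats 0 as the cut-off. VADER's documentation
# conventionally treats compound scores between -0.05 and 0.05 as neutral; a sketch
# of a three-way split under that convention, assuming the compoundscore list from
# above (an alternative, not the original logic):
pos_count = sum(1 for c in compoundscore if c >= 0.05)
neg_count = sum(1 for c in compoundscore if c <= -0.05)
neu_count = len(compoundscore) - pos_count - neg_count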
class FeatureExtractor:
    def __init__(self):
        nltk.download('vader_lexicon')
        nltk.download('punkt')
        self.tool = spacy.load("en_core_web_lg")
        self.sent_analyzer = SentimentIntensityAnalyzer()
        self.stemmer = PorterStemmer()
        self.badwords = []
        self.negative_smileys = []
        self.positive_smileys = []
        with open(settings.get_badwords(), 'r') as f:
            for line in f:
                self.badwords.append(line.strip().lower())
        with open(settings.get_negative_smileys(), 'r') as f:
            for line in f:
                self.negative_smileys.append(line.strip())
        with open(settings.get_positive_smileys(), 'r') as f:
            for line in f:
                self.positive_smileys.append(line.strip())
        self.negationwords = [
            'not', 'no', 'nobody', 'nothing', 'none', 'never', 'neither',
            'nor', 'nowhere', 'hardly', 'scarcely', 'barely', 'don', 'isn',
            'wasn', 'shouldn', 'wouldn', 'couldn', 'doesn'
        ]
        self.whwords = [
            'what', 'when', 'where', 'which', 'who', 'whom', 'whose', 'why',
            'how'
        ]
        self.entity_annotations = [
            '__PERSON__', '__NORP__', '__FAC__', '__ORG__', '__GPE__',
            '__LOC__', '__PRODUCT__', '__EVENT__', '__WORK_OF_ART__',
            '__LAW__', '__LANGUAGE__', '__DATE__', '__TIME__', '__PERCENT__',
            '__MONEY__', '__QUANTITY__', '__ORDINAL__', '__CARDINAL__'
        ]

    def extract_lemma(self):
        pass

    def extract_ne(self, text: str) -> str:
        '''
        Extract the named entities in the text and replace each one with its NE label.
        :param text: the input text
        :return: the text with entities replaced by labels such as __PERSON__
        '''
        doc = self.tool(text)
        for ent in doc.ents:
            text = text.replace(ent.text, ' __' + ent.label_ + '__ ')
        return text

    # TODO: this method is shadowed by the identical one at the bottom of the class. Is that intended?
    def sentence_embeddings(self, text):
        text = self.preprocess(text)
        vector = self.tool(text).vector
        return vector

    def emoji(self, text: str) -> str:
        pass

    # TODO
    def filter_hashtag(self, text: str) -> list:
        '''
        Replace hashtags with __hashtag__ and count them.
        :param text: the input text
        :return: [filtered text, hashtag count]
        '''
        text = re.sub(r"^#.*", '__hashtag__', text)
        return [text, text.count('__hashtag__')]

    # TODO
    def extract_mention(self, text: str) -> str:
        '''
        Replace mentions with __usermention__.
        :param text: the input text
        :return: the filtered text
        '''
        return re.sub(r"^@.*", ' __usermention__ ', text)

    def post_role(self, post):
        '''
        Return 1 if the post is a source post, 0 if it is a reply.
        '''
        return post.has_source_depth == 0

    # TODO: make callback functions an input
    def preprocess(self, text: str) -> str:
        space_pattern = '\s+'
        text = re.sub(space_pattern, ' ', text)
        text = self.filter_hashtag(text)[0]
        text = self.filter_url(text)[0]
        text = self.filter_mention(text)[0]
        text = re.sub('\'', ' ', text)
        text = re.sub('____', '__ __', text)
        text = self.extract_ne(text)
        return str(text)

    def tokenize(self, text):
        """Removes punctuation & excess whitespace, sets to lowercase,
        and stems tweets. Returns a list of stemmed tokens."""
        tweet = " ".join(re.split("[^a-zA-Z]*", text.lower())).strip()
        tokens = [self.stemmer.stem(t) for t in tweet.split()]
        return tokens

    def similarity(self, post_1_id, post_2_id, embeddings):
        dist = distance.cosine(embeddings[post_1_id], embeddings[post_2_id])
        return dist

    def filter_url(self, text):
        giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                           '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = re.sub(giant_url_regex, '__url__', text)
        return [text, text.count('__url__')]

    def filter_mention(self, text):
        mention_regex = '@[\w\-]+'
        text = re.sub(mention_regex, '__usermention__', text)
        return [text, text.count('__usermention__')]

    def has_badword(self, tokens):
        bad_words = 0
        for token in tokens:
            if token in self.badwords:
                bad_words += 1
        return bad_words / len(self.badwords)

    def has_negation(self, tokens):
        negation_words = 0
        for negationword in self.negationwords:
            if negationword in tokens:
                negation_words += 1
        return negation_words / len(self.negationwords)

    def has_smileys(self, text):
        if len(text) == 0:
            return [0, 0]
        positive_smileys = 0
        negative_smileys = 0
        for smiley in self.positive_smileys:
            if smiley in text:
                positive_smileys += text.count(smiley)
        for smiley in self.negative_smileys:
            if smiley in text:
                negative_smileys += text.count(smiley)
        return [positive_smileys / len(text), negative_smileys / len(text)]

    def has_whwords(self, tokens):
        wh_words = 0
        for token in tokens:
            if token in self.whwords:
                wh_words += 1
        return wh_words / len(self.whwords)

    def other_tokenizer(self, text):
        return nltk.word_tokenize(re.sub(r'([^\s\w]|_)+', '', text.lower()))

    def check_entities(self, text):
        entity_feats = []
        for annotation in self.entity_annotations:
            entity_feats.append(1 if annotation in text else 0)
        return entity_feats

    def extract_aux_feats(self, item, args, source_id, prev_id, embeddings,
                          post_type):
        aux_feats = []
        text = self.preprocess(item['text'])
        if 'post_role' in args:
            aux_feats.append(post_type)
        if 'sentiment_analyzer' in args:
            aux_feats.append(self.sent_analyzer.polarity_scores(text)['pos'])
            aux_feats.append(self.sent_analyzer.polarity_scores(text)['neu'])
            aux_feats.append(self.sent_analyzer.polarity_scores(text)['neg'])
            aux_feats.append(self.sent_analyzer.polarity_scores(text)['compound'])
        if 'similarity' in args:
            if item['id'] == source_id:
                aux_feats.append(1)
            else:
                aux_feats.append(self.similarity(item['id'], source_id, embeddings))
        if 'num_url' in args:
            aux_feats.append(self.filter_url(item['text'])[1])
        if 'num_mention' in args:
            aux_feats.append(self.filter_mention(item['text'])[1])
        if 'num_hashtag' in args:
            aux_feats.append(self.filter_hashtag(item['text'])[1])
        tokens = self.other_tokenizer(item['text'])
        if 'badwords' in args:
            aux_feats.append(self.has_badword(tokens))
        if 'hasnegation' in args:
            aux_feats.append(self.has_negation(tokens))
        if 'whwords' in args:
            aux_feats.append(self.has_whwords(tokens))
        if 'qmark' in args:
            aux_feats.append(1 if len(text) and '?' in text else 0)
        if 'excmark' in args:
            aux_feats.append(1 if len(text) and '!' in text else 0)
        if 'tripdot' in args:
            aux_feats.append(1 if len(text) and '...' in text else 0)
        if 'capital' in args:
            if len(text) > 0:
                aux_feats.append(float(sum(1 for c in text if c.isupper())) / len(text))
            else:
                aux_feats.append(0)
        if 'smileys' in args:
            # extend, not append: has_smileys returns a [pos, neg] pair
            aux_feats.extend(self.has_smileys(text))
        if 'named_entities' in args:
            for score in self.check_entities(text):
                aux_feats.append(score)
        return np.asarray(aux_feats)

    def sentence_embeddings(self, text):
        text = self.preprocess(text)
        return self.tool(text).vector
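# extract_aux_feats() above calls polarity_scores() four times per post even though
# a single call returns all four components. A minimal standalone sketch of the
# single-call pattern; this is a suggested refactor, not the original code.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def sentiment_feats(analyzer: SentimentIntensityAnalyzer, text: str) -> list:
    scores = analyzer.polarity_scores(text)  # one pass over the text
    return [scores['pos'], scores['neu'], scores['neg'], scores['compound']]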