# fit_transform() does two things: first, it fits the model and learns the vocabulary;
# second, it transforms our training data into feature vectors. The input to
# fit_transform should be a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the result to an array
train_data_features = train_data_features.toarray()

import numpy as np

vocab = vectorizer.get_feature_names()
print(vocab)

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it
# appears in the training set
for tag, count in zip(vocab, dist):
    print(count, tag)

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

sid = SIA()
sentiment_score = []
for i in range(0, num_reviews):
    # Print a progress message every 1000 reviews
    if (i + 1) % 1000 == 0:
        print("Review %d of %d\n" % (i + 1, num_reviews))
    sentiment_score.append(sid.polarity_scores(train["text"][i]))
def twitter():
    if request.method == "POST":
        auth = tw.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tw.API(auth, wait_on_rate_limit=True)

        query = request.form.get("search")
        date_since = "2019-11-20"
        new_search = query + " -filter:retweets"
        tweets = tw.Cursor(api.search,
                           lang="en",
                           q=new_search,
                           tweet_mode='extended',
                           since=date_since).items(100)

        users_locs = [[
            tweet.user.location,
            tweet.created_at,
            tweet.full_text.replace('\n', ' '),
            tweet.user.screen_name,
            [e['text'] for e in tweet._json['entities']['hashtags']],
            tweet.user.followers_count
        ] for tweet in tweets]

        tweet_text = pd.DataFrame(data=users_locs,
                                  columns=[
                                      "location", "Tweetcreated", 'tweet',
                                      "username", "Hastage", "follower_count"
                                  ])
        # tweet_text = pd.read_csv('tweet_test.csv')

        if "Unnamed: 0" in tweet_text.columns:
            tweet_text.drop("Unnamed: 0", axis=1, inplace=True)
        tweet_text.dropna(subset=['tweet'], axis=0, inplace=True)
        tweet_text.reset_index(drop=True, inplace=True)

        sia = SIA()
        tweet_text['compound'] = None
        tweet_text['label'] = 0
        for i in range(len(tweet_text.index)):
            pol_score = sia.polarity_scores(tweet_text["tweet"][i])
            tweet_text.iloc[i:i + 1, 6:7] = pol_score['compound']

        tweet_text.loc[tweet_text['compound'] > 0.2, 'label'] = 1
        tweet_text.loc[tweet_text['compound'] < -0.2, 'label'] = -1

        to_drop = tweet_text[tweet_text["label"] == 0].index
        tweet_text.drop(to_drop, inplace=True)
        tweet_text.reset_index(drop=True, inplace=True)

        tweet_text['norm_follower'] = None
        tweet_text['norm_follower'] = (
            tweet_text['follower_count'] - tweet_text['follower_count'].min()) / (
                tweet_text['follower_count'].max() - tweet_text['follower_count'].min())
        to_drop = tweet_text[tweet_text["norm_follower"] == 0].index
        tweet_text.drop(to_drop, inplace=True)
        tweet_text.reset_index(drop=True, inplace=True)

        tweet_text['score'] = tweet_text['norm_follower'] * tweet_text['compound']
        tweet_text.sort_values(by=['score'], ascending=False, inplace=True)
        tweet_text["score"] = tweet_text['score'].map(lambda x: round(x, 5))
        # tweet_text["together_found"] = 0
        tweet_text.reset_index(inplace=True)

        tweet_pos = tweet_text[tweet_text["label"] == 1]
        tweet_neg = tweet_text[tweet_text["label"] == -1]
        # tweet_neg.sort_values(by=['score'], ascending=True, inplace=True)

        neg = tweet_neg[['tweet', 'username', 'score', 'location']].to_dict(orient='records')
        pos = tweet_pos[['tweet', 'username', 'score', 'location']].to_dict(orient='records')

        tweet_display_pos = min(len(pos), 15)
        tweet_display_neg = min(len(neg), 15)
        # raise Exception("hi", tweet_pos_score)

        return render_template("results_twitter.html",
                               pos=pos,
                               neg=neg,
                               tweet_display_pos=tweet_display_pos,
                               tweet_display_neg=tweet_display_neg)
    else:
        return render_template("twitter.html")
'''
Analyzing Beauty dataset for sentiment
'''
import pandas as pd
import gzip
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

analyser = SIA()


def parse(path):
    '''
    Extract values from FILE which is in JSON format
    :param path: path to the file
    :return: generator yielding one parsed review at a time
    '''
    g = gzip.open(path, 'rb')
    for l in g:
        # Note: eval() on untrusted input is unsafe; json.loads(l) is preferable
        # when the file is strict JSON.
        yield eval(l)


def getDF(path):
    '''
    Creates a dataframe from original dataset and integrates all the reviews in one DF
    :param path: path to the file
    :return: dataframe df
    '''
    i = 0
    df = {}
    count = 0
def CreateFeatures1(raw_data, use_timer=False):
    """
    This processes the post data, and can be used as a template for the upload data.
    """
    import pandas as pd
    from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

    if use_timer:
        my_timer = SimpleTimer()

    # Is this training data or web-app data? If training data, process the UTC timestamp.
    if len(raw_data.columns) > 10:
        # some of these columns aren't being used to create features yet
        extract_keys = ['num_comments', 'created_utc', 'title', 'selftext']
        data = raw_data[sorted(extract_keys)].copy()

        # remove all rows without any valid text
        data = data[data.selftext.map(len) > 0]

        dates = pd.to_datetime(data.created_utc, unit="s")
        data['created_dayofweek'] = dates.dt.dayofweek
        data['created_hour'] = dates.dt.hour
        data['created_month'] = dates.dt.month
        data['created_year'] = dates.dt.year

        cut_off_year = 2011
        data = data[data.created_year > cut_off_year]
        print('Removing all posts before 2011. ', end='')
        print('Earliest post = {}'.format(
            pd.to_datetime(data.created_utc.min(), unit="s")))
        data = data.drop('created_utc', axis=1)
    else:
        data = raw_data

    data['post_char_len'] = data.selftext.apply(lambda x: len(x))
    data['post_num_qs'] = data.selftext.apply(lambda x: x.count('?'))
    data['title_char_len'] = data.title.apply(lambda x: len(x))
    data['title_num_qs'] = data.title.apply(lambda x: x.count('?'))
    data['post_word_len1'] = data.selftext.apply(lambda x: len(x.split()))
    data['title_word_len1'] = data.title.apply(lambda x: len(x.split()))

    if use_timer:
        my_timer.elapsed('Done with simple counts')

    def CountPostPunctuation(row):
        # count the number of punctuation characters in the selftext
        import string
        punc_set = set(string.punctuation)
        num_punc = 0
        for char in row['selftext']:
            if char in punc_set:
                num_punc += 1
        return num_punc

    def CountTitlePunctuation(row):
        # count the number of punctuation characters in the title
        import string
        punc_set = set(string.punctuation)
        num_punc = 0
        for char in row['title']:
            if char in punc_set:
                num_punc += 1
        return num_punc

    data['post_num_punc'] = data.apply(CountPostPunctuation, axis=1)
    data['title_num_punc'] = data.apply(CountTitlePunctuation, axis=1)
    data['post_perc_punc'] = data.post_num_punc / data.post_char_len
    data['title_perc_punc'] = data.title_num_punc / data.title_char_len
    data.post_perc_punc = data.post_perc_punc.fillna(0)

    if use_timer:
        my_timer.elapsed('Done with percentage counts')

    # Sentiment features
    sia_ps = SIA().polarity_scores

    posts_sent = []
    for post in data.selftext:
        temp = sia_ps(post)
        posts_sent.append([temp[k] for k in sorted(temp.keys())])

    titles_sent = []
    for title in data.title:
        temp = sia_ps(title)
        titles_sent.append([temp[k] for k in sorted(temp.keys())])

    # sorted keys are: compound, neg, neu, pos
    data['title_compound'] = [t[0] for t in titles_sent]
    data['title_neg'] = [t[1] for t in titles_sent]
    data['title_neu'] = [t[2] for t in titles_sent]
    data['title_pos'] = [t[3] for t in titles_sent]
    data['post_compound'] = [t[0] for t in posts_sent]
    data['post_neg'] = [t[1] for t in posts_sent]
    data['post_neu'] = [t[2] for t in posts_sent]
    data['post_pos'] = [t[3] for t in posts_sent]

    if use_timer:
        my_timer.elapsed('Done')

    return data
def get_sentiments(text):
    sia = SIA()
    pol_score = sia.polarity_scores(text)
    return (pol_score['compound'], pol_score['neg'],
            pol_score['neu'], pol_score['pos'])
def polarityScores(headline, body):
    sid = SIA()
    h = sid.polarity_scores(headline)
    b = sid.polarity_scores(body)
    return abs(h['neg'] - b['neg']), abs(h['neu'] - b['neu']), abs(h['pos'] - b['pos'])
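# A minimal usage sketch for polarityScores above, assuming the same SIA import
# as the surrounding snippets; the headline/body strings are made up purely for
# illustration.
neg_gap, neu_gap, pos_gap = polarityScores(
    "Markets tumble on weak earnings",
    "Stocks fell sharply after several companies reported disappointing results.")
print(neg_gap, neu_gap, pos_gap)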
def prepare_comments_df(all_comments_df):
    # apply cleaning on the comments, keeping emoji
    all_comments_df['comment_cleaned_with_emoji'] = all_comments_df.comment.apply(
        lambda row: process_comments(row, True))
    # apply cleaning on the comments, stripping emoji
    all_comments_df['comment_cleaned_without_emoji'] = all_comments_df.comment.apply(
        lambda row: process_comments(row, False))

    # get the length of the cleaned comment
    all_comments_df['comment_cleaned_length_with_emoji'] = \
        all_comments_df.comment_cleaned_with_emoji.apply(lambda row: len(row.split()))
    all_comments_df['comment_cleaned_length_without_emoji'] = \
        all_comments_df.comment_cleaned_without_emoji.apply(lambda row: len(row.split()))

    sia = SIA()
    all_comments_df['pos_sia_emoji'] = all_comments_df['comment_cleaned_with_emoji'].apply(
        lambda comment: sia.polarity_scores(comment)['pos'])
    all_comments_df['neg_sia_emoji'] = all_comments_df['comment_cleaned_with_emoji'].apply(
        lambda comment: sia.polarity_scores(comment)['neg'])
    all_comments_df['neu_sia_emoji'] = all_comments_df['comment_cleaned_with_emoji'].apply(
        lambda comment: sia.polarity_scores(comment)['neu'])
    all_comments_df['agg_sia_emoji'] = all_comments_df['comment_cleaned_with_emoji'].apply(
        lambda comment: sia.polarity_scores(comment)['compound'])

    all_comments_df['pos_sia_without_emoji'] = all_comments_df['comment_cleaned_without_emoji'].apply(
        lambda comment: sia.polarity_scores(comment)['pos'])
    all_comments_df['neg_sia_without_emoji'] = all_comments_df['comment_cleaned_without_emoji'].apply(
        lambda comment: sia.polarity_scores(comment)['neg'])
    all_comments_df['neu_sia_without_emoji'] = all_comments_df['comment_cleaned_without_emoji'].apply(
        lambda comment: sia.polarity_scores(comment)['neu'])
    all_comments_df['agg_sia_without_emoji'] = all_comments_df['comment_cleaned_without_emoji'].apply(
        lambda comment: sia.polarity_scores(comment)['compound'])

    all_comments_df['classification_with_emoji'] = all_comments_df.apply(
        lambda row: 'pos' if row['agg_sia_emoji'] >= 0 else 'neg', axis=1)
    all_comments_df['classification_without_emoji'] = all_comments_df.apply(
        lambda row: 'pos' if row['agg_sia_without_emoji'] >= 0 else 'neg', axis=1)

    all_comments_df['classification_with_emoji'].iplot(
        kind='hist',
        asFigure=True,
        theme='white',
        gridcolor='white',
        bargap=0.5,
        xTitle='classification',
        yTitle='count',
        title='Omega comments classification with emoji')
    all_comments_df['classification_without_emoji'].iplot(
        kind='hist',
        asFigure=True,
        theme='white',
        gridcolor='white',
        bargap=0.5,
        xTitle='classification',
        yTitle='count',
        title='Omega comments classification without emoji')

    # remove empty comments
    all_comments_df = all_comments_df[
        all_comments_df.comment_cleaned_length_without_emoji > 0]

    all_comments_df['comment date'] = pd.to_datetime(all_comments_df['comment date'])
    # split the timestamp into a date part and a time-of-day part
    all_comments_df["comment date day"] = all_comments_df["comment date"].dt.strftime("%m-%d-%y")
    all_comments_df["comment date hour"] = all_comments_df["comment date"].dt.strftime("%H:%M:%S")

    return all_comments_df
def main(arg):
    """ Initiation docstring """
    credentials = {}
    credentials['CLIENT_ID'] = CLIENT_ID
    credentials['CLIENT_SECRET'] = CLIENT_SECRET
    credentials['USER_AGENT'] = USER_AGENT
    credentials['USERNAME'] = USERNAME
    credentials['PASSWORD'] = PASSWORD

    # (1) If you haven't saved your json file, use this template first!
    # with open("reddit_credentials.json", "w") as file:
    #     json.dump(credentials, file)

    # (2) Use this after initializing your credentials
    with open("reddit_credentials.json", "r") as file:
        creds = json.load(file)

    # Initiate with credentials (you may have to comment this out when first initializing your json file)
    reddit = praw.Reddit(client_id=creds["CLIENT_ID"],
                         client_secret=creds["CLIENT_SECRET"],
                         user_agent=creds["USER_AGENT"],
                         username=creds["USERNAME"],
                         password=creds["PASSWORD"])

    # read suicide-related keywords from csv
    df = pd.read_csv("suicide_keywords.csv",
                     header=None,
                     dtype=str,
                     usecols=[i for i in range(1)],
                     sep='+',
                     encoding='latin-1')

    # dropping null value columns to avoid errors
    df.dropna(inplace=True)

    # process dataframe to be a list of keywords
    df[0] = df[0].astype(str)
    df[0] = df[0].str.strip()
    df[0] = df.apply(get_keywords, axis=1)
    df[0].apply(word_tokenize)
    df2 = pd.DataFrame(df[0].str.split(',').tolist()).stack()
    df2 = df2.reset_index()

    # SUBREDDIT(S)
    # Subreddits used: 'depression', 'suicidewatch', 'offmychest'  TODO: singapore
    chosen_sub = arg
    subreddit = reddit.subreddit(chosen_sub)

    # Keywords with only unique values; remove empty values
    keywords = list(
        filter(None, set(df2[0].astype(str).values.flatten().tolist())))

    # Stem the keywords before matching
    porter = PorterStemmer()
    stem_keys = [porter.stem(w) for w in keywords if w.isalpha()]

    # Getting top up-voted topics of all time (can be any amount from .hot, .top, etc)
    top_submissions = subreddit.top(limit=80)

    #############################################
    # TABLE DEFINITIONS (Users, User Submissions) 3NF
    #############################################
    users_dict = {"author_id": [],       # UserID
                  "author": [],          # User
                  "submissions": [],     # Number of submissions
                  "comment_karma": [],   # Number of comment karma
                  "link_karma": [],      # Number of link karma
                  "created": []}         # Creation date and time

    user_submissions_dict = {"keyword": [],     # Keyword
                             "author_id": [],   # User ID
                             "sub_id": [],      # Sub ID
                             "submission": [],  # Title of submission
                             "comments": [],    # Number of comments on post
                             "subreddit": [],   # Subreddit of submission
                             "created": []}     # Creation time (UTC)

    keyword_subs = defaultdict(list)  # Sub ID and keywords
    keyword_freq = defaultdict(list)  # Keywords and counter

    # Scraping variables
    # Each user (author) from the top posts saved to check their other submissions/replies
    top_post_users = []
    riskzone_users = []

    # Sentiment analysis variables
    results = []
    sia = SIA()

    #############################################
    ########## EVALUATION SUBMISSIONS ###########
    #############################################
    # Check each top submission to collect users
    for submission in top_submissions:
        # Stem the title
        stem_words = stemmatize(submission.title)
        # Iterate through all our keywords
        for word in list(set(stem_keys)):
            # Include posts whose title contains any of the keywords
            if word in stem_words:
                # Add each user to the list if not already in it
                if submission.author not in top_post_users:
                    top_post_users.append(submission.author)

    #############################################
    ####### EVALUATING USERS & SUBMISSIONS ######
    #############################################
    # Iterate through every account from our users
    for account in top_post_users:
        user = reddit.redditor(str(account))  # Redditor object
        # ListingGenerator object containing submissions
        user_submissions = user.submissions.new()

        # Iterate through every submission that user has made.
        try:
            for submission in user_submissions:
                for keyword in list(set(stem_keys)):
                    # Check if the keyword exists in the title
                    if keyword in stemmatize(submission.title):
                        print(f'-- Stemmed title: {stemmatize(submission.title)}')
                        print(f'---- {keyword} existed in title.')

                        # Sentiment analysis for each submission title
                        pol_score = sia.polarity_scores(submission.title)
                        pol_score['title'] = submission.title
                        results.append(pol_score)

                        # Add data to dictionary (in preparation for pandas to do its thing)
                        user_submissions_dict["keyword"].append(keyword)
                        user_submissions_dict["sub_id"].append(submission.id)
                        user_submissions_dict["author_id"].append(user.id)
                        user_submissions_dict["submission"].append(submission.title)
                        user_submissions_dict["comments"].append(submission.num_comments)
                        user_submissions_dict["subreddit"].append(submission.subreddit)
                        user_submissions_dict["created"].append(submission.created_utc)

                        # If user checks all flags: FLAG ACCOUNT AS RISKY
                        if user not in riskzone_users:
                            riskzone_users.append(user)

                        # Add keywords found in this specific submission
                        keyword_subs["sub_id"].append(submission.id)
                        keyword_subs["keyword"].append(keyword)
        except Forbidden as exc:
            print(f'Oops! Forbidden: {exc}')

    # Prepare keyword frequency data
    k_count_freq = get_keyword_frequency(keyword_subs["keyword"])
    for k, val in k_count_freq.items():
        keyword_freq["keyword"].append(k)
        keyword_freq["counter"].append(val)

    # ADDS UNIQUE ACCOUNTS AND THEIR DATA
    for user in riskzone_users:
        users_dict["author_id"].append(user.id)
        users_dict["author"].append(user)
        users_dict["submissions"].append(len(list(user.submissions.new())))
        users_dict["comment_karma"].append(user.comment_karma)
        users_dict["link_karma"].append(user.link_karma)
        users_dict["created"].append(user.created_utc)

    # Convert dictionaries to dataframes to make the data more easily readable
    users_data = pd.DataFrame(users_dict)
    users_submissions_data = pd.DataFrame(user_submissions_dict)
    subs_keywords_data = pd.DataFrame(dict(keyword_freq))

    # Fixing the created column for users (unix time to actual time)
    _timestamp_users = users_data["created"].apply(get_date)
    users_data = users_data.assign(timestamp=_timestamp_users)
    _timestamp_users_submissions_data = users_submissions_data["created"].apply(get_date)
    users_submissions_data = users_submissions_data.assign(
        timestamp=_timestamp_users_submissions_data)

    # Adds a column flagging whether or not the title is risky
    df = pd.DataFrame.from_records(results)
    users_submissions_data['risk'] = 0
    users_submissions_data.loc[df['compound'] > 0.2, 'risk'] = 1
    users_submissions_data.loc[df['compound'] < -0.2, 'risk'] = -1

    # Check if directory exists
    directory = f'subreddits/{chosen_sub}/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(f'subreddits/{chosen_sub}/reddit_{chosen_sub}_users.csv',
              'w+', encoding="utf-8", newline='') as file:
        users_data.to_csv(file, index=False)
    with open(f'subreddits/{chosen_sub}/reddit_{chosen_sub}_submissions.csv',
              'w+', encoding="utf-8", newline='') as file:
        users_submissions_data.to_csv(file, index=False)
    with open(f'subreddits/{chosen_sub}/reddit_{chosen_sub}_keywords.csv',
              'w+', encoding="utf-8", newline='') as file:
        subs_keywords_data.to_csv(file, index=False)
def aspect_segmentaion(file, aspect_file):
    # Sentiment analysis
    sid = SIA()

    # INPUT
    # reviews: this algorithm needs all the reviews, so process the full dataset.
    reviews, all_ratings = load_file(file)

    # selection threshold
    p = 5

    # Iterations
    # I = 10
    I = 1

    # Create Vocabulary
    review_sent, review_actual, only_sent = parse_to_sentence(reviews)
    vocab, vocab_dict = create_vocab(only_sent)

    # Aspect Keywords
    aspect_terms = get_aspect_terms(aspect_file, vocab_dict)
    label_text = [
        'Value', 'Rooms', 'Location', 'Cleanliness', 'Check in/Front Desk',
        'Service', 'Business Service'
    ]
    # print(aspect_terms)

    # ALGORITHM
    review_labels = []
    k = len(aspect_terms)
    v = len(vocab)
    aspect_words = np.zeros((k, v))
    aspect_sent = np.zeros(k)
    num_words = np.zeros(v)

    for i in range(I):
        for r in review_sent:
            labels = []
            for s in r:
                count = np.zeros(len(aspect_terms))
                i = 0
                for a in aspect_terms:
                    for w in s:
                        if w in vocab_dict:
                            num_words[vocab_dict[w]] += 1
                        if w in a:
                            count[i] += 1
                    i = i + 1
                if max(count) > 0:
                    la = np.where(np.max(count) == count)[0].tolist()
                    labels.append(la)
                    for i in la:
                        aspect_sent[i] += 1
                        for w in s:
                            if w in vocab_dict:
                                aspect_words[i][vocab_dict[w]] += 1
                else:
                    labels.append([])
            review_labels.append(labels)

    # aspect_w_rank = chi_sq_mat()
    # new_labels = []
    # for na in aspect_w_rank:
    #     x = np.argsort(na)[::-1][:p]
    #     new_labels.append(x)
    #     for k, v in vocab_dict.items():
    #         if vocab_dict[k] in x:
    #             print(k)
    # sys.exit()

    ratings_sentiment = []
    for r in review_actual:
        sentiment = []
        # aspect ratings based on sentiment
        for s in r:
            ss = sid.polarity_scores(s)
            sentiment.append(ss['compound'])
        ratings_sentiment.append(sentiment)

    # Aspect Ratings Per Review
    aspect_ratings = []
    for i, r in enumerate(review_labels):
        rating = np.zeros(7)
        count = np.zeros(7)
        rs = ratings_sentiment[i]
        for j, l in enumerate(r):
            for k in range(7):
                if k in l:
                    rating[k] += rs[j]
                    count[k] += 1  # track how many sentences contributed to this aspect
        # average the sentiment over the contributing sentences
        for k in range(7):
            if count[k] != 0:
                rating[k] /= count[k]
        # Map from [-1, 1] to [1, 5]
        for k in range(7):
            if rating[k] != 0:
                rating[k] = int(round((rating[k] + 1) * 5 / 2))
        aspect_ratings.append(rating)

    return aspect_ratings, all_ratings

    # n = 0
    # print(review_actual[n], '\n', review_labels[n])
    # print(ratings_sentiment[n], '\n', aspect_ratings[n])
    # print(len(all_ratings), len(reviews), all_ratings[0])
    # sys.exit()
    # print(sent[5:9], labels[5:9])
    # print(zip(actual_sent, labels)[:10])
    # print(zip(actual_sent, sentiment)[:10])
from prices import crypto_history

REMOVED = '[removed]'

with open('./secrets.yaml', 'r') as secrets_file:
    data = yaml.safe_load(secrets_file)

client_id = data['reddit']['api']['client_id']
client_secret = data['reddit']['api']['client_secret']

reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     user_agent='testscript by /u/thegreatwarlo')
bitcoin_markets = reddit.subreddit('BitcoinMarkets')
vader = SIA()


def main():
    # print(crypto_history.gather('20170101', '20170102', ['ethereum']))
    with open('./data/bitcoin_markets_daily_discussion_v1.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(
            ['comment_id', 'date', 'body', 'parent_id', 'vader_scores'])
        for submission in get_daily_discussion(300):
            title = submission.title
            date = title.split(']')[-1].strip()
            print('\nSubmission Date: {0}'.format(date))
            non_removed_comment_count = 0
            submission.comments.replace_more(limit=None)
def analyze(soup_object):
    # select only the headlines in each google search result
    base = soup_object.select("div.g.card h3")

    # collect all the headlines, stripping html and keeping only the text
    headlines = []
    for row in base:
        clean = row.text
        headlines.append(clean)
    # print to verify headlines are clean
    # print(headlines)

    # tokenize headlines using the clean_tokens helper defined earlier
    tokens = []
    for each in headlines:
        clean = clean_tokens(each)
        tokens.append(clean)
    # print tokens to verify
    # print('tokens =', tokens)

    # create stopwords list from nltk and add the game title terms as stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.append("fallout")
    stopwords.append('76')
    stopwords.append("'fallout")

    # remove stopwords from tokens
    filtered = []
    for token_list in tokens:
        x = []
        for word in token_list:
            if word not in stopwords:
                x.append(word)
        filtered.append(x)
    # print to verify stopwords are gone
    # print("filtered = ", filtered)

    # join the tokens back into headlines without stopwords
    combined = []
    for token_list in filtered:
        combined.append(" ".join(token_list))

    # import the sentiment analyzer
    from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

    # create the analyzer and an empty list for the results
    sia = SIA()
    results = []
    # analyze sentiment of each headline
    for line in combined:
        pol_score = sia.polarity_scores(line)
        results.append(pol_score)
    # print to verify
    # for i in results:
    #     print(i)

    # write sentiment analysis to csv file
    with open("resultsback.csv", 'a') as csv_file:
        writer = csv.writer(csv_file)
        for d in results:
            writer.writerow(['compound', d['compound']])
def process_user_text(user_text, goal_category):
    # put user input text string into a DataFrame
    clean_data = pd.DataFrame(user_text, columns=["text"])
    clean_data["source"] = goal_category
    clean_data["subreddit"] = "placeholder"

    ## Starting with textstat
    textstat_results = pd.DataFrame(columns=[
        'flesch_ease', 'flesch_grade', 'gfog', 'auto_readability', 'cl_index',
        'lw_formula', 'dcr_score', 'syll_count', 'lex_count'
    ])

    for i in clean_data["text"]:
        results = textstat_stats(str(i))  # textstat needs a string
        textstat_results = textstat_results.append(results, ignore_index=True)  # so that index is continuous

    # Resetting indices here may be unnecessary
    textstat_results = textstat_results.reset_index(drop=True)

    combined_data = pd.concat([clean_data, textstat_results], axis=1)

    ## Moving on to NLTK part-of-speech tagging
    combined_data_wordtokens = []
    for document in combined_data["text"]:
        tokens = nltk.word_tokenize(str(document))
        combined_data_wordtokens.append(tokens)

    combined_data_wordpos = []
    for document in combined_data_wordtokens:
        pos = nltk.pos_tag(document)  # default is Penn Treebank tagset
        combined_data_wordpos.append(pos)

    pos_keys = ['#', '$', '“', '(', ')', ',', '.', ':', 'CC', 'CD', 'DT', 'EX',
                'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS',
                'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP',
                'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                'WDT', 'WP', 'WP$', 'WRB', '”']

    pos_counts = []
    for document in combined_data_wordpos:
        doc_length = len(document)
        mini_dict = Counter([pos for word, pos in document])
        for pos in pos_keys:
            if pos not in mini_dict:
                mini_dict[pos] = 0
        scaled_dict = {k: v / doc_length for k, v in mini_dict.items()}
        pos_counts.append(scaled_dict)

    pos_df = pd.DataFrame(pos_counts)
    pos_df = pos_df.fillna(0)

    combined_data = pd.concat([combined_data, pos_df], axis=1)

    ## Add sentiment intensity
    sia = SIA()
    results = []
    for line in combined_data["text"]:
        # this loop builds a list of score dicts
        pol_score = sia.polarity_scores(line)
        pol_score['text'] = line
        results.append(pol_score)

    sia_neg = []
    sia_pos = []
    sia_neu = []
    sia_comp = []
    for document in results:
        sia_neg.append(document['neg'])
        sia_pos.append(document['pos'])
        sia_neu.append(document['neu'])
        sia_comp.append(document['compound'])

    combined_data["SIA_neg"] = sia_neg
    combined_data["SIA_pos"] = sia_pos
    combined_data["SIA_neu"] = sia_neu
    combined_data["SIA_com"] = sia_comp

    ## Now for the custom features
    Google_Curses = pd.read_csv(
        "./insight2019/flask_app/my_flask/model/RobertJGabriel_Google_swear_words.txt",
        header=None)
    bad_words = Google_Curses[0].tolist()

    any_bad = []
    for row in combined_data["text"]:
        if any(str(word) in str(row) for word in bad_words):
            any_bad.append(1)
        else:
            any_bad.append(0)
    combined_data["Google_curses"] = any_bad
    combined_data["Google_curses"].value_counts()

    emoji_counts = []
    for row in combined_data["text"]:
        emoji_num = len(emoji_counter(str(row)))
        emoji_counts.append(emoji_num)
    combined_data["Num_emoji"] = emoji_counts
    combined_data["Num_emoji"].value_counts()

    internet_yelling = []
    for row in combined_data["text"]:
        screams = scream_counter(str(row))
        internet_yelling.append(screams)
    combined_data["Yell_count"] = internet_yelling

    return combined_data
def news():
    if request.method == "POST":
        query = request.form.get("search")
        keyword = query
        pageSize = 100
        lang = "en"
        apiKey = '673e46eca5794640b6c370ed313a5c80'
        sortBy = "popularity"
        head = ("https://newsapi.org/v2/everything?q={}&pageSize={}&sortBy={}"
                "&language={}&apiKey={}").format(keyword, pageSize, sortBy, lang, apiKey)

        response = requests.get(head)
        response_json_string = json.dumps(response.json())
        response_dict = json.loads(response_json_string)
        articles_list = response_dict['articles']
        df_news = pd.read_json(json.dumps(articles_list))

        if "Unnamed: 0" in df_news.columns:
            df_news.drop("Unnamed: 0", axis=1, inplace=True)
        df_news.dropna(subset=['title'], axis=0, inplace=True)
        df_news.reset_index(drop=True, inplace=True)

        sia = SIA()
        df_news['score'] = None
        df_news['label'] = 0
        for i in range(len(df_news.index)):
            pol_score = sia.polarity_scores(df_news["title"][i])
            df_news.iloc[i:i + 1, 8:9] = pol_score['compound']

        df_news.loc[df_news['score'] > 0.2, 'label'] = 1
        df_news.loc[df_news['score'] < -0.2, 'label'] = -1

        # flag articles whose title/description mention the search terms
        df_news["cat"] = 0
        df_news.reset_index(inplace=True)
        searched = query.lower()
        test_list = searched.split()
        for i in range(len(df_news.index)):
            if df_news["description"][i] is not None:
                if bool([ele for ele in test_list
                         if ele in df_news["description"][i].lower()]):
                    df_news["cat"][i] = 1
            if df_news["title"][i] is not None:
                if bool([ele for ele in test_list
                         if ele in df_news["title"][i].lower()]):
                    df_news["cat"][i] = 1
        for i in range(len(df_news.index)):
            if df_news["description"][i] is not None:
                if searched in df_news["description"][i].lower():
                    df_news["cat"][i] = 2
            if df_news["title"][i] is not None:
                if searched in df_news["title"][i].lower():
                    df_news["cat"][i] = 2
        df_news.sort_values(by=["cat"], ascending=False, inplace=True)

        news_pos = df_news[df_news['label'] == 1]
        news_neu = df_news[df_news['label'] == 0]
        news_neg = df_news[df_news['label'] == -1]

        pos_n = news_pos[['title', 'description', 'url', 'source', 'author',
                          'score']].to_dict(orient='records')
        neg_n = news_neg[['title', 'description', 'url', 'source', 'author',
                          'score']].to_dict(orient='records')
        neu_n = news_neu[['title', 'description', 'url', 'source', 'author',
                          'score']].to_dict(orient='records')

        news_display_pos = min(len(pos_n), 15)
        news_display_neg = min(len(neg_n), 15)
        news_display_neu = min(len(neu_n), 15)

        return render_template("results_news.html",
                               pos_n=pos_n,
                               neg_n=neg_n,
                               neu_n=neu_n,
                               news_display_pos=news_display_pos,
                               news_display_neg=news_display_neg,
                               news_display_neu=news_display_neu)
    else:
        return render_template("news.html")
def youtube():
    if request.method == "POST":
        os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
        service = get_authenticated_service()
        query = request.form.get("search")
        comments_from_youtube = search_videos_by_keyword(service,
                                                         q=query,
                                                         part='id,snippet',
                                                         eventType='completed',
                                                         type='video')
        df = pd.DataFrame(comments_from_youtube,
                          columns=[
                              'authorDisplayName', 'publishedAt', 'likeCount',
                              'viewerRating', 'totalReplyCount',
                              'authorChannelURL', 'comment'
                          ])
        # df = pd.read_csv('youtube_comments.csv')

        if "Unnamed: 0" in df.columns:
            df.drop("Unnamed: 0", axis=1, inplace=True)
        df.dropna(subset=["comment"], axis=0, inplace=True)
        df.reset_index(drop=True, inplace=True)

        df["pos"] = None
        df["neu"] = None
        df["neg"] = None
        df["comp"] = None
        df["label"] = 0

        sia = SIA()
        for i in range(len(df.index)):
            pol_score = sia.polarity_scores(df["comment"][i])
            df.iloc[i:i + 1, 7:8] = pol_score['pos']
            df.iloc[i:i + 1, 8:9] = pol_score['neu']
            df.iloc[i:i + 1, 9:10] = pol_score['neg']
            df.iloc[i:i + 1, 10:11] = pol_score['compound']
            if pol_score['compound'] > 0.2:
                df.iloc[i:i + 1, 11:12] = 1
            elif pol_score['compound'] < -0.2:
                df.iloc[i:i + 1, 11:12] = -1

        # drop neutral comments
        to_drop = df[df["label"] == 0].index
        df.drop(to_drop, inplace=True)
        df.reset_index(drop=True, inplace=True)

        # weight the compound score by the normalized like count
        df['likeCount-normalized'] = df['likeCount'] / df['likeCount'].max()
        df["SCORE"] = df['likeCount-normalized'] * df['comp']
        df.sort_values(by=['SCORE'], ascending=False, inplace=True)
        to_drop = df[df["SCORE"] == 0].index
        df.drop(to_drop, inplace=True)
        df.reset_index(drop=True, inplace=True)
        df["SCORE"] = df['SCORE'].map(lambda x: round(x, 5))

        # flag comments that mention the search terms
        df["cat"] = 0
        df.reset_index(inplace=True)
        searched = query.lower()
        test_list = searched.split()
        for i in range(len(df.index)):
            if df["comment"][i] is not None:
                if bool([ele for ele in test_list
                         if ele in df["comment"][i].lower()]):
                    df["cat"][i] = 1
        for i in range(len(df.index)):
            if df["comment"][i] is not None:
                if searched in df["comment"][i].lower():
                    df["cat"][i] = 2
        df.sort_values(by=["cat"], ascending=False, inplace=True)

        df_pos = df[df["label"] == 1]
        df_neg = df[df["label"] == -1]
        # df_neg.sort_values(by=['SCORE'], ascending=True, inplace=True)

        pos_yt = df_pos[['authorDisplayName', 'comment', 'SCORE',
                         'authorChannelURL']].to_dict(orient='records')
        neg_yt = df_neg[['authorDisplayName', 'comment', 'SCORE',
                         'authorChannelURL']].to_dict(orient='records')

        to_display_pos = min(len(pos_yt), 15)
        to_display_neg = min(len(neg_yt), 15)

        return render_template("results_youtube.html",
                               pos_yt=pos_yt,
                               neg_yt=neg_yt,
                               to_display_pos=to_display_pos,
                               to_display_neg=to_display_neg)
    else:
        return render_template("youtube.html")
def get_world_politics():
    sia = SIA()
    headlines = []
    post_score = []
    polarity_score = []
    pos_list = []
    neg_list = []

    request = make_international_request_using_cache(world_politics_url)
    json_data = json.loads(request)
    data_all = json_data['data']['children']

    num_of_posts = 0
    while len(data_all) <= 1000:
        time.sleep(2)
        last = data_all[-1]['data']['name']
        url = 'https://www.reddit.com/r/worldpolitics/.json?after=' + str(last)
        req = make_international_request_using_cache(url)
        data = json.loads(req)
        data_all += data['data']['children']
        if num_of_posts == len(data_all):
            break
        else:
            num_of_posts = len(data_all)

    for post in data_all:
        headlines.append(post['data']['title'])
        post_score.append(post['data']['score'])
        res = sia.polarity_scores(post['data']['title'])
        polarity_score.append(res['compound'])
        if res['compound'] > 0.2:
            pos_list.append(post['data']['title'])
        elif res['compound'] < -0.2:
            neg_list.append(post['data']['title'])

    with open("pos_news_titles_international.txt", "w",
              encoding='utf-8', errors='ignore') as f_pos:
        for post in pos_list:
            f_pos.write(post + "\n")
    with open("neg_news_titles_international.txt", "w",
              encoding='utf-8', errors='ignore') as f_neg:
        for post in neg_list:
            f_neg.write(post + "\n")

    try:
        conn = sqlite3.connect(DBNAME)
        cur = conn.cursor()
    except sqlite3.Error as e:
        print(e)

    # International Table: ID, Headline, Alpha2, Polarity Score, Upvote Score
    for post in data_all:
        insertion = (None, post['data']['title'], None, None,
                     post['data']['score'])
        statement = 'INSERT INTO "International" '
        statement += 'VALUES (?, ?, ?, ?, ?)'
        cur.execute(statement, insertion)

    for post in polarity_score:
        cur.execute('INSERT INTO International (Polarity) VALUES (?)', (post, ))

    conn.commit()
    conn.close()
def setup_sia():
    # Sentiment Intensity Analyser
    sia = SIA()
    populate_dictionary(sia)
    return sia
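# populate_dictionary() is not shown in this snippet. A minimal sketch of one
# plausible implementation, assuming it simply extends VADER's built-in lexicon
# with custom, domain-specific valences; the example words and scores below are
# hypothetical and purely for illustration.
def populate_dictionary(sia):
    # SentimentIntensityAnalyzer exposes its word-to-valence table as sia.lexicon;
    # updating it changes how polarity_scores() rates these tokens.
    custom_terms = {"rekt": -2.5, "moon": 1.8, "hodl": 0.9}
    sia.lexicon.update(custom_terms)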
def gatherFeaturesAndMakeSIA(start_date, end_date, showPlots):
    print("Gathering csv data for training. Please wait...")
    pd.options.mode.chained_assignment = None  # default='warn'

    # Get the market info for Bitcoin
    bitcoin_market_info = pd.read_html(
        "https://coinmarketcap.com/currencies/bitcoin/historical-data/?start=20130428&end="
        + time.strftime("%Y%m%d"))[0]
    bitcoin_market_info = bitcoin_market_info.assign(
        Date=pd.to_datetime(bitcoin_market_info['Date']))
    bitcoin_market_info.loc[bitcoin_market_info['Volume'] == "-", 'Volume'] = 0
    bitcoin_market_info['Volume'] = bitcoin_market_info['Volume'].astype('int64')
    market_info = bitcoin_market_info

    # filter the requested data
    market_info = market_info[market_info['Date'] <= end_date]
    market_info = market_info[market_info['Date'] >= start_date]

    # calculate the number of days between the given dates
    datetime_object = datetime.strptime(start_date, '%Y-%m-%d')
    datetime_object1 = datetime.strptime(end_date, '%Y-%m-%d')
    tmp = datetime_object1 - datetime_object
    days_in_between = tmp.days + 1
    # print('Days in between the given dates - ' + str(days_in_between))

    # Create Day Diff column with values
    kwargs = {
        'Day Diff': lambda x: (x['Open*'] - x['Close**']) / x['Open*'] * 10000
    }
    market_info = market_info.assign(**kwargs)

    # Create Close Off High and Volatility columns
    kwargs = {
        'Close Off High':
        lambda x: ((2 * (x['High'] - x['Close**'])) / (x['High'] - x['Low']) - 1) * 10000,
        'Volatility':
        lambda x: (x['High'] - x['Low']) / (x['Open*'])
    }
    market_info = market_info.assign(**kwargs)

    # add Movement column (initially random values)
    market_info = market_info.assign(
        Movement=pd.Series(np.random.randn(len(market_info['Open*']))).values)

    # sort by date for ascending order
    market_info = market_info.sort_values(by='Date')
    # Resetting the indexes
    market_info = market_info.reset_index(drop=True)

    # Replace Movement random values with actual values -> Up or Down
    i = 0
    while i < (len(market_info) - 1):  # -1 because we cannot compare the last day with the next one
        market_info['Movement'][i] = market_info['Close**'][i] - market_info['Close**'][i + 1]
        x = market_info['Movement'][i]
        if x > 0:
            market_info['Movement'][i] = 'Down'
        elif x < 0:
            market_info['Movement'][i] = 'Up'
        i = i + 1

    # drop the rest, keep only what we need
    model_data = market_info[[
        'Date', 'Close**', 'Volume', 'Close Off High', 'Day Diff', 'Volatility',
        'Movement'
    ]]
    # Dropping the last row that doesn't have a movement value
    model_data = model_data.drop(model_data.index[len(model_data) - 1])
    # print(model_data.to_string())

    # -------------------------- from 3 April -> 2 June --------------------------
    sia = SIA()

    # ------------------------ SIA --------------------------------------
    fileCounter = days_in_between
    sentimentList = []
    list_dates = market_info['Date']
    list_prices = market_info['Close**']
    posValues = []
    negValues = []
    neutralValues = []

    # do not read file number 1 which is 2 June, because it was dropped before
    while fileCounter > 1:
        with open('redditCommentsBitcoinMarkets 3 April - 2 June/' + str(fileCounter),
                  'r', encoding='utf-8', errors='ignore') as file:
            neutralCounter = 0
            positiveCounter = 0
            negativeCounter = 0
            for line in file:
                res = sia.polarity_scores(line)
                if res['compound'] > 0.2:
                    positiveCounter += 1
                elif res['compound'] < -0.2:
                    negativeCounter += 1
                else:
                    neutralCounter += 1
        posValues.append(positiveCounter)
        negValues.append(negativeCounter)
        neutralValues.append(neutralCounter)
        fileCounter -= 1
        totalSentiment = positiveCounter + negativeCounter + neutralCounter
        sentimentList.append(
            ((positiveCounter - negativeCounter) / totalSentiment) * 1000)

    if showPlots:
        # --------------- plot for SIA ---------------
        plt.plot(list_dates, posValues, color='g', label='positive')
        plt.plot(list_dates, negValues, color='red', label='negative')
        plt.plot(list_dates, neutralValues, color='orange', label='neutral')
        plt.plot(list_dates, list_prices, color='blue', label='price')
        plt.gcf().autofmt_xdate()
        plt.legend()
        plt.xlabel('Time line')
        plt.ylabel('No. of comments')
        plt.title('Positive, negative & neutral sentiment over time')
        plt.show()

    # ------------------------ SIA2 --------------------------------------
    fileCounter = days_in_between
    sentimentList2 = []
    posValues = []
    negValues = []
    neutralValues = []

    # do not read file number 1 which is 2 June, because it was dropped before
    while fileCounter > 1:
        with open('redditCommentBitcoin 3 April - 2 June/' + str(fileCounter),
                  'r', encoding='utf-8', errors='ignore') as file:
            neutralCounter = 0
            positiveCounter = 0
            negativeCounter = 0
            for line in file:
                res = sia.polarity_scores(line)
                if res['compound'] > 0.2:
                    positiveCounter += 1
                elif res['compound'] < -0.2:
                    negativeCounter += 1
                else:
                    neutralCounter += 1
        posValues.append(positiveCounter)
        negValues.append(negativeCounter)
        neutralValues.append(neutralCounter)
        fileCounter -= 1
        totalSentiment = positiveCounter + negativeCounter + neutralCounter
        sentimentList2.append(
            ((positiveCounter - negativeCounter) / totalSentiment) * 1000)

    if showPlots:
        # --------------- plot for SIA2 ---------------
        plt.plot(list_dates, posValues, color='g', label='positive')
        plt.plot(list_dates, negValues, color='red', label='negative')
        plt.plot(list_dates, neutralValues, color='orange', label='neutral')
        plt.plot(list_dates, list_prices, color='blue', label='price')
        plt.gcf().autofmt_xdate()
        plt.legend()
        plt.xlabel('Time line')
        plt.ylabel('No. of comments')
        plt.title('Positive, negative & neutral sentiment over time')
        plt.show()

    with open('train.csv', 'w') as csvfile:
        fieldnames = [
            'Date', 'Close**', 'Volume', 'Close Off High', 'Day Diff',
            'Volatility', 'SIA', 'SIA2', 'Movement'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        i = 0
        while i < (len(market_info) - 1):
            writer.writerow({
                'Date': model_data['Date'][i],
                'Close**': model_data['Close**'][i],
                'Volume': model_data['Volume'][i],
                'Close Off High': model_data['Close Off High'][i],
                'Day Diff': model_data['Day Diff'][i],
                'Volatility': model_data['Volatility'][i],
                'SIA': sentimentList[i],
                'SIA2': sentimentList2[i],
                'Movement': model_data['Movement'][i]
            })
            i = i + 1

    print("Finished train data set! Output -> train.csv.")
def get_features(track_id):
    features_results = sp.audio_features([track_id])
    json_features = json.dumps(features_results)
    features_data = json.loads(json_features)
    # Convert the features dictionary to a list
    features_list = list(features_data[0].values())
    return features_list


client_credentials_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
sentiment_analyzer = SIA()

# IDs of monthly playlists from November 2016 to November 2017
playlist_ids = [
    "07zqCIPCroFMKSajvERGvE", "30PgYnoeT2PAgFNuYLR5qd", "1vS1nakUrLYkTd3W0yRMYe",
    "3scPGVlAn7d74uXRtFnmUC", "5LzBRPVAPYUssZ4ZESnRmH", "6hDHXewz8qBTezvONSqzyl",
    "00riJCYiVJ1yptAXtv2h6k", "0HxFI5dOlKztf38T9sa0cF", "7EFWm7Mjy6GLJHOEgKEblM",
    "6YAG0Li1BoUkmhc8iycY6l", "7Iw0yI71QX59zyFq0kAZTS", "69XTCqVzbSWPMLucSvzlLl",
    "7pRnKuQMkmntEj7Nnj94r0"
]

# Audio features
feature_names = [
    "danceability", "energy", "key", "loudness", "mode", "speechiness",
def update_tweets(tweets_to_check, collection, language, searchQuery, constituent, st):
    sia = SIA()
    tokenizer = TweetTokenizer(preserve_case=True,
                               reduce_len=True,
                               strip_handles=False)

    list_of_tweets = []
    for tweet in tweets_to_check:
        doc = tweet._json
        if collection.find({"id_str": doc["id_str"]}, {"id_str": 1}).limit(1):
            continue
        list_of_tweets.append(doc)

    if language != "en":
        if not do_translation(list_of_tweets):
            if not do_translation(list_of_tweets):
                return None

    for document in list_of_tweets:
        if language == 'en':
            document.update(preprocess_tweet(document['text']))
        else:
            document.update(preprocess_tweet(document['text_en']))

        document['search_term'] = searchQuery
        document['constituent'] = constituent
        document['language'] = language

        date = datetime.strptime(document['created_at'], '%a %b %d %H:%M:%S %z %Y')
        document['date'] = date

        # Update sentiment
        sentiment_score, sentiment = get_nltk_sentiment(
            document["semi_processed_text"], sia)
        document["nltk_sentiment_score"] = sentiment
        document["nltk_sentiment_number"] = sentiment_score

        # Update tags
        document["tag_LOCATION"] = list()
        document["tag_PERSON"] = list()
        document["tag_ORGANIZATION"] = list()
        document["tag_MONEY"] = list()
        document["tag_PERCENT"] = list()
        document["tag_DATE"] = list()
        document["tag_TIME"] = list()

        if language == 'en':
            text = document['text']
        else:
            text = document['text_en']

        for word, tag in get_tags(text, st, tokenizer):
            if tag != "O":
                document["tag_" + tag].append(word)

        if 'retweeted_status' in document:
            document.pop('retweeted_status', None)

    return list_of_tweets
def main():
    # Authentication
    api = get_api_authentication()
    assert api

    # ----- Data storing -----
    # Users and their information
    users = {
        "user_id": [],    # .id
        "username": [],   # .screen_name
        "location": [],   # .location
        "created": [],    # .created_at
        "followers": [],  # .followers_count
        "following": [],  # .friends_count
        "tweets": []      # .statuses_count
    }

    # Tweets and their information
    tweets = {
        "user_id": [],     # .user.id
        "tweet_id": [],    # .id
        "tweet": [],       # .text/.full_text (normal/extended tweet)
        "is_retweet": [],  # True/False
        "created": [],     # .created_at
        "retweets": [],    # .retweet_count
        "likes": []        # .favorite_count
    }

    tweet_keyword_counter = defaultdict(list)
    tweet_keyword_freq = defaultdict(list)

    # List definitions
    user_ids = []
    hashtags = []
    results = []

    # SIA Initializer
    sia = SIA()

    # Get dataframe with keywords
    df = retrieve_keyword_dataframe()

    # Process dataframe to be a list of keywords
    df2 = process_dataframe(df)

    # Keywords with only unique values
    keywords = list(
        filter(None, set(df2[0].astype(str).values.flatten().tolist())))

    # main algorithm (getting users from initial search) (2 tweets per keyword)
    for word in keywords:
        for status in tweepy.Cursor(api.search,
                                    q=word,
                                    tweet_mode='extended',
                                    lang="en").items(2):
            # Add to user account list to then iterate through
            if status.user.id not in user_ids:
                user_ids.append(status.user.id)

    # Check all users' tweets
    # 1. For every account id that we've saved from initial search, add user
    # 2. For every tweet in that account (with a limit), add tweet
    # 3. For every keyword in our wordlist
    # 4. If that keyword exists in the current tweet: save information.
    for acc_id in user_ids:
        user = api.get_user(acc_id)

        # Add user information
        if user.id not in users["user_id"]:
            users["user_id"].append(user.id)
            users["username"].append(user.screen_name)
            users["location"].append(user.location)  # TODO: Check if correct location before adding
            users["created"].append(user.created_at)
            users["followers"].append(user.followers_count)
            users["following"].append(user.friends_count)
            users["tweets"].append(user.statuses_count)

        ret = 0
        for status in tweepy.Cursor(api.user_timeline,
                                    screen_name=user.screen_name,
                                    tweet_mode='extended').items(500):
            for word in keywords:
                if word in status.full_text:
                    # Polarity score
                    pol_score = sia.polarity_scores(status.full_text)
                    pol_score['tweet'] = status.full_text
                    results.append(pol_score)

                    # Add tweet information
                    if status.id not in tweets["tweet_id"]:
                        tweets["user_id"].append(status.user.id)
                        tweets["tweet_id"].append(status.id)
                        tweets["tweet"].append(status.full_text)
                        if hasattr(status, 'retweeted_status'):
                            tweets["is_retweet"].append(True)
                            ret += 1
                        else:
                            tweets["is_retweet"].append(False)
                        tweets["created"].append(status.created_at)
                        tweets["retweets"].append(status.retweet_count)
                        tweets["likes"].append(status.favorite_count)

                        # Add hashtags if they exist
                        if hasattr(status, "entities"):
                            if "hashtags" in status.entities:
                                hashtag = [
                                    ent["text"]
                                    for ent in status.entities["hashtags"]
                                    if "text" in ent and ent is not None
                                ]
                                if hashtag is not None:
                                    hashtags.append(hashtag)

                        # Add keywords and their counters
                        tweet_keyword_counter["tweet_id"].append(status.id)
                        tweet_keyword_counter["keyword"].append(word)
                        tweet_keyword_counter["tweet_time"].append(status.created_at)

    # Get keyword and each frequency
    k_count_freq = calculate_frequency(tweet_keyword_counter["keyword"])
    for k, val in k_count_freq.items():
        tweet_keyword_freq["keyword"].append(k)
        tweet_keyword_freq["counter"].append(val)

    # PANDAS AND CSV WRITING
    users_df = pd.DataFrame(users)
    tweets_df = pd.DataFrame(tweets)
    keys_df = pd.DataFrame(tweet_keyword_freq)

    # Adds a column flagging whether or not the tweet is risky
    df = pd.DataFrame.from_records(results)
    tweets_df['risk'] = 0
    tweets_df.loc[df['compound'] > 0.2, 'risk'] = 1
    tweets_df.loc[df['compound'] < -0.2, 'risk'] = -1

    # Users
    with open('tweets/twitter_users.csv', 'w+', encoding="utf-8", newline='') as file:
        users_df.to_csv(file, index=False)
    # Tweets
    with open('tweets/twitter_tweets.csv', 'w+', encoding="utf-8", newline='') as file:
        tweets_df.to_csv(file, index=False)
    # Keywords with datetime
    with open('tweets/twitter_keywords.csv', 'w+', encoding='utf-8', newline='') as file:
        keys_df.to_csv(file, index=False)
def calculate_sentiment(text):
    sia = SIA()
    return sia.polarity_scores(text)
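# A minimal alternative sketch (not from the original source): calculate_sentiment()
# constructs a new analyzer on every call, which reloads the VADER lexicon each time.
# Reusing one module-level analyzer avoids that; the helper name below is hypothetical.
_SHARED_SIA = SIA()


def calculate_sentiment_reusing_analyzer(text):
    # Same output as calculate_sentiment(), without re-creating the analyzer per call.
    return _SHARED_SIA.polarity_scores(text)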
plt.show()

# Sentiment Analysis of Submissions
sns.set(style='darkgrid', context='talk', palette='Blues_d')

headlines = set()
for submission in reddit.subreddit('Bitcoin').top(limit=1000):
    headlines.add(submission.title)
    display.clear_output()
    print(len(headlines))

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

sia = SIA()
results = []
for line in headlines:
    ripple_score = sia.polarity_scores(line)
    ripple_score['headline'] = line
    results.append(ripple_score)

pprint(results[:3], width=100)

df = pd.DataFrame.from_records(results)
df.head()
df['label'] = 0
def polarity_calculator(cls, string_data):
    # takes in a string
    sia = SIA()
    strng = cls.cleaner(string_data)
    pol_score = sia.polarity_scores(strng)
    pol_score['article'] = string_data
    return pol_score  # returns a dict
q1_df = yelp[yelp["date"] > frame_q1]
q2_df = yelp[(yelp["date"] < (frame_q1 + timedelta(60)))
             & (yelp["date"] > (frame_q2 + timedelta(60)))]

for frame, typer in zip([q1_df, q2_df], ["q1", "q2"]):
    yelp = frame

    from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

    yelp["positive"] = 0
    yelp["compound"] = 0.0
    yelp["negative"] = 0
    yelp["neutral"] = 0

    analyzer = SIA()
    for sentence, row in zip(yelp["review"], list(range(yelp.shape[0]))):
        vs = analyzer.polarity_scores(sentence)
        yelp["compound"][row] = float(vs["compound"])
        if vs["compound"] < -0.5:
            yelp["negative"][row] = 1
        elif vs["compound"] > 0.5:
            yelp["positive"][row] = 1
        else:
            yelp["neutral"][row] = 1
        # print("{:-<65} {}".format(sentence, str(vs)))

    worst = yelp[(yelp["rating"] < 3) & (yelp["compound"] < -.3)]
    worst = worst.sort_values("date", ascending=False).reset_index()
    best = yelp[(yelp["rating"] > 2) & (yelp["compound"] > 0)]
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

sia = SIA()
results = []
headlines = [
    "I am suffering from fever",
    "I am having a headache",
    "kill those bastards",
    "I want to strike an airplane to new WTC",
    "music fever",
    "My kid is suffering from fever",
    "meseals is growing fast",
    # Hindi; roughly: "Sir, please protect my life and property from these bullies.
    # Even the Shastri Park police station is helpless in front of them today…"
    "सर दबग लोगों से मेरे जानमाल की सुरक्षा करा । इन दबग लोगो के सामने आज शास्त्री पार्क थाने कि पुलिस भी बेबस…",
    "I’m going to kill myself ... mariachi band playing in the restaurant when I have a headache",
    "I had a headache today so I couldn’t go anywhere and do much. I laced in bed 75% of the time.",
    "Wake up with a sinus headache aaand the guy across the street cutting the grass (from before 7)... Let's restart this day eh?"
]

for line in headlines:
    pol_score = sia.polarity_scores(line)
    pol_score['headline'] = line
    results.append(pol_score)

for i in results:
    print(i)
def sentiment_analysis(message):
    sia = SIA()
    p_score = sia.polarity_scores(message)
    p = p_score['compound']
    speedometer(p)
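# speedometer() is not defined in this snippet. A minimal, hypothetical stand-in
# that renders the compound score (range -1..1) as a text gauge, purely to
# illustrate how the value might be displayed; the real helper is presumably a
# graphical gauge.
def speedometer(compound, width=21):
    # Map compound from [-1, 1] to a position on a bar of `width` characters.
    pos = int(round((compound + 1) / 2 * (width - 1)))
    bar = ['-'] * width
    bar[pos] = '|'
    print('[' + ''.join(bar) + f'] compound={compound:+.3f}')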