from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
from twitter_get_data import query_str
import os

# The starting directory is kept in `path`; all file access below happens
# inside the directory named by `query_str`.
path = os.getcwd()
os.chdir(query_str)

data = pd.read_csv('Predicted_Emotions.csv', encoding='ISO-8859-1')
sie = SentimentIntensityAnalyzer()

# One VADER compound score and one label per row, taken from the first column.
sentiment_scores = []
sentiment_type = []
for tweet_text in data.iloc[:, 0]:
    # Missing tweets get a zero score, which is labelled 'Neutral' below.
    if pd.isnull(tweet_text):
        compound = 0.0
    else:
        compound = sie.polarity_scores(tweet_text)['compound']
    sentiment_scores.append(compound)

    if compound > 0.0:
        label = 'Positive'
    elif compound < 0.0:
        label = 'Negative'
    else:
        label = 'Neutral'
    sentiment_type.append(label)

data.loc[:, 'Sentiment'] = sentiment_scores
data.loc[:, 'Sentiment_Type'] = sentiment_type
data.to_csv('Final_Results.csv', index=None)
def write():
    """Render the Streamlit journaling page.

    The page shows a text area, classifies the entry as pessimistic /
    neutral / optimistic with the saved gradient-boosted model, shows a
    matching message when "Analysis" is pressed, and appends
    ``[score, sentence, date]`` to ``<username>.csv`` when "Save my score"
    is pressed.

    The prediction is only computed for entries longer than one character
    that contain at least one ".". When no prediction exists, the two
    buttons no longer fail with a NameError: "Analysis" adds nothing beyond
    the "Write more!" message already shown, and "Save my score" tells the
    user there is nothing to save.
    """
    import streamlit as st
    # datetime is imported so that the user's [entry, date] pair can be saved
    from datetime import datetime
    import csv
    import os
    import re

    import joblib
    import nltk as nltk
    import pandas as pd
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    from scipy import spatial
    from sentence_transformers import SentenceTransformer
    from transformers import pipeline

    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()

    @st.cache(allow_output_mutation=True)
    def load_my_model():
        """Load the sentence-embedding model (cached by Streamlit)."""
        return SentenceTransformer('distilbert-base-nli-mean-tokens')

    @st.cache(allow_output_mutation=True)
    def load_isear():
        """Load the ISEAR embeddings without their "index" column (cached)."""
        isear = pd.read_csv("isear_embed.csv")
        return isear.drop("index", axis=1)

    @st.cache
    def analysis(sentence):
        """Return ``[[rent, isear_feature, score, score2, hugscore]]`` for an entry."""
        model = load_my_model()
        # Split the entry into fragments on ., !, ?, ; and newlines.
        a = re.split("[.!?;\n]", sentence)

        # VADER compound score of the whole entry, replaced by the score of
        # the last two fragments when there are more than two.
        score = sid.polarity_scores(sentence)['compound']
        if len(a) > 2:
            score = sid.polarity_scores(a[-2] + ". " + a[-1])['compound']
        # Second VADER feature: last three fragments, 0 for shorter entries.
        if len(a) > 3:
            score2 = sid.polarity_scores(
                a[-3] + ". " + a[-2] + ". " + a[-1])['compound']
        else:
            score2 = 0

        sentence_embeddings = pd.read_csv("EHS.csv").values.tolist()
        optimistic_embeddings = pd.read_csv("OPTO.csv").values.tolist()
        isear_list = load_isear().values.tolist()
        a_embeddings = model.encode(a)

        def count_similar(reference_rows, inclusive=False):
            """Count fragments whose cosine similarity to some reference row
            is above .8 (or at least .8 when ``inclusive``)."""
            hits = 0
            for embedding in a_embeddings:
                for row in reference_rows:
                    similarity = 1 - spatial.distance.cosine(row, embedding)
                    matched = (similarity >= .8) if inclusive else (similarity > .8)
                    if matched:
                        hits += 1
                        break  # one match per fragment is enough
            return hits

        # +1 per fragment close to OPTO.csv, -1 per fragment close to EHS.csv,
        # averaged over the number of fragments.
        booleon = (count_similar(optimistic_embeddings)
                   - count_similar(sentence_embeddings))
        rent = booleon / len(a_embeddings)
        isear_feature = -count_similar(isear_list, inclusive=True)

        # Signed classifier confidence averaged over fragments; only computed
        # for entries with more than three fragments, so the pipeline is only
        # built when it is actually used.
        hugscore = 0
        if len(a) > 3:
            classifier = pipeline('sentiment-analysis')
            for fragment in a:
                prediction = classifier(fragment)[0]
                if "POS" in prediction["label"]:
                    hugscore = hugscore + prediction["score"]
                if "NEG" in prediction["label"]:
                    hugscore = hugscore - prediction["score"]
            hugscore = hugscore / len(a)
        hugscore = float(hugscore)

        return [[rent, isear_feature, score, score2, hugscore]]

    sentence = st.text_area("what's on your mind?")

    # The prediction is computed here and not inside the "Analysis" button
    # branch because variables set inside a button branch are not available
    # once another button is pressed.
    prediction = None
    score = None
    if len(sentence) > 1:
        if sentence.count(".") == 0:
            st.write("Write more!")
        else:
            df = pd.DataFrame(analysis(sentence))
            df.columns = [
                "rent", "isear_feature", "score", "score3", "hugscore"
            ]
            loaded_model = joblib.load("GradientBoostedClassifier90CV.sav")
            prediction = loaded_model.predict(df)[0]
            score = {
                0: "pessimistic",
                1: "neutral",
                2: "optimistic",
            }.get(prediction)

    if st.button('Analysis'):
        # the model does poorly on samples less than 5 sentences
        if len(sentence) > 1:
            if sentence.count(".") + sentence.count("!") + sentence.count(
                    "?") < 5:
                st.write(
                    "I'm not smart enough to analyze this without more sentences :("
                )
                st.markdown(
                    "![Alt Text](https://media1.tenor.com/images/cedbc086995947a3e2c239f13a3661b4/tenor.gif?itemid=11992490)"
                )
            elif sentence.count("..") + sentence.count("!!") > 2:
                st.write(
                    "I can't analyze entries right now that have abnormal punctuation. Feel free to change your punctuation and try again."
                )
                st.markdown(
                    "![Alt Text](https://media1.tenor.com/images/59f338103063f0c10ee1163d48b4dd14/tenor.gif?itemid=17312786)"
                )
            elif score is not None:
                st.write("you're feeling : " + score)
                if score == "pessimistic":
                    st.write("That's fine. Let it all out.")
                    st.markdown(
                        "![Alt Text](https://media.tenor.com/images/ff4a60a02557236c910f864611271df2/tenor.gif)"
                    )
                    st.write(
                        "Check out the resources tab to see how you can 'learn' optimism"
                    )
                    st.markdown(
                        "[Click here if you need extra help](https://suicidepreventionlifeline.org/chat/)"
                    )
                if score == "neutral":
                    st.write(
                        "You're just chilling. Waiting on some stuff to play out. It be like that sometimes."
                    )
                    st.markdown(
                        "![Alt Text](https://media1.tenor.com/images/0fbf51f99bccd97a825d11cb4487ce85/tenor.gif?itemid=11015213)"
                    )
                if score == "optimistic":
                    st.write("You are a ray of sunshine today! Keep it up!")
                    st.markdown(
                        "![Alt Text](https://media.tenor.com/images/2aa9b6f3a7d832c2ff1c1a406d5eae73/tenor.gif)"
                    )

    username = st.text_input(
        "Username (required for you to save your score & see your day-to-day changes): "
    )
    today = datetime.now()
    # st.text_input has to live outside the button branch: widgets created
    # inside a button branch behave differently from those outside it.
    if st.button('Save my score'):
        if prediction is None:
            st.write("There is no score to save yet - write an entry first.")
        elif not username.strip() or os.path.basename(username) != username:
            # The username becomes a file name, so it must not be empty or
            # contain path separators.
            st.write("Please enter a username without slashes to save your score.")
        else:
            filename = username + ".csv"
            needs_header = not os.path.isfile(filename)
            # newline='' is what the csv module expects for files it writes.
            with open(filename, 'a', newline='') as f:
                writer = csv.writer(f)
                if needs_header:
                    writer.writerow(["score", "sentence", "date"])
                writer.writerow([prediction, sentence, today])
def main(): spark, sc = init_spark() # Read US.Metadata Json File ps = sc.wholeTextFiles(r"reference\us_state_meta_latest.json").values().map(json.loads) broadcastStates.append(spark.sparkContext.broadcast(ps.map(lambda x: x).collect()).value) # print(broadcastStates) # print(ps.map(lambda x: x).collect()) # for i in broadcastStates: # for j in i: # print(j) # Read Geo True Json file geo_true_df = spark.read.json("data\geo_true.json") # print(geo_true_df.printSchema()) # print(geo_true_df.show(truncate=False)) # Read Geo True Json file # geo_false_df = spark.read.json("data\geo_false.json") #.repartition(100) # print(geo_false_df.printSchema()) # print(geo_false_df.show(truncate=False)) geo_true_df = geo_true_df.filter(geo_true_df.country_code == 'US') geo_true_df = geo_true_df.drop('_id', 'coordinates', 'country_code').withColumnRenamed("city_state", "location") geo_true_df = geo_true_df.withColumn("new_location", F.lower(F.col("location"))) geo_true_df = geo_true_df.withColumn('new_location', regexp_replace('new_location', '^[a-zA-Z\']+', ' ')) # print(geo_true_df.show(truncate=False)) # print(geo_true_df.count()) # Register UDF geo_udf = udf(lambda x: get_new_us_code(x), StringType()) geo_true_df = geo_true_df.withColumn('new_location', geo_udf('new_location')) # print(geo_true_df.show(truncate=False)) """geo_false_df = geo_false_df.drop('lang', '_id') print(geo_false_df.show(truncate=False)) print(geo_false_df.count()) geo_false_df = geo_false_df.withColumn("new_location", F.lower(F.col("location"))) geo_false_df = geo_false_df.withColumn('new_location', regexp_replace('new_location', '^[a-zA-Z\']+', ' ')) geo_false_df = geo_false_df.withColumn('new_location', geo_udf('new_location')) print(geo_false_df.show(truncate=False)) print(geo_false_df.count()) df = unionAll(geo_true_df, geo_false_df).distinct().show() print(df.show(truncate=False))""" # tweets = df tweets = geo_true_df # print("Total Tweets \t\t\t\t: ", tweets.count()) # joeBiden tweets excluding trump 
joe_only = tweets.filter( (tweets['text'].rlike("[Jj]oe|[Bb]iden") == True) & (tweets['text'].rlike("[Dd]onald|[Tt]rump") == False)) # print("Only Joe Biden Tweets \t\t\t: ", joe_only.count()) trump_only = tweets.filter( (tweets['text'].rlike("[Jj]oe|[Bb]iden") == False) & (tweets['text'].rlike("[Dd]onald|[Tt]rump") == True)) # print("Only Donald Trump Tweets \t\t: ", trump_only.count()) joe_and_trump = tweets.filter( (tweets['text'].rlike("[Dd]onald|[Tt]rump")) & (tweets['text'].rlike("[Jj]oe|[Bb]iden"))) # print("Both Joe_Biden & Trump Tweets \t\t: ", joe_and_trump.count()) not_joe_trump = tweets.filter( ~(tweets['text'].rlike("[Dd]onald|[Tt]rump")) & ~(tweets['text'].rlike("[Jj]oe|[Bb]iden"))) # print("Tweets without Joe_Biden & Trump \t: ", not_joe_trump.count()) sid = SentimentIntensityAnalyzer() udf_priority_score = udf(lambda x: sid.polarity_scores(x), returnType=FloatType()) # Define UDF function udf_compound_score = udf(lambda score_dict: score_dict['compound']) udf_comp_score = udf(lambda c: 'pos' if c >= 0.05 else ('neu' if (c > -0.05 and c < 0.05) else 'neg')) trump_only = trump_only.withColumn('scores', udf_priority_score(trump_only['text'])) trump_only = trump_only.withColumn('compound', udf_compound_score(trump_only['scores'])) trump_only = trump_only.withColumn('comp_score', udf_comp_score(trump_only['compound'])) joe_only = joe_only.withColumn('scores', udf_priority_score(joe_only['text'])) joe_only = joe_only.withColumn('compound', udf_compound_score(joe_only['scores'])) joe_only = joe_only.withColumn('comp_score', udf_comp_score(joe_only['compound'])) # print(trump_only.show(truncate=False)) # print(joe_only.show(truncate=False)) joe_pos_only = joe_only[joe_only.comp_score == 'pos'] joe_neg_only = joe_only[joe_only.comp_score == 'neg'] joe_neu_only = joe_only[joe_only.comp_score == 'neu'] trump_pos_only = trump_only[trump_only.comp_score == 'pos'] trump_neg_only = trump_only[trump_only.comp_score == 'neg'] trump_neu_only = 
trump_only[trump_only.comp_score == 'neu'] # print("Total Trump Tweets \t\t: ", trump_only.count()) # print("Positive Trump Tweets \t\t: ", trump_pos_only.count()) # print("Negative Trump Tweets \t\t: ", trump_neg_only.count()) # print("Neutral Trump Tweets \t\t: ", trump_neu_only.count()) # print("Total Biden Tweets \t\t: ", joe_only.count()) # print("Positive Biden Tweets \t\t: ", joe_pos_only.count()) # print("Negative Biden Tweets \t\t: ", joe_neg_only.count()) # print("Neutral Biden Tweets \t\t: ", joe_neu_only.count()) joe_pos_neg_only = joe_only.filter(joe_only['comp_score'] != 'neu') trump_pso_neg_only = trump_only.filter(trump_only['comp_score'] != 'neu') # print("Total Trump Pos & Neg Tweets Only \t\t: ", trump_pso_neg_only.count()) # print("Total Biden Pos & Neg Tweets Only \t\t: ", joe_pos_neg_only.count()) dt1 = joe_only.groupBy(F.col('location')).agg(F.count('location').alias('joe_total')) dt2 = joe_pos_only.groupBy(F.col('location')).agg(F.count('location').alias('joe_pos')) dt3 = joe_neg_only.groupBy(F.col('location')).agg(F.count('location').alias('joe_neg')) dt4 = trump_only.groupBy(F.col('location')).agg(F.count('location').alias('trump_total')) dt5 = trump_pos_only.groupBy(F.col('location')).agg(F.count('location').alias('trump_pos')) dt6 = trump_neg_only.groupBy(F.col('location')).agg(F.count('location').alias('trump_neg')) # print(dt1.show(truncate=False)) # print(dt2.show(truncate=False)) # print(dt3.show(truncate=False)) # print(dt4.show(truncate=False)) # print(dt5.show(truncate=False)) # print(dt6.show(truncate=False)) # print(dt1.count()) # print(dt2.count()) dfs = [dt1, dt2, dt3, dt4, dt5, dt6] df_final = reduce(lambda left, right: DataFrame.join(left, right, on='location'), dfs) df_final = df_final.sort(F.col('joe_total').asc()) # print(df_final.show(truncate=False)) df_per = df_final df_per = df_per.withColumn('Joe Pos %', ((df_final['joe_pos'] / df_final['joe_total']) * 100)) df_per = df_per.withColumn('Joe Neg %', 
((df_final['joe_neg'] / df_final['joe_total']) * 100)) df_per = df_per.withColumn('Trump Pos %', ((df_final['trump_pos'] / df_final['trump_total']) * 100)) df_per = df_per.withColumn('Trump Neg %', ((df_final['trump_neg'] / df_final['trump_total']) * 100)) df_per = df_per.withColumn("prediction", when((df_per['Joe Pos %'] > df_per['Trump Pos %']) , "Biden"). when((df_per['Joe Pos %'] < df_per['Trump Pos %']) , "Trump").otherwise('Both')) print(df_per.show(truncate=False)) # write to pickle file # df_per.rdd.saveAsPickleFile('final_prediction_df.pkl') # Read from pickle file """for obj in sparkpickle.load_gen("final_prediction_df.pkl"):
print("Reading our data...")
df = pd.read_csv("titles_and_imdb-id.csv")
df = df.drop(["Unnamed: 0"], axis=1)
reviews = df["First Review"]
reviews_list = list(reviews)

# # Sentimental Analysis from reviews (IMDb)
print("Let's see how Natural Language Processing can help us!")
# Build the analyzer once instead of once per review; the scores are the same.
vader = SentimentIntensityAnalyzer()
list_of_results = [vader.polarity_scores(review) for review in reviews_list]

# Let's create a new dataframe with the sentimental analysis information:
# one row per review, one column per VADER score.
sentiment_analyzer = pd.DataFrame.from_dict(list_of_results)
sentiment_analyzer = sentiment_analyzer.rename(columns={
    "compound": "Compound",
    "neg": "Negative",
    "neu": "Neutral",
    "pos": "Positive"
})

# # Concat both DataFrames
# predict on test set; after prediction, test set will have with 3 cols: text, truth, pred df_test = logistic_regressor.predict_file(training_text_file) print(df_test) training_logger.info('prediction done by logistic_regressor') # to calculate & print the accuracy & F1 score on test set print("Logistic Regression model performance:") logistic_regressor.print_performance(df_test) # to save the pipeline as model model = logistic_regressor.pipeline else: """ load Vader sentiment model is no training text file provided """ model = SentimentIntensityAnalyzer() training_logger.info('model: ' + str(model)) # save model files to disk for app.py to load save_model_files('sentiment_model_pickle', # filename model, # model type(model), # model_type 'sentiment-analysis', # model_name str(strftime('%Y%m%d-%H%M%S', localtime())), # model_version 'train', # train_pred training_logger) # logger, not for saving
def get_distribution():
    """Plot the accuracy distribution of the selected model over random splits.

    Loads ``../data/comments.csv`` through ``extract_csv``, then for each of
    ``TRIALS`` trials shuffles the data, holds out the first
    ``len(data) // VAL_SPLIT`` rows for validation, trains the model named by
    ``MODEL_CHOICE`` ("LOGREG", "NAIVE" or "OFFSHELF") on the rest and records
    the validation accuracy. The histogram of accuracies, with the mean marked
    by a dashed red line, is saved to ``<model_choice>_acc_dist.png``.

    Raises:
        ValueError: if ``MODEL_CHOICE`` is not one of the three known names.
    """
    all_data, all_labels = extract_csv('../data/comments.csv', LABELS, CATEGORY)
    all_data = np.asarray([[x] for x in all_data], dtype="S1000")
    all_labels = np.asarray(all_labels)

    num_trials = TRIALS
    val_size = len(all_data) // VAL_SPLIT
    accuracies = []

    for trial in range(num_trials):
        if (trial % 10) == 0:
            print("Trial:", trial)

        # Apply the same random permutation to messages and labels.
        randomization_scheme = np.arange(len(all_data))
        np.random.shuffle(randomization_scheme)
        randomized_data = all_data[randomization_scheme]
        randomized_labels = all_labels[randomization_scheme]

        train_messages = randomized_data[val_size:]
        train_labels = randomized_labels[val_size:]
        val_messages = randomized_data[:val_size]
        val_labels = randomized_labels[:val_size]

        dictionary = create_dictionary(train_messages)
        train_matrix = transform_text(train_messages, dictionary)
        val_matrix = transform_text(val_messages, dictionary)

        # Strings are compared with ==; `is` tests object identity and only
        # matches equal strings when the interpreter happens to share them.
        if MODEL_CHOICE == "LOGREG":
            logreg = LogisticRegression()
            logreg.fit(train_matrix, train_labels)
            predictions = logreg.predict(val_matrix)
        elif MODEL_CHOICE == "NAIVE":
            naive_bayes_model = fit_naive_bayes_model(train_matrix,
                                                      train_labels, LABELS)
            predictions = predict_from_naive_bayes_model(
                naive_bayes_model, val_matrix)
        elif MODEL_CHOICE == "OFFSHELF":
            sid = SentimentIntensityAnalyzer()
            # Validation messages are byte strings; decode before scoring.
            converted = [x[0].decode('utf-8') for x in val_messages]
            predictions = predict_from_off_shelf_model(sid, converted)
        else:
            # Without this, no accuracy is recorded and plt.hist fails later
            # because the weights no longer match the data.
            raise ValueError("Unknown MODEL_CHOICE: %r" % (MODEL_CHOICE,))

        accuracies.append(np.mean(predictions == val_labels))

    plt.figure()
    # Equal weights of 1/num_trials turn the counts into fractions of trials.
    plt.hist(accuracies,
             bins=BINS,
             label='data',
             weights=np.ones(num_trials) / num_trials)
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
    plt.title("Accuracy Distribution for " + MODEL_CHOICE[0] +
              MODEL_CHOICE[1:].lower() + " Model")
    plt.xlabel("Accuracy")
    plt.ylabel("Percentage")
    accuracies = np.asarray(accuracies)
    plt.axvline(x=np.mean(accuracies),
                color='red',
                linestyle='--',
                label='mean')
    plt.savefig(MODEL_CHOICE.lower() + "_acc_dist.png")
def get(self, request, format=None):
    """Return stored Reddit posts for a keyword, with comment sentiment, as JSON.

    Query parameters:
        keyword: the stored keyword, or a value starting with ``http`` from
            which the keyword is taken: the second-to-last ``/``-separated
            segment, or the last one when that segment is a single character.
        start_date, end_date: only rows with
            ``start_date < 'Date Time' <= end_date`` are returned.

    Returns:
        ``Response({"data": [...]})`` with one dict per selected row, or a
        text ``Response`` when ``redditdata`` has no row for the keyword
        (this now also covers a missing or empty ``keyword`` parameter, which
        used to raise while the value was being indexed).
    """
    all_keys = request.query_params.get('keyword', None)
    start_date = request.query_params.get('start_date', None)
    end_date = request.query_params.get('end_date', None)

    if all_keys and all_keys.startswith('http'):
        all_d = all_keys.split('/')
        if len(all_d) >= 2:
            all_keys = all_d[-2]
            if len(all_keys) == 1:
                all_keys = all_d[-1]
    print(all_keys)

    if not redditdata.objects.filter(keyword=all_keys).exists():
        print("This keyword is not exists in database please run post api")
        return Response(
            "This keyword(url) is not exists in database please run post api"
        )

    # NOTE(review): API credentials are hard-coded here; they should be moved
    # to configuration and rotated.
    reddit = praw.Reddit(client_id='GZ4wXpp55Rzjqw',
                         client_secret='nusWMTnlf0nLWOHWDcFcFi1RXQY',
                         user_agent='data')
    print("fffsafasfasfsafasfasfsfasdsdasdasdasdasdsada", all_keys)
    data = redditdata.objects.filter(keyword=all_keys)
    print(data)

    # NOTE(review): these module-level containers are only appended to in this
    # method, so rows from earlier requests appear to stay in them — confirm
    # whether they are reset elsewhere.
    global title, score, url, created, id_id, all_date, body
    for i in data:
        print("jai", i.score)
        print(i.keyword)
        print(i.all_date)
        print(i.body)
        title.append(i.title)
        score.append(i.score)
        url.append(i.url)
        created.append(i.created)
        id_id.append(i.id_id)
        all_date.append(i.all_date)
        body.append(i.body)
    print("scorescorescorescorescorescorescore", score, len(score),
          len(url), len(created), len(id_id), len(body))

    df = pd.DataFrame({
        'Title': title,
        'Score': score,
        'id': id_id,
        'Url': url,
        'Created': created,
        'Date Time': all_date,
        'Body': body
    })
    print(df)

    # Top-level comment bodies per submission. Entries without a ``body``
    # attribute (PRAW's "load more comments" placeholders) are skipped instead
    # of raising AttributeError.
    all_comments = []
    for ids in df.id:
        all_comments.append([
            top_level_comment.body
            for top_level_comment in reddit.submission(id=ids).comments
            if hasattr(top_level_comment, 'body')
        ])

    sid = SentimentIntensityAnalyzer()
    final_sentiments_list = []
    for each_ in all_comments:
        sentiments_list = []
        for every_comment in each_:
            polarity_dict = sid.polarity_scores(every_comment)
            negative = polarity_dict['neg']
            positive = polarity_dict['pos']
            neutral = polarity_dict['neu']
            if negative > positive and negative > neutral:
                sentiments_list.append('negative')
            elif positive > negative and positive > neutral:
                sentiments_list.append('positive')
            elif neutral > positive or positive == negative:
                sentiments_list.append('neutral')
            # Otherwise (positive ties with neutral above negative) no label
            # is recorded for this comment, as before.
        final_sentiments_list.append(sentiments_list)

    df['Comments'] = all_comments
    # The post's sentiment is the second recorded label, or "neutral" when
    # fewer than two labels exist.
    # NOTE(review): index 1 rather than 0 looks deliberate but is unexplained.
    df['Sentiments'] = [
        labels[1] if len(labels) > 1 else "neutral"
        for labels in final_sentiments_list
    ]

    in_range = (df['Date Time'] > start_date) & (df['Date Time'] <= end_date)
    df2 = df.loc[in_range]
    print(df2)

    # The date column is named 'Date Time' in the frame; reading
    # df2["all_date"] raised KeyError. The output key stays "all_date".
    li = []
    for Title, Score, i, Url, Cre, Body, date_time, Comments, Sentiments in zip(
            df2["Title"], df2["Score"], df2["id"], df2["Url"],
            df2["Created"], df2["Body"], df2["Date Time"],
            df2["Comments"], df2["Sentiments"]):
        li.append({
            "Title": Title,
            "Score": Score,
            "id": i,
            "Url": Url,
            "Created": Cre,
            "Body": Body,
            "all_date": date_time,
            "Comments": Comments,
            "Sentiments": Sentiments
        })
    return Response({"data": li})
def compute_nltk_polarity(msg_body):
    """Return the VADER polarity scores for ``msg_body``.

    The bundled ``./nltk_data.zip/nltk_data`` directory is added to NLTK's
    data search path before the analyzer is created. It is added only once,
    so repeated calls no longer grow ``nltk.data.path`` with duplicates.

    Returns:
        The dict produced by ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    bundled_data_path = "./nltk_data.zip/nltk_data"
    if bundled_data_path not in nltk.data.path:
        nltk.data.path.append(bundled_data_path)
    sid = SentimentIntensityAnalyzer()
    return sid.polarity_scores(msg_body)
def sentiment(sentence):
    """Return the VADER ``compound`` score of ``sentence``."""
    polarity = SentimentIntensityAnalyzer().polarity_scores(sentence)
    return polarity['compound']
def get_sentiment(sentences: list):
    """Return the VADER compound score of the joined sentences.

    The sentences are joined with ``'. '`` into one text, scored once, and
    the ``compound`` value is rounded to four decimal places.
    """
    analyzer = SentimentIntensityAnalyzer()
    combined_text = '. '.join(sentences)
    compound = analyzer.polarity_scores(combined_text)['compound']
    return round(compound, 4)
def open_spider(self, spider):
    """Create ``self.analyzer`` and extend its lexicon.

    The analyzer's lexicon is updated with ``lm_positive`` first and
    ``lm_negative`` second, so entries from the latter win on shared keys.
    ``spider`` is not used here.
    """
    self.analyzer = SentimentIntensityAnalyzer()
    lexicon = self.analyzer.lexicon
    for extra_entries in (lm_positive, lm_negative):
        lexicon.update(extra_entries)
def get(self, request, format=None):
    """Return a customer's stored Reddit posts, with comment sentiment, as CSV.

    Query parameters:
        id: primary key of the ``custmers`` row; its ``name`` is the keyword,
            or a value starting with ``http`` from which the keyword is
            taken: the second-to-last ``/``-separated segment, or the last
            one when that segment is a single character.
        start_date, end_date: optional; when absent the customer's stored
            ``start_date`` / ``end_date`` are used. Previously the stored
            values were read and then always overwritten by the query
            parameters, even when those were missing.

    Returns:
        An ``HttpResponse`` CSV attachment named ``reddit_data.csv`` holding a
        header row plus one row per post with
        ``start_date < 'Date Time' <= end_date``, or a text ``Response`` when
        ``redditdata`` has no row for the keyword.
    """
    customer_id = request.query_params.get('id', None)
    data = custmers.objects.get(id=customer_id)
    all_keys = data.name
    start_date = request.query_params.get('start_date', data.start_date)
    end_date = request.query_params.get('end_date', data.end_date)

    if all_keys and all_keys.startswith('http'):
        all_d = all_keys.split('/')
        if len(all_d) >= 2:
            all_keys = all_d[-2]
            if len(all_keys) == 1:
                all_keys = all_d[-1]
    print(all_keys)

    if not redditdata.objects.filter(keyword=all_keys).exists():
        print("This keyword is not exists in database please run post api")
        return Response(
            "This keyword(url) is not exists in database please run post api"
        )

    # NOTE(review): API credentials are hard-coded here; they should be moved
    # to configuration and rotated.
    reddit = praw.Reddit(client_id='GZ4wXpp55Rzjqw',
                         client_secret='nusWMTnlf0nLWOHWDcFcFi1RXQY',
                         user_agent='data')
    print("fffsafasfasfsafasfasfsfasdsdasdasdasdasdsada", all_keys)
    data = redditdata.objects.filter(keyword=all_keys)
    print(data)

    # NOTE(review): these module-level containers are only appended to in this
    # method, so rows from earlier requests appear to stay in them — confirm
    # whether they are reset elsewhere.
    global title, score, url, created, id_id, all_date, body
    for i in data:
        print("jai", i.score)
        print(i.keyword)
        print(i.all_date)
        print(i.body)
        title.append(i.title)
        score.append(i.score)
        url.append(i.url)
        created.append(i.created)
        id_id.append(i.id_id)
        all_date.append(i.all_date)
        body.append(i.body)
    print("scorescorescorescorescorescorescore", score, len(score),
          len(url), len(created), len(id_id), len(body))

    df = pd.DataFrame({
        'Title': title,
        'Score': score,
        'id': id_id,
        'Url': url,
        'Created': created,
        'Date Time': all_date,
        'Body': body
    })
    print(df)

    # Top-level comment bodies per submission. Entries without a ``body``
    # attribute (PRAW's "load more comments" placeholders) are skipped instead
    # of raising AttributeError.
    all_comments = []
    for ids in df.id:
        all_comments.append([
            top_level_comment.body
            for top_level_comment in reddit.submission(id=ids).comments
            if hasattr(top_level_comment, 'body')
        ])

    sid = SentimentIntensityAnalyzer()
    final_sentiments_list = []
    for each_ in all_comments:
        sentiments_list = []
        for every_comment in each_:
            polarity_dict = sid.polarity_scores(every_comment)
            negative = polarity_dict['neg']
            positive = polarity_dict['pos']
            neutral = polarity_dict['neu']
            if negative > positive and negative > neutral:
                sentiments_list.append('negative')
            elif positive > negative and positive > neutral:
                sentiments_list.append('positive')
            elif neutral > positive or positive == negative:
                sentiments_list.append('neutral')
            # Otherwise (positive ties with neutral above negative) no label
            # is recorded for this comment, as before.
        final_sentiments_list.append(sentiments_list)

    df['Comments'] = all_comments
    # The post's sentiment is the second recorded label, or "neutral" when
    # fewer than two labels exist.
    # NOTE(review): index 1 rather than 0 looks deliberate but is unexplained.
    df['Sentiments'] = [
        labels[1] if len(labels) > 1 else "neutral"
        for labels in final_sentiments_list
    ]

    in_range = (df['Date Time'] > start_date) & (df['Date Time'] <= end_date)
    df2 = df.loc[in_range]
    print(df2)

    # Header row first, then one row per selected post in the same order.
    li = [[
        "Title", "Score", "id", "Url", "Created", "Body", "Date Time",
        "Comments", "Sentiments"
    ]]
    for row_values in zip(df2["Title"], df2["Score"], df2["id"], df2["Url"],
                          df2["Created"], df2["Body"], df2["Date Time"],
                          df2["Comments"], df2["Sentiments"]):
        li.append(list(row_values))
    print(len(li), "cccccccccccccc111111111111111111111111111111111111111")

    filename = "reddit_data.csv"
    response = HttpResponse(content_type='text/csv')
    response[
        'Content-Disposition'] = 'attachment; filename="{}"'.format(filename)
    csv.writer(response).writerows(li)
    return response
def __init__(self, text):
    """Keep ``text`` on the instance and create its sentiment analyzer.

    Sets ``self.text`` to the given value and ``self.ytSenti`` to a new
    ``SentimentIntensityAnalyzer``.
    """
    self.text = text
    analyzer = SentimentIntensityAnalyzer()
    self.ytSenti = analyzer
def textMinin(self):
    """Build text features for a review CSV and report basic statistics.

    The CSV chosen through ``p1.onOpen(self)`` must have ``review`` and
    ``recomend`` columns. The method adds a bad-review flag, a cleaned text
    column, VADER scores, character/word counts, Doc2Vec vectors and TF-IDF
    columns to the frame, draws a word cloud of the raw reviews on
    ``self.canvas`` and writes the good/bad distribution and the ten most
    positive reviews to ``wyniki recenzji.txt`` (also printed).
    """
    plt.clf()
    plik = p1.onOpen(self)
    review = pd.read_csv(plik)

    # flag negative reviews: 1 when "recomend" is "Not Recommended", else 0
    review["is_bad_review"] = review["recomend"].apply(
        lambda x: 1 if x == "Not Recommended" else 0)
    # keep only the needed columns (copy so later column assignments do not
    # write into a slice of the original frame)
    review = review[["review", "is_bad_review"]].copy()
    # make sure the "review" column holds strings
    review['review'] = review['review'].astype(str)

    # text cleaning helpers
    def get_wordnet_pos(tag):
        """Map a POS tag to the matching WordNet constant (noun by default)."""
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    # Built once instead of per review / per token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def clean_text(text):
        """Lower-case, tokenize, filter and lemmatize one review."""
        # lower case
        text = text.lower()
        # tokenize and strip punctuation
        tokens = [word.strip(string.punctuation) for word in text.split(" ")]
        # drop words that contain digits
        tokens = [
            word for word in tokens if not any(c.isdigit() for c in word)
        ]
        # drop stop words ('the', 'a', 'this')
        tokens = [x for x in tokens if x not in stop_words]
        # drop empty tokens
        tokens = [t for t in tokens if len(t) > 0]
        # tag each word with its part of speech (noun, adjective, ...)
        pos_tags = pos_tag(tokens)
        # lemmatize (inflected form to base form, when one exists)
        tokens = [
            lemmatizer.lemmatize(t[0], get_wordnet_pos(t[1]))
            for t in pos_tags
        ]
        # drop one-letter words
        tokens = [t for t in tokens if len(t) > 1]
        # join back into a single string
        return " ".join(tokens)

    review["review_clean"] = review["review"].apply(clean_text)

    # VADER scores of the raw review text, one column per score
    sid = SentimentIntensityAnalyzer()
    review["sentiments"] = review["review"].apply(
        lambda x: sid.polarity_scores(x))
    review = pd.concat([
        review.drop(['sentiments'], axis=1), review['sentiments'].apply(
            pd.Series)
    ],
                       axis=1)

    # number of characters
    review["nb_chars"] = review["review"].apply(len)
    # number of words
    review["nb_words"] = review["review"].apply(
        lambda x: len(x.split(" ")))

    # vector representation of each review
    documents = [
        TaggedDocument(doc, [i]) for i, doc in enumerate(
            review["review_clean"].apply(lambda x: x.split(" ")))
    ]
    # train Doc2Vec
    model = Doc2Vec(documents,
                    vector_size=5,
                    window=2,
                    min_count=1,
                    workers=4)
    # infer one vector per cleaned review
    doc2vec_df = review["review_clean"].apply(
        lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
    doc2vec_df.columns = [
        "doc2vec_vector_" + str(x) for x in doc2vec_df.columns
    ]
    review = pd.concat([review, doc2vec_df], axis=1)

    # add a TF-IDF column for every word kept by the vectorizer
    tfidf = TfidfVectorizer(min_df=10)
    tfidf_result = tfidf.fit_transform(review["review_clean"]).toarray()
    # get_feature_names() was removed from recent scikit-learn releases;
    # fall back to it only when get_feature_names_out() is unavailable.
    if hasattr(tfidf, "get_feature_names_out"):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()
    tfidf_df = pd.DataFrame(
        tfidf_result, columns=["word_" + str(x) for x in feature_names])
    tfidf_df.index = review.index
    review = pd.concat([review, tfidf_df], axis=1)

    # share of good vs. bad reviews
    distribution = review["is_bad_review"].value_counts(normalize=True)
    print('dystrybucja dobrych do zlych recenzji:')
    print(distribution)
    with open("wyniki recenzji.txt", "w") as f:
        f.write('dystrybucja dobrych do zlych recenzji:\n')
        f.write(str(distribution))

    def show_wordcloud(data, title=None):
        """Draw a word cloud of ``data`` on the widget's canvas."""
        wordcloud = WordCloud(background_color='white',
                              max_words=200,
                              max_font_size=40,
                              scale=3,
                              random_state=42).generate(str(data))
        fig = plt.figure(1, figsize=(20, 20))
        plt.axis('off')
        if title:
            fig.suptitle(title, fontsize=20)
            fig.subplots_adjust(top=2.3)
        plt.imshow(wordcloud)
        self.canvas.draw()

    show_wordcloud(review["review"])

    # the 10 most positive reviews among those with at least 5 words
    top_positive = review[review["nb_words"] >= 5].sort_values(
        "pos", ascending=False)[["review", "pos"]].head(10)
    print('wypisanie 10 najbardziej pozytywnych recenzji:')
    print(top_positive)
    with open("wyniki recenzji.txt", "a") as f:
        f.write('\nwypisanie 10 najbardziej pozytywnych recenzji:\n')
        f.write(str(top_positive))
def sentiment_analysis(text, comma_threshold=1, interval_threshold=15):
    """Label every sentence of an English or French text.

    The text is first passed through ``check_comma`` and ``text_preprocess``,
    then its language is detected.

    * English ("en"): a sentence matching any ``suggestion_en`` keyword is
      labelled "SUGGESTION"; otherwise it is labelled NEGATIVE / POSITIVE /
      NEUTRAL from its VADER scores, with the scores appended to the label.
      The suggestion check used to be reset to False right after each match,
      so the "SUGGESTION" label could never be produced for English; it now
      works like the French branch.
    * French ("fr"): a ``suggestion_fr`` match gives "SUGGESTION", a
      ``neg_fr`` match gives "NEGATIVE" (and overrides "SUGGESTION");
      otherwise the label comes from the pattern analyzer's polarity and
      subjectivity.

    Returns:
        dict mapping each sentence to its label string;
        ``{"Message": "MEANINGLESS"}`` (plus any labels already collected)
        when detection or scoring raises; ``None`` for any other language.
    """
    result = {}
    text = check_comma(text, comma_threshold, interval_threshold)
    text = text_preprocess(text)
    try:
        language = language_detect(text)

        if language == "en":
            sentences = sent_tokenize(text)
            analyzer = SentimentIntensityAnalyzer()
            for sentence in sentences:
                if any(
                        keyword_match(keyword, sentence)
                        for keyword in suggestion_en):
                    result[sentence] = "SUGGESTION"
                    continue
                score = analyzer.polarity_scores(sentence)
                if (score['neg'] > 0.09 and
                        score["compound"] < 0.1) or score["compound"] < -0.3:
                    label = "NEGATIVE"
                elif score["compound"] > 0.3 and score['pos'] > 0.17:
                    label = "POSITIVE"
                else:
                    label = "NEUTRAL"
                result[
                    sentence] = "%s, neg: %s, pos: %s, neu: %s, compound: %s" % (
                        label, score["neg"], score["pos"], score["neu"],
                        score["compound"])
            return result

        if language == "fr":
            sentences = sent_tokenize(text, language="french")
            analyzer = Blobber(pos_tagger=PatternTagger(),
                               analyzer=PatternAnalyzer())
            for sentence in sentences:
                is_suggestion = any(
                    keyword_match(keyword, sentence)
                    for keyword in suggestion_fr)
                if is_suggestion:
                    result[sentence] = "SUGGESTION"
                # A negative keyword takes precedence over a suggestion.
                is_negative = any(
                    keyword_match(keyword, sentence) for keyword in neg_fr)
                if is_negative:
                    result[sentence] = "NEGATIVE"
                if not is_suggestion and not is_negative:
                    # score[0] is read as the sentiment value and score[1]
                    # as the subjectivity, as the labels below state.
                    score = analyzer(sentence).sentiment
                    if (score[0] < 0 and score[1] > 0) or (score[0] < 0.1 and
                                                           score[1] > 0.25):
                        label = "NEGATIVE"
                    elif score[0] > 0.2:
                        label = "POSITIVE"
                    else:
                        label = "NEUTRAL"
                    result[sentence] = "%s sentiment: %s, subjectivity %s" % (
                        label, score[0], score[1])
            return result
    except Exception:
        # Any failure while detecting or scoring marks the text as unusable;
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        result["Message"] = "MEANINGLESS"
        return result
    # Languages other than English and French are not handled.
    return None
def score():
    """Score every song's lyrics with VADER and store LDA topics per song.

    Runs ``updateNulls``, fetches the rows returned by ``queryGrabSongs`` and,
    for each row that has lyrics (column 5):

    * writes the VADER compound score through ``updateSongVaders``;
    * fits an LDA model on the song's lines (one bag of words per line) and
      inserts each topic's seven words plus their VADER compound score
      through ``insertTopics``.

    Songs whose topic extraction fails are reported and skipped.  Progress is
    printed to stdout; nothing is returned.
    """
    cursor.execute(updateNulls)  # safety check to catch bad text scraping
    sid = SentimentIntensityAnalyzer()  # create sentiment analyzer object

    numTopics = 3  # how many topics would you like the model to find
    numWords = 7  # how many words would you like to view
    passes = 20  # how many times do you want to go over the data

    # Hardcoded words to ignore - created from analyzing previous trials.
    # Kept as a set so the per-word membership test is O(1).
    stopWords = {
        'the', 'like', 'ya', 'wanna', 'know', 'let', 'got', '4', 'yeah',
        'ooh', 'yo', 'went', 'ric', '2', 'need', 'seen', 'word', 'huh',
        'said', 'big', 'whatchu', 'el', 'gonna', 'cause', 'things', 'gon',
        'thing', 'letting', 'goes', 'tell', 'gave', 'great', '10', 'uh',
        '25', 'stuff', 'tho', 'gotta', '100', 'al', 'lot', 'bout', 'boi',
        'dem', 'oh', 'ooooahahh', '80', 'ig', 'ev', 'ayy', '85', 'vro',
        'ok', 'ha', 'tings', 'nah', 'em', 'wit', 'mi', '6', '21', 'la',
        'x2', 'ay', 'du', 'ba', 'im', 'ahhhh', '7', '12', 'yaaaaa', 'ee',
        'waaaaaaa', 'mmm', 'na', 'buh', 'ga', 'da', 'iii', '47', 'ol',
        'une', '0', '1', '2015',
    }

    cursor.execute(queryGrabSongs)
    data = cursor.fetchall()  # grab all of the song data with lyrics

    if data:
        # Column 0 of the last row is used as the total number of songs.
        percent = round(len(data) / data[-1][0] * 100, 1)
        print("Percent of total saved songs analyzed: ", str(percent) + '%')

    # For each song with lyrics, score it and update the vader column, then
    # grab the topics using an LDA model fitted on that song alone.
    for row in data:
        lyrics = row[5]
        if not lyrics:  # safety check to make sure there are lyrics
            continue
        rowId = row[0]

        ss = sid.polarity_scores(lyrics)
        cursor.execute(updateSongVaders, (ss.get('compound'), rowId))

        # One bag of words per lyric line.
        words = [[
            word for word in line.lower().split()
            if word not in STOPWORDS and word not in stopWords
            and word.isalnum()
        ] for line in lyrics.split('\n')]

        dictionary = corpora.Dictionary(words)
        corpus = [dictionary.doc2bow(text) for text in words]

        try:
            lda = LdaModel(corpus,
                           id2word=dictionary,
                           num_topics=numTopics,
                           passes=passes)
            for topic in lda.print_topics(num_words=numWords):
                # topic[1] looks like '0.1*"word" + 0.05*"other" + ...'
                wordList = [
                    term.split('*')[1].replace('"', '')
                    for term in topic[1].split('+')
                ]
                ss2 = sid.polarity_scores(' '.join(wordList))
                # Indexing raises IndexError when a topic has fewer than
                # seven words; that song is then reported and skipped below.
                cursor.execute(
                    insertTopics,
                    (row[2], wordList[0], wordList[1], wordList[2],
                     wordList[3], wordList[4], wordList[5], wordList[6],
                     ss2.get('compound')))
        except Exception:
            # Safety check so techno and EDM songs with overly repetitive
            # lyrics are not added.
            print('Bag of Words too small')

    print('-----DONE!-----')
    print(len(data), 'Scores and Topics Added')
def train_test():
    """Run a 10-fold evaluation of the model selected by ``MODEL_CHOICE``.

    Comments and labels are loaded from ``../data/comments.csv``, shuffled
    and split into folds with ``split_folds``.  For every fold the selected
    model ("LOGREG", "NAIVE" or "OFFSHELF") is evaluated on the validation
    part and its accuracy is printed.  On the first fold the model is also
    applied to the full data set.  Finally ``plot_frequent_words`` is called
    with the per-label aggregator.
    """
    all_data, all_labels = extract_csv('../data/comments.csv', LABELS,
                                       CATEGORY)
    all_data = np.asarray(all_data)
    all_labels = np.asarray(all_labels)

    # One dictionary per label to count top instances of words.
    aggregator = [dict() for _ in LABELS]

    randomization_scheme = np.arange(len(all_data))
    np.random.shuffle(randomization_scheme)
    randomized_data = all_data[randomization_scheme]
    randomized_labels = all_labels[randomization_scheme]

    num_trials = 10
    X_trains, y_trains, X_vals, y_vals = split_folds(
        randomized_data.reshape((len(all_data), 1)), randomized_labels,
        num_trials)

    all_predictions = None
    for trial in range(num_trials):
        train_messages = X_trains[trial]
        train_labels = y_trains[trial]
        val_messages = X_vals[trial]
        val_labels = y_vals[trial]

        dictionary = create_dictionary(train_messages)
        train_matrix = transform_text(train_messages, dictionary)
        val_matrix = transform_text(val_messages, dictionary)

        # Strings are compared with ``==``: identity (``is``) only matches
        # by accident of interning.
        if MODEL_CHOICE == "LOGREG":
            logreg = LogisticRegression()
            logreg.fit(train_matrix, train_labels)
            logistic_regresion_predictions = logreg.predict(val_matrix)
            logistic_regresion_accuracy = np.mean(
                logistic_regresion_predictions == val_labels)
            print(
                'Logistic Regression had an accuracy of {} on the testing set'.
                format(logistic_regresion_accuracy))
        elif MODEL_CHOICE == "NAIVE":
            naive_bayes_model = fit_naive_bayes_model(train_matrix,
                                                      train_labels, LABELS)
            naive_bayes_predictions = predict_from_naive_bayes_model(
                naive_bayes_model, val_matrix)
            naive_bayes_accuracy = np.mean(
                naive_bayes_predictions == val_labels)
            print(
                'Naive Bayes had an accuracy of {} on the testing set'.format(
                    naive_bayes_accuracy))
            get_top_five_logistic_regresion_words(naive_bayes_model,
                                                  dictionary, aggregator)
        elif MODEL_CHOICE == "OFFSHELF":
            sid = SentimentIntensityAnalyzer()
            # Each validation row holds one bytes message in column 0.
            converted = [x[0].decode('utf-8') for x in val_messages]
            sid_predictions = predict_from_off_shelf_model(sid, converted)
            sid_accuracy = np.mean(sid_predictions == val_labels)
            print('Off The Shelf had an accuracy of {} on the testing set'.
                  format(sid_accuracy))

        # Just do this once
        if trial == 0:
            all_matrix = transform_text(all_data, dictionary)
            if MODEL_CHOICE == "LOGREG":
                all_predictions = logreg.predict(all_matrix)
            elif MODEL_CHOICE == "NAIVE":
                all_predictions = predict_from_naive_bayes_model(
                    naive_bayes_model, all_matrix)
            elif MODEL_CHOICE == "OFFSHELF":
                all_predictions = predict_from_off_shelf_model(
                    sid, randomized_data)

    plot_frequent_words(LABELS, aggregator)
def calculate_sentiment(phrase):
    """Combine five sentiment signals for ``phrase`` into 'pos' or 'neg'.

    Each signal casts a vote of 1 (positive), -1 (negative) or, for TextBlob
    and VADER only, 0 (exactly neutral):

    1. mean score of the tokens found in ``data/adjectives_lexicon.csv``
       (tokens that cannot be looked up count as 0.1);
    2. TextBlob polarity;
    3. VADER compound score;
    4. Afinn score;
    5. ``trained_sentiment_model``.

    The votes are averaged; a negative mean yields ``'neg'``, anything else
    ``'pos'``.  Intermediate values are printed to stdout.

    Args:
        phrase: The text to classify.

    Returns:
        ``'pos'`` or ``'neg'``.
    """
    # Example input:
    # Worst hotel ive stayed in. - The lock housing was exposed meaning it
    # wasnt difficult to break into our room. - No safety deposit boxes in
    # rooms. - Hot water constantly running out. - - Virtually no cooking
    # utensils, making a basic task such as hard boiling an egg extremely
    # difficult.
    df = pd.read_csv(f'{main_path}/data/adjectives_lexicon.csv',
                     delimiter=",",
                     usecols=['word', 'sentiment'])
    print(df)
    tokens = nltk.word_tokenize(phrase.lower())
    print(tokens)

    print()
    print('=====================')
    print('sentiment lexicon - word by word')
    print('=====================')
    sentim_scores = []
    for key in tokens:
        try:
            x = df[df.word.str.contains(r'\b{}\b'.format(key), na=False)]
            print(x)
            print(x.iloc[0])
            sentim_scores.append(list(x.values[0])[1])
        except Exception:
            # No lexicon entry (or the token is not a valid regex): fall back
            # to the default score.
            sentim_scores.append(0.1)
    mean_sent_score = mean(sentim_scores)
    print(mean_sent_score)
    overall_sentiment = 1 if mean_sent_score > 0 else -1
    print(overall_sentiment)

    print()
    print('=====================')
    print('Textblob')
    print('=====================')
    from textblob.sentiments import NaiveBayesAnalyzer
    textblob_sent_score = TextBlob(phrase,
                                   analyzer=NaiveBayesAnalyzer()).polarity
    print(textblob_sent_score)
    # Polarity is compared against 0, the neutral point.
    if textblob_sent_score > 0:
        overall_textblob_sentiment = 1
    elif textblob_sent_score < 0:
        overall_textblob_sentiment = -1
    else:
        overall_textblob_sentiment = 0
    print(overall_textblob_sentiment)

    print()
    print('=====================')
    print('Vader Sentiment Intensity Analyzer')
    print('=====================')
    sid = SentimentIntensityAnalyzer()
    vader_sent_score = sid.polarity_scores(phrase)
    print(vader_sent_score)
    # The compound score is compared against 0, the neutral point.
    if vader_sent_score['compound'] > 0:
        overall_vader_sentiment = 1
    elif vader_sent_score['compound'] < 0:
        overall_vader_sentiment = -1
    else:
        overall_vader_sentiment = 0
    print(overall_vader_sentiment)

    print()
    print('=====================')
    print('Afinn Analyzer')
    print('=====================')
    # https://github.com/fnielsen/afinn
    afinn = Afinn()
    afinn_sent_score = afinn.score(phrase)
    print(afinn_sent_score)
    overall_afinn_sentiment = 1 if afinn_sent_score > 0 else -1
    print(overall_afinn_sentiment)

    print()
    print('=====================')
    print('ML model')
    print('=====================')
    trained_model_score = trained_sentiment_model([phrase])
    overall_model_sentiment = 1 if trained_model_score == 'positive' else -1
    print('overall_model_sentiment')
    print(overall_model_sentiment)

    prob_sentiment = mean([
        overall_sentiment, overall_vader_sentiment, overall_afinn_sentiment,
        overall_textblob_sentiment, overall_model_sentiment
    ])
    print()
    print('prob_sentiment:')
    print(prob_sentiment)
    calculated_sentiment = 'neg' if prob_sentiment < 0 else 'pos'
    print(calculated_sentiment)
    return calculated_sentiment
def get_mode_sentiment():
    """Collect ``MAX_MODE`` predictions per comment and report their mode.

    Comments from ``../data/comments.csv`` are repeatedly shuffled; the first
    ``len(data) // VAL_SPLIT`` items of each shuffle form the validation set
    and the remainder the training set.  The model selected by
    ``MODEL_CHOICE`` predicts the validation items, and each prediction is
    stored for its comment until that comment has ``MAX_MODE`` predictions.
    Shuffles whose validation set would add nothing are skipped.

    When every comment has ``MAX_MODE`` predictions, the overall label
    proportions, the proportions of per-comment modes, their L1 distance and
    the accuracy of the modes are printed, and ``mimic_qualtrics`` is called
    with the mode predictions.

    Raises:
        ValueError: if the validation split is empty or ``MODEL_CHOICE`` is
            not one of "LOGREG", "NAIVE", "OFFSHELF".
    """
    all_data, all_labels = extract_csv('../data/comments.csv', LABELS,
                                       CATEGORY)
    all_data = np.asarray([[x] for x in all_data], dtype="S1000")
    all_labels = np.asarray(all_labels)

    val_size = len(all_data) // VAL_SPLIT
    if val_size == 0:
        # With no validation items no prediction is ever recorded and the
        # loop below would never terminate.
        raise ValueError("validation split is empty")

    predictions = [[] for _ in range(len(all_labels))]

    trial = 0
    not_done = True
    while not_done:
        randomization_scheme = np.arange(len(all_data))
        np.random.shuffle(randomization_scheme)

        # Only run a trial if at least one validation comment still needs
        # predictions.
        useful = any(
            len(predictions[randomization_scheme[i]]) < MAX_MODE
            for i in range(val_size))
        if not useful:
            continue

        trial += 1
        if (trial % 10) == 0:
            print("Trial:", trial)

        randomized_data = all_data[randomization_scheme]
        randomized_labels = all_labels[randomization_scheme]
        train_messages = randomized_data[val_size:]
        train_labels = randomized_labels[val_size:]
        val_messages = randomized_data[:val_size]
        val_labels = randomized_labels[:val_size]

        dictionary = create_dictionary(train_messages)
        train_matrix = transform_text(train_messages, dictionary)
        val_matrix = transform_text(val_messages, dictionary)

        # Strings are compared with ``==``: identity (``is``) only matches
        # by accident of interning.
        if MODEL_CHOICE == "LOGREG":
            logreg = LogisticRegression()
            logreg.fit(train_matrix, train_labels)
            guesses = logreg.predict(val_matrix)
        elif MODEL_CHOICE == "NAIVE":
            naive_bayes_model = fit_naive_bayes_model(
                train_matrix, train_labels, LABELS)
            guesses = predict_from_naive_bayes_model(
                naive_bayes_model, val_matrix)
        elif MODEL_CHOICE == "OFFSHELF":
            sid = SentimentIntensityAnalyzer()
            converted = [x[0].decode('utf-8') for x in val_messages]
            guesses = predict_from_off_shelf_model(sid, converted)
        else:
            raise ValueError(
                "unknown MODEL_CHOICE: {!r}".format(MODEL_CHOICE))

        # Validation item i is original comment randomization_scheme[i].
        for i in range(len(guesses)):
            if len(predictions[randomization_scheme[i]]) < MAX_MODE:
                predictions[randomization_scheme[i]].append(guesses[i])

        total = sum(len(p) for p in predictions)
        if total == len(all_data) * MAX_MODE:
            not_done = False

    total_all_sentiment = np.zeros(len(LABELS))
    total_mode_sentiment = np.zeros(len(LABELS))
    per_comment_proportion = np.zeros((len(predictions), len(LABELS)))
    mode_sentiment = np.zeros(len(predictions))
    for i in range(len(predictions)):
        curr_sentiment = np.zeros(len(LABELS))
        # Predicted labels are used as 1-based indices into the counters.
        for label in predictions[i]:
            curr_sentiment[label - 1] += 1
            total_all_sentiment[label - 1] += 1
        mode_sentiment[i] = np.argmax(curr_sentiment) + 1
        per_comment_proportion[i] = curr_sentiment / sum(curr_sentiment)
        total_mode_sentiment[int(mode_sentiment[i]) - 1] += 1

    total_all_sentiment = total_all_sentiment / sum(total_all_sentiment)
    total_mode_sentiment = total_mode_sentiment / sum(total_mode_sentiment)
    print("Total proportion of all sentiments:", total_all_sentiment)
    print("Total proportion of sentiment by mode:", total_mode_sentiment)
    print("Total Distance:",
          np.sum(np.abs(total_all_sentiment - total_mode_sentiment)))

    my_accuracy = np.mean(mode_sentiment == all_labels)
    print("This is the accuracy:", my_accuracy)

    mimic_qualtrics("../results/mimic_results2.csv",
                    "../data/all_survey_responses.csv",
                    "../data/comments.csv", all_data.flatten(), all_labels,
                    mode_sentiment.astype(int))
def analyzeStocks():
    """Score saved news headlines with VADER and save two bar charts.

    Every file in ``datasets/`` is parsed for its ``news-table`` element; the
    part of the file name before the first ``_`` is used as the ticker.
    Each headline is scored with a VADER analyzer whose lexicon is extended
    with a few finance-specific words.

    Outputs:
        plot1.png: mean compound score per date and ticker.
        plot2.png: negative / neutral / positive scores of the ``fb``
            headlines selected with the key ``'2019-01-03'``, ordered by
            time.

    The first four rows of ``tsla_22sep.html`` are also printed as a preview.
    """
    html_tables = {}
    # For every table in the datasets folder...
    for table_name in os.listdir('datasets'):
        table_path = f'datasets/{table_name}'
        # The file is parsed while open and closed right afterwards.
        with open(table_path, 'r') as table_file:
            html = BeautifulSoup(table_file)
        html_tables[table_name] = html.find(id="news-table")

    # Read one single day of headlines
    tsla = html_tables['tsla_22sep.html']
    # Get all the table rows tagged in HTML with <tr>
    tsla_tr = tsla.findAll('tr')
    for i, table_row in enumerate(tsla_tr):
        link_text = table_row.a.get_text()
        data_text = table_row.td.get_text()
        print(f'{i}:')
        print(link_text)
        print(data_text)
        # Exit the preview early to prevent spamming the output.
        if i == 3:
            break

    # Hold the parsed news into a list
    parsed_news = []
    for file_name, news_table in html_tables.items():
        # The ticker is the file name up to the first '_'.
        ticker = file_name.split('_')[0]
        # Rows that only carry a time reuse the date of the previous row.
        date = None
        for x in news_table.findAll('tr'):
            headline = x.a.get_text()
            date_scrape = x.td.text.split()
            if len(date_scrape) == 1:
                time = date_scrape[0]
            else:
                date = date_scrape[0]
                time = date_scrape[1]
            parsed_news.append([ticker, date, time, headline])

    # New words and values
    new_words = {
        'crushes': 10,
        'beats': 5,
        'misses': -5,
        'trouble': -10,
        'falls': -100,
    }
    # Instantiate the sentiment intensity analyzer with the existing lexicon
    # and extend it.
    vader = SentimentIntensityAnalyzer()
    vader.lexicon.update(new_words)

    columns = ['ticker', 'date', 'time', 'headline']
    scored_news = pd.DataFrame(parsed_news, columns=columns)
    scores = [
        vader.polarity_scores(headline)
        for headline in scored_news.headline.values
    ]
    scores_df = pd.DataFrame(scores)
    scored_news = pd.concat([scored_news, scores_df], axis=1)
    # Convert the date column from string to datetime
    scored_news['date'] = pd.to_datetime(scored_news.date).dt.date

    plt.style.use("fivethirtyeight")
    # Mean score per date and ticker; only the numeric score columns can be
    # averaged, the string columns are excluded explicitly.
    mean_c = scored_news.groupby(['date', 'ticker']).mean(numeric_only=True)
    mean_c = mean_c.unstack(level=1)
    mean_c = mean_c.xs('compound', axis=1)
    mean_c.plot.bar()
    plt.savefig("plot1.png")

    # Analyzing just one day of stock trends
    scored_news_clean = scored_news.drop_duplicates(
        subset=['ticker', 'headline'])
    single_day = scored_news_clean.set_index(['ticker', 'date'])
    single_day = single_day.loc['fb']
    # Selecting the 3rd of January of 2019; copy so the column assignments
    # below do not write into a view of the parent frame.
    single_day = single_day.loc['2019-01-03'].copy()
    # Convert the datetime string to just the time
    single_day['time'] = pd.to_datetime(single_day['time'])
    single_day['time'] = single_day.time.dt.time
    single_day = single_day.set_index('time')
    single_day = single_day.sort_index(ascending=True)

    # Visualizing sentiment for that day
    TITLE = "Negative, neutral, and positive sentiment for FB on 2019-01-03"
    COLORS = ["red", "orange", "green"]
    # Drop the columns that aren't useful for the plot
    plot_day = single_day.drop(['headline', 'compound'], axis=1)
    # Rename by key rather than by position so each series keeps its
    # meaning regardless of column order.
    plot_day = plot_day.rename(columns={
        'neg': 'negative',
        'neu': 'neutral',
        'pos': 'positive',
    })
    plot_day.plot(kind='bar', title=TITLE,
                  color=COLORS).legend(bbox_to_anchor=(1, 1))
    plt.savefig("plot2.png")
# Flatten every ticker's news table into [ticker, date, time, title] rows.
for ticker, news_table in news_tables.items():
    for row in news_table.findAll('tr'):
        title = row.a.text
        # Split on any whitespace so padding around the cell text does not
        # produce empty or misplaced fields.
        date_data = row.td.text.split()
        if len(date_data) == 1:
            # Time-only cell: the date of the previous row still applies.
            time = date_data[0]
        else:
            date = date_data[0]
            time = date_data[1]
        parsed_date.append([ticker, date, time, title])

df = pd.DataFrame(parsed_date, columns=['ticker', 'date', 'time', 'title'])

vader = SentimentIntensityAnalyzer()


def f(title):
    """Return the VADER compound score of ``title``."""
    return vader.polarity_scores(title)['compound']


df['compound'] = df['title'].apply(f)
df['date'] = pd.to_datetime(df.date).dt.date

# Mean compound score per ticker and date; the string columns are excluded
# explicitly because they cannot be averaged.
mean_df = df.groupby(['ticker', 'date']).mean(numeric_only=True)
mean_df = mean_df.unstack()
mean_df = mean_df.xs('compound', axis="columns").transpose()
# DataFrame.plot opens its own figure, so the size is passed here.
mean_df.plot(kind='bar', figsize=(10, 8))
plt.show()
def news_sentiment(symbol):
    """Download the finviz news headlines for ``symbol`` and score them.

    The quote page ``https://finviz.com/quote.ashx?t=<symbol>`` is fetched,
    every row of its ``news-table`` element is turned into a
    (ticker, date, time, headline) record, and each headline is scored with
    VADER.

    Args:
        symbol: Ticker symbol appended to the finviz quote URL.

    Returns:
        A DataFrame indexed by ``Ticker`` with the columns ``Date``,
        ``Time``, ``Headline`` and the VADER score columns.
    """
    import pandas as pd
    from bs4 import BeautifulSoup
    from urllib.request import urlopen, Request
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    finwiz_url = 'https://finviz.com/quote.ashx?t='
    req = Request(url=finwiz_url + symbol,
                  headers={'user-agent': 'my-app/0.0.1'})
    # The page is parsed while the response is open; the connection is
    # closed as soon as parsing is done.
    with urlopen(req) as resp:
        html = BeautifulSoup(resp, features="lxml")
    news_table = html.find(id='news-table')

    parsed_news = []
    # Rows that only carry a time reuse the date of the previous row.
    date = None
    for row in news_table.findAll('tr'):
        if row.a is None or row.td is None:
            # Not a headline row (no link or no date cell); skip it.
            continue
        text = row.a.get_text()
        date_scrape = row.td.text.split()
        if len(date_scrape) == 1:
            time = date_scrape[0]
        else:
            date = date_scrape[0]
            time = date_scrape[1]
        parsed_news.append([symbol, date, time, text])

    # Sentiment Analysis
    analyzer = SentimentIntensityAnalyzer()
    columns = ['Ticker', 'Date', 'Time', 'Headline']
    news = pd.DataFrame(parsed_news, columns=columns)
    scores = news['Headline'].apply(analyzer.polarity_scores).tolist()
    df_scores = pd.DataFrame(scores)
    news = news.join(df_scores, rsuffix='_right')
    news['Date'] = pd.to_datetime(news.Date).dt.date

    dataframe = news.loc[news['Ticker'] == symbol]
    dataframe = dataframe.set_index('Ticker')
    return dataframe
def __init__(self, response_time=False):
    """Set up the VADER analyzer and optional response-time tracking.

    Args:
        response_time: Stored as ``self.response_time``.  When truthy, an
            empty ``self.waiting_response`` dict is also created; otherwise
            that attribute is not defined.
    """
    tracking_enabled = bool(response_time)
    if tracking_enabled:
        self.waiting_response = dict()
    self.response_time = response_time
    analyzer = SentimentIntensityAnalyzer()
    self.sentiment_analyzer = analyzer
def sentiment_of_lyrics(lyrics):
    """Return the VADER polarity scores of ``lyrics``.

    Args:
        lyrics: Song lyrics as a string; may be empty or ``None``.

    Returns:
        The dict produced by ``SentimentIntensityAnalyzer.polarity_scores``.
        When there are no lyrics (empty string or ``None``) a dict with the
        keys ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'`` is returned in
        which every value is the string ``'none'``.
    """
    if not lyrics:
        return {
            'neg': 'none',
            'neu': 'none',
            'pos': 'none',
            'compound': 'none',
        }
    # Build the analyzer once and reuse it across calls instead of creating
    # a new one for every song.
    sid = getattr(sentiment_of_lyrics, '_analyzer', None)
    if sid is None:
        sid = SentimentIntensityAnalyzer()
        sentiment_of_lyrics._analyzer = sid
    return sid.polarity_scores(lyrics)
def __init__(self, model_file: str = None) -> None:
    """Initialise the parent class and create the VADER analyzer.

    Args:
        model_file: Accepted for interface compatibility; this constructor
            does not read it.
    """
    super().__init__()
    vader_analyzer = SentimentIntensityAnalyzer()
    self.vader_sia = vader_analyzer
def _token_frame(nlp, texts):
    """Return one row per alphabetic, non-stop-word spaCy token in ``texts``."""
    rows = [
        {
            'text': token.text,
            'lemma': token.lemma_,
            'part_of_speech': token.pos_,
            'pos_tag': token.tag_,
        }
        for entry in texts
        for token in nlp(entry)
        if token.is_alpha and not token.is_stop
    ]
    # Explicit columns keep the frame well-formed when no token qualifies.
    return pd.DataFrame(
        rows, columns=['text', 'lemma', 'part_of_speech', 'pos_tag'])


def _save_wordcloud(fs, words, title, target, **cloud_options):
    """Draw ``words`` as a word cloud titled ``title`` and save it to ``target``."""
    if words.empty:
        # Nothing to draw: make sure an image from an earlier run is not
        # left behind as if it were current.
        if fs.exists(target):
            os.remove(target)
        return
    cloud = WordCloud(background_color="white",
                      **cloud_options).generate(words.to_string())
    figure = plt.figure()
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title)
    plt.axis("off")
    if fs.exists(target):
        # if file exists, overwrite with new file
        os.remove(target)
    plt.savefig(target)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(figure)


def make_graphs():
    """Build sentiment CSVs and word-cloud images from the stored comments.

    Reads ``csv/comments_and_mentions.csv`` under ``settings.MEDIA_ROOT`` and
    works on the unique, non-null values of its ``content.text`` column.

    Files written under ``MEDIA_ROOT``:
        csv/nlu-text.csv: spaCy tokens of all comments.
        csv/sentiment-text.csv: VADER scores per comment; a comment is 'pos'
            when its compound score is >= 0 and 'neg' otherwise.
        csv/neg_text_nlp.csv, csv/pos_text_nlp.csv: spaCy tokens of the
            negative / positive comments.
        graphs/sent_analysis_all_words.png,
        graphs/sent_analysis_all_words_small.png,
        graphs/sent_analysis_pronouns.png,
        graphs/sent_analysis_neg.png,
        graphs/sent_analysis_pos.png: word clouds.  An image is skipped (and
            any existing file removed) when there are no words for it.
    """
    fs = FileSystemStorage()
    nlp = spacy.load("en_core_web_sm")
    path = settings.MEDIA_ROOT

    mentions = pd.read_csv(
        os.path.join(path, "csv/comments_and_mentions.csv"), encoding="utf8")
    # Missing comments cannot be parsed or scored, so they are dropped.
    text = mentions['content.text'].dropna().unique()

    nltk.download('punkt')
    nltk.download('vader_lexicon')
    nltk.download('stopwords')

    df = _token_frame(nlp, text)
    # saving the dataframe
    df.to_csv(os.path.join(path, 'csv/nlu-text.csv'))

    # NOTE(review): this replaces the process-wide default HTTPS context with
    # an unverified one (certificate checks disabled) and runs after the
    # nltk downloads above - confirm it is still needed.
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sid = SentimentIntensityAnalyzer()

    text_df = pd.DataFrame(data=text, columns=['text'])
    text_df['scores'] = text_df['text'].apply(
        lambda comment: sid.polarity_scores(comment))
    text_df['compound'] = text_df['scores'].apply(
        lambda score_dict: score_dict['compound'])
    text_df['comp_score'] = text_df['compound'].apply(
        lambda c: 'pos' if c >= 0 else 'neg')
    text_df.to_csv(os.path.join(path, 'csv/sentiment-text.csv'),
                   encoding='utf8')

    stopwords = set(STOPWORDS)
    stopwords.add("Philip")

    _save_wordcloud(
        fs, df['text'], "All Conversation Words",
        os.path.join(path, "graphs/sent_analysis_all_words.png"),
        stopwords=stopwords)

    # lower max_font_size
    _save_wordcloud(
        fs, df['text'], "All Conversation Words (Smaller Font)",
        os.path.join(path, "graphs/sent_analysis_all_words_small.png"),
        max_font_size=40, stopwords=stopwords, max_words=50)

    # NOTE(review): 'PROPN' is spaCy's proper-noun tag while the title and
    # file name say "pronouns" (tag 'PRON') - confirm which one is intended.
    proper_nouns = df.loc[df['part_of_speech'] == 'PROPN', 'text']
    _save_wordcloud(
        fs, proper_nouns, "Pronouns Used",
        os.path.join(path, "graphs/sent_analysis_pronouns.png"),
        max_words=50)

    # Tokens of the comments classified as negative.
    neg_comments = text_df.loc[text_df['comp_score'] == 'neg', 'text']
    neg_text = _token_frame(nlp, neg_comments)
    neg_text.to_csv(os.path.join(path, 'csv/neg_text_nlp.csv'),
                    encoding='utf8')
    _save_wordcloud(
        fs, neg_text['text'], "Negative Words Used",
        os.path.join(path, "graphs/sent_analysis_neg.png"),
        stopwords=stopwords, max_words=50)

    # Tokens of the comments classified as positive.
    pos_comments = text_df.loc[text_df['comp_score'] == 'pos', 'text']
    pos_text = _token_frame(nlp, pos_comments)
    pos_text.to_csv(os.path.join(path, 'csv/pos_text_nlp.csv'),
                    encoding='utf8')
    _save_wordcloud(
        fs, pos_text['text'], "Positive Words Used",
        os.path.join(path, "graphs/sent_analysis_pos.png"),
        stopwords=stopwords, max_words=50)
time.sleep(1) search = UTILITY_COMPANY numberOfUsers = 1 # find and respond to tweets for tweet in tweepy.Cursor(api.search, search,lang='en').items(numberOfUsers): tweetId = tweet.user.id username = tweet.user.screen_name phrase = 'Ever thought about renewable energy? Check out my website' print (tweet.text) para = nltk.sent_tokenize(tweet.text) # this gives us a list of sentences # now loop over each sentence and tokenize it separately for sentence in para: tokenized_para = nltk.word_tokenize(sentence) tags = nltk.pos_tag(tokenized_para) print(tags) tree = chunk.ne_chunk(tags) tree tree.draw() #analyze sentiment in tweet tool = SentimentIntensityAnalyzer() for sentence in para: print(sentence) ss = tool.polarity_scores(sentence) for i in sorted(ss): print('{0}: {1}, '.format(i, ss[i]), end='') if ss["compound"] < 0.0:
class VaderServiceEn:
    """Sentiment scoring service backed by a single shared VADER analyzer."""

    # Created once when the class is defined and shared by all instances.
    analyzer = SentimentIntensityAnalyzer()

    def getScore(self, review):
        """Return the VADER polarity scores of ``review``.

        The review is lower-cased and then passed through ``unidecode``
        before it is scored.
        """
        lowered = review.lower()
        normalized = unidecode(lowered)
        scores = self.analyzer.polarity_scores(normalized)
        return scores
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4) len(unigram_feats) sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) training_set = sentim_analyzer.apply_features(training_docs) test_set = sentim_analyzer.apply_features(testing_docs) trainer = NaiveBayesClassifier.train classifier = sentim_analyzer.train(trainer, training_set) for key, value in sorted(sentim_analyzer.evaluate(test_set).items()): print('{0}: {1}'.format(key, value)) sid = SentimentIntensityAnalyzer() auth = OAuthHandler(ckey, csecret) auth.set_access_token(atoken, asecret) twitterStream = Stream(auth, listener()) #twitterStream.filter(track=["googl", "google", "goog"]) twitterStream.filter(track=["aapl", "apple", "iphone", "itunes"]) #ply.show() except Exception as e: print(e) print("3") pass
''' read saved tweets per company ''' companies = ['Microsoft', 'Apple', 'Mastercard', 'Intel', 'Cisco_Systems', 'GDP', 'Dow_Jones'] path = os.path.abspath(os.getcwd())+"/" file_end = "_clean_20170101_20200401.csv" companylist = {} for company in companies: filename = path+company+file_end companylist[company] = pd.read_csv(filename,sep='|',index_col = 0) ''' Sentiment Analysis ''' # Sentiment for each Tweet ---------------------------------------------------- sentiment_pt = SentimentIntensityAnalyzer() companylist_sentiment = {} for company, df in companylist.items(): df["Comp"] = '' df["Negative"] = '' df["Neutral"] = '' df["Positive"] = '' df = df.dropna(subset=['Tweets']).reset_index(drop=True) for indexx, row in tqdm(df.T.iteritems()): try: sentence_pt=unicodedata.normalize('NFKD', df.loc[indexx, 'Tweets']) sentence_pt_sentiment=sentiment_pt.polarity_scores(sentence_pt) df.at[indexx, 'Comp'] = sentence_pt_sentiment['compound'] df.at[indexx, 'Negative'] = sentence_pt_sentiment['neg'] df.at[indexx, 'Neutral'] = sentence_pt_sentiment['neu'] df.at[indexx, 'Positive'] = sentence_pt_sentiment['pos']