Example #1
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
from twitter_get_data import query_str
import os
path = os.getcwd()
os.chdir(query_str)

data = pd.read_csv('Predicted_Emotions.csv', encoding='ISO-8859-1')
sentiment_scores = []
sentiment_type = []
sie = SentimentIntensityAnalyzer()
for i in range(len(data)):
    returned_tweet = data.iloc[i, 0]  # first column of row i
    if (not pd.isnull(returned_tweet)):
        senti_score = sie.polarity_scores(returned_tweet)['compound']
        sentiment_scores.append(senti_score)
        if (senti_score > 0.0):
            sentiment_type.append('Positive')
        elif (senti_score < 0.0):
            sentiment_type.append('Negative')
        else:
            sentiment_type.append('Neutral')

    else:
        sentiment_scores.append(0.0)
        sentiment_type.append('Neutral')

data.loc[:, 'Sentiment'] = sentiment_scores
data.loc[:, 'Sentiment_Type'] = sentiment_type
data.to_csv('Final_Results.csv', index=None)
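Example #1 labels any non-zero compound score as positive or negative. The VADER documentation suggests treating compound scores within ±0.05 as neutral instead; a minimal sketch of that convention, reusing the Sentiment column built above:

def label_compound(score, threshold=0.05):
    # conventional VADER cutoffs: >= +0.05 positive, <= -0.05 negative, otherwise neutral
    if score >= threshold:
        return 'Positive'
    if score <= -threshold:
        return 'Negative'
    return 'Neutral'

data['Sentiment_Type'] = data['Sentiment'].apply(label_compound)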
Example #2
def write():
    import streamlit as st
    #datetime is imported so that the user's [entry, date] pair can be saved
    from datetime import datetime
    import nltk as nltk
    import joblib
    import math as math
    nltk.download('vader_lexicon')
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    import pandas as pd
    sid = SentimentIntensityAnalyzer()
    import scipy
    import torch
    import re
    #import sklearn
    from scipy import spatial
    from sentence_transformers import SentenceTransformer

    @st.cache(allow_output_mutation=True)
    def load_my_model():
        model = SentenceTransformer('distilbert-base-nli-mean-tokens')
        return model

    from transformers import pipeline
    #@st.cache(allow_output_mutation=True)
    #def load_classifier():
    #    classifier = pipeline('sentiment-analysis')
    #    return classifier
    @st.cache(allow_output_mutation=True)
    def load_isear():
        isear = pd.read_csv("isear_embed.csv")
        isear = isear.drop("index", axis=1)
        return isear

    @st.cache
    def analysis(sentence):
        model = load_my_model()
        lis = list()
        m = sid.polarity_scores(sentence)
        score = m['compound']
        a = re.split("[.!?;\n]", sentence)
        if len(a) > 2:
            b = a[len(a) - 2] + ". " + a[len(a) - 1]
            c = sid.polarity_scores(b)
            score = c['compound']
        if len(a) > 3:
            b = a[len(a) - 3] + ". " + a[len(a) - 2] + ". " + a[len(a) - 1]
            c = sid.polarity_scores(b)
            score2 = c['compound']
        else:
            score2 = 0
        EHS = pd.read_csv("EHS.csv")
        sentence_embeddings = EHS.values.tolist()
        OPTO = pd.read_csv("OPTO.csv")
        optimistic_embeddings = OPTO.values.tolist()
        # TODO: check what .values.tolist() does here and why it is needed
        isear = load_isear()
        isear_list = isear.values.tolist()
        booleon = 0
        a_embeddings = model.encode(a)
        for j in range(len(a_embeddings)):
            for i in range(len(sentence_embeddings)):
                result = 1 - spatial.distance.cosine(sentence_embeddings[i],
                                                     a_embeddings[j])
                if result > .8:
                    booleon = booleon - 1
                    #print(a[j])
                    #st.write('You sound helpless, this sentence concerned me:', a[j])
                    break
        for j in range(len(a_embeddings)):
            for i in range(len(optimistic_embeddings)):
                result = 1 - spatial.distance.cosine(optimistic_embeddings[i],
                                                     a_embeddings[j])
                if result > .8:
                    booleon = booleon + 1
                    break
        rent = (booleon / len(a_embeddings))
        isear_feature = 0
        for j in range(len(a_embeddings)):
            for i in range(len(isear_list)):
                result = 1 - spatial.distance.cosine(isear_list[i],
                                                     a_embeddings[j])
                if result >= .8:
                    isear_feature = isear_feature - 1
                    break
        hugscore = 0
        classifier = pipeline('sentiment-analysis')
        if len(a) > 3:
            for i in range(0, len(a)):
                result = classifier(a[i])
                result = pd.DataFrame(result)
                if str(result["label"]).count("POS") > 0:
                    hugscore = hugscore + result['score'].iloc[0]
                if str(result["label"]).count("NEG") > 0:
                    hugscore = hugscore - result['score'].iloc[0]
        hugscore = hugscore / len(a)
        hugscore = float(hugscore)
        lis.append([rent, isear_feature, score, score2, hugscore])
        return lis

    sentence = st.text_area("what's on your mind?")
    #button = st.button()
    #the reason score is computed here and not inside st.button("analysis") is that it gets saved rather than refreshed if another button is pressed
    #basically, variables inside a button aren't available outside of them.
    #need to append more than that to the list to get meaningful data out of this.
    if len(sentence) > 1:
        if sentence.count(".") == 0:
            st.write("Write more!")
        else:
            df = analysis(sentence)
            df = pd.DataFrame(df)
            df.columns = [
                "rent", "isear_feature", "score", "score3", "hugscore"
            ]
            loaded_model = joblib.load("GradientBoostedClassifier90CV.sav")
            result = loaded_model.predict(df)
            if result[0] == 0:
                score = "pessimistic"
                booleon = -3
            if result[0] == 1:
                score = "neutral"
                booleon = 0
            if result[0] == 2:
                score = "optimistic"
                booleon = 3
            #try:
            #    lis.append([df[0]])
            #except:
            #    lis = list()
            #    lis.append([df[0]])
    #need to revise output. Output should be a page of resources with a gif on top.
    if st.button('Analysis'):
        #gonna change this to if sentence.count(x) + count(y) .... < 5, then ask them to write more.
        #the model does poorly on samples less than 5 sentences
        if len(sentence) > 1:
            if sentence.count(".") + sentence.count("!") + sentence.count(
                    "?") < 5:
                st.write(
                    "I'm not smart enough to analyze this without more sentences :("
                )
                st.markdown(
                    "![Alt Text](https://media1.tenor.com/images/cedbc086995947a3e2c239f13a3661b4/tenor.gif?itemid=11992490)"
                )
            elif sentence.count("..") + sentence.count("!!") > 2:
                st.write(
                    "I can't analyze entries right now that have abnormal punctuation. Feel free to change your punctuation and try again."
                )
                st.markdown(
                    "![Alt Text](https://media1.tenor.com/images/59f338103063f0c10ee1163d48b4dd14/tenor.gif?itemid=17312786)"
                )
            else:
                st.write("you're feeling : " + score)
                if score == "pessimistic":
                    st.write("That's fine. Let it all out.")
                    st.markdown(
                        "![Alt Text](https://media.tenor.com/images/ff4a60a02557236c910f864611271df2/tenor.gif)"
                    )
                    st.write(
                        "Check out the resources tab to see how you can 'learn' optimism"
                    )
                    st.markdown(
                        "[Click here if you need extra help](https://suicidepreventionlifeline.org/chat/)"
                    )
                if score == "neutral":
                    st.write(
                        "You're just chilling. Waiting on some stuff to play out. It be like that sometimes."
                    )
                    st.markdown(
                        "![Alt Text](https://media1.tenor.com/images/0fbf51f99bccd97a825d11cb4487ce85/tenor.gif?itemid=11015213)"
                    )
                if score == "optimistic":
                    st.write("You are a ray of sunshine today! Keep it up!")
                    st.markdown(
                        "![Alt Text](https://media.tenor.com/images/2aa9b6f3a7d832c2ff1c1a406d5eae73/tenor.gif)"
                    )
    #st.header("Insert your username below to save your score")
    username = st.text_input(
        "Username (required for you to save your score & see your day-to-day changes): "
    )
    today = datetime.now()
    #st.text_input doesn't work inside the st.button()....gotta figure out why
    #^above is an old note, i know why now, I just keep it there to remind me that inside button actions are way diff than outside button actions
    if st.button('Save my score'):
        import csv
        fields = [result[0], sentence, today]
        try:
            open(username + ".csv", 'r').close()  # probe: raises FileNotFoundError if the file does not exist yet
            with open(username + ".csv", 'a') as f:
                writer = csv.writer(f)
                writer.writerow(fields)
        except FileNotFoundError:
            with open(username + ".csv", 'a') as f:
                writer = csv.writer(f)
                writer.writerow(["score", "sentence", "date"])
                writer.writerow(fields)
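The core of the analysis() function above is a cosine-similarity match between sentence embeddings and pre-computed reference embeddings (EHS.csv, OPTO.csv, isear_embed.csv). A self-contained sketch of that step; the reference phrases below are illustrative stand-ins, not the real data:

from scipy import spatial
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# hypothetical reference phrases standing in for the embeddings loaded from CSV above
reference_embeddings = model.encode(["I feel completely hopeless.",
                                     "Nothing will ever get better."])
sentences = ["Work was hard today.", "I don't see the point anymore."]
sentence_embeddings = model.encode(sentences)

for sent, emb in zip(sentences, sentence_embeddings):
    # 1 - cosine distance = cosine similarity; the code above counts matches above 0.8
    best = max(1 - spatial.distance.cosine(ref, emb) for ref in reference_embeddings)
    print(f"{sent!r}: max similarity {best:.2f}")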
Example #3
def main():
    spark, sc = init_spark()

    # Read US.Metadata Json File
    ps = sc.wholeTextFiles(r"reference\us_state_meta_latest.json").values().map(json.loads)
    broadcastStates.append(spark.sparkContext.broadcast(ps.map(lambda x: x).collect()).value)
    # print(broadcastStates)
    # print(ps.map(lambda x: x).collect())
    # for i in broadcastStates:
    #    for j in i:
    #        print(j)

    # Read Geo True Json file
    geo_true_df = spark.read.json(r"data\geo_true.json")
    # print(geo_true_df.printSchema())
    # print(geo_true_df.show(truncate=False))

    # Read Geo True Json file
    # geo_false_df = spark.read.json("data\geo_false.json") #.repartition(100)
    # print(geo_false_df.printSchema())
    # print(geo_false_df.show(truncate=False))

    geo_true_df = geo_true_df.filter(geo_true_df.country_code == 'US')
    geo_true_df = geo_true_df.drop('_id', 'coordinates', 'country_code').withColumnRenamed("city_state", "location")
    geo_true_df = geo_true_df.withColumn("new_location", F.lower(F.col("location")))
    geo_true_df = geo_true_df.withColumn('new_location', regexp_replace('new_location', '^[a-zA-Z\']+', ' '))
    # print(geo_true_df.show(truncate=False))
    # print(geo_true_df.count())

    # Register UDF
    geo_udf = udf(lambda x: get_new_us_code(x), StringType())
    geo_true_df = geo_true_df.withColumn('new_location', geo_udf('new_location'))
    # print(geo_true_df.show(truncate=False))

    """geo_false_df = geo_false_df.drop('lang', '_id')
    print(geo_false_df.show(truncate=False))
    print(geo_false_df.count())
    geo_false_df = geo_false_df.withColumn("new_location", F.lower(F.col("location")))
    geo_false_df = geo_false_df.withColumn('new_location', regexp_replace('new_location', '^[a-zA-Z\']+', ' '))
    geo_false_df = geo_false_df.withColumn('new_location', geo_udf('new_location'))
    print(geo_false_df.show(truncate=False))
    print(geo_false_df.count())
    
    df = unionAll(geo_true_df, geo_false_df).distinct().show()
    print(df.show(truncate=False))"""

    # tweets = df
    tweets = geo_true_df

    # print("Total Tweets \t\t\t\t: ", tweets.count())

    # joeBiden tweets excluding trump
    joe_only = tweets.filter(
        (tweets['text'].rlike("[Jj]oe|[Bb]iden") == True) & (tweets['text'].rlike("[Dd]onald|[Tt]rump") == False))
    # print("Only Joe Biden Tweets \t\t\t: ", joe_only.count())

    trump_only = tweets.filter(
        (tweets['text'].rlike("[Jj]oe|[Bb]iden") == False) & (tweets['text'].rlike("[Dd]onald|[Tt]rump") == True))
    # print("Only Donald Trump Tweets \t\t: ", trump_only.count())

    joe_and_trump = tweets.filter(
        (tweets['text'].rlike("[Dd]onald|[Tt]rump")) & (tweets['text'].rlike("[Jj]oe|[Bb]iden")))
    # print("Both Joe_Biden & Trump Tweets \t\t: ", joe_and_trump.count())

    not_joe_trump = tweets.filter(
        ~(tweets['text'].rlike("[Dd]onald|[Tt]rump")) & ~(tweets['text'].rlike("[Jj]oe|[Bb]iden")))
    # print("Tweets without Joe_Biden & Trump \t: ", not_joe_trump.count())

    sid = SentimentIntensityAnalyzer()

    # Define UDF functions; polarity_scores returns a dict, so the UDF must declare a
    # MapType return (MapType imported from pyspark.sql.types), not FloatType
    udf_priority_score = udf(lambda x: sid.polarity_scores(x), returnType=MapType(StringType(), FloatType()))
    udf_compound_score = udf(lambda score_dict: score_dict['compound'], returnType=FloatType())
    udf_comp_score = udf(lambda c: 'pos' if c >= 0.05 else ('neu' if -0.05 < c < 0.05 else 'neg'))

    trump_only = trump_only.withColumn('scores', udf_priority_score(trump_only['text']))
    trump_only = trump_only.withColumn('compound', udf_compound_score(trump_only['scores']))
    trump_only = trump_only.withColumn('comp_score', udf_comp_score(trump_only['compound']))

    joe_only = joe_only.withColumn('scores', udf_priority_score(joe_only['text']))
    joe_only = joe_only.withColumn('compound', udf_compound_score(joe_only['scores']))
    joe_only = joe_only.withColumn('comp_score', udf_comp_score(joe_only['compound']))

    # print(trump_only.show(truncate=False))
    # print(joe_only.show(truncate=False))

    joe_pos_only = joe_only[joe_only.comp_score == 'pos']
    joe_neg_only = joe_only[joe_only.comp_score == 'neg']
    joe_neu_only = joe_only[joe_only.comp_score == 'neu']

    trump_pos_only = trump_only[trump_only.comp_score == 'pos']
    trump_neg_only = trump_only[trump_only.comp_score == 'neg']
    trump_neu_only = trump_only[trump_only.comp_score == 'neu']

    # print("Total Trump Tweets \t\t: ", trump_only.count())
    # print("Positive Trump Tweets \t\t: ", trump_pos_only.count())
    # print("Negative Trump Tweets \t\t: ", trump_neg_only.count())
    # print("Neutral Trump Tweets \t\t: ", trump_neu_only.count())

    # print("Total Biden Tweets \t\t: ", joe_only.count())
    # print("Positive Biden Tweets \t\t: ", joe_pos_only.count())
    # print("Negative Biden Tweets \t\t: ", joe_neg_only.count())
    # print("Neutral Biden Tweets \t\t: ", joe_neu_only.count())

    joe_pos_neg_only = joe_only.filter(joe_only['comp_score'] != 'neu')
    trump_pos_neg_only = trump_only.filter(trump_only['comp_score'] != 'neu')
    # print("Total Trump Pos & Neg Tweets Only \t\t: ", trump_pos_neg_only.count())
    # print("Total Biden Pos & Neg Tweets Only  \t\t: ", joe_pos_neg_only.count())

    dt1 = joe_only.groupBy(F.col('location')).agg(F.count('location').alias('joe_total'))
    dt2 = joe_pos_only.groupBy(F.col('location')).agg(F.count('location').alias('joe_pos'))
    dt3 = joe_neg_only.groupBy(F.col('location')).agg(F.count('location').alias('joe_neg'))

    dt4 = trump_only.groupBy(F.col('location')).agg(F.count('location').alias('trump_total'))
    dt5 = trump_pos_only.groupBy(F.col('location')).agg(F.count('location').alias('trump_pos'))
    dt6 = trump_neg_only.groupBy(F.col('location')).agg(F.count('location').alias('trump_neg'))

    # print(dt1.show(truncate=False))
    # print(dt2.show(truncate=False))
    # print(dt3.show(truncate=False))
    # print(dt4.show(truncate=False))
    # print(dt5.show(truncate=False))
    # print(dt6.show(truncate=False))

    # print(dt1.count())
    # print(dt2.count())

    dfs = [dt1, dt2, dt3, dt4, dt5, dt6]
    df_final = reduce(lambda left, right: DataFrame.join(left, right, on='location'), dfs)
    df_final = df_final.sort(F.col('joe_total').asc())
    # print(df_final.show(truncate=False))

    df_per = df_final
    df_per = df_per.withColumn('Joe Pos %', ((df_final['joe_pos'] / df_final['joe_total']) * 100))
    df_per = df_per.withColumn('Joe Neg %', ((df_final['joe_neg'] / df_final['joe_total']) * 100))
    df_per = df_per.withColumn('Trump Pos %', ((df_final['trump_pos'] / df_final['trump_total']) * 100))
    df_per = df_per.withColumn('Trump Neg %', ((df_final['trump_neg'] / df_final['trump_total']) * 100))

    df_per = df_per.withColumn("prediction", when((df_per['Joe Pos %'] > df_per['Trump Pos %']) , "Biden").
                               when((df_per['Joe Pos %'] < df_per['Trump Pos %']) , "Trump").otherwise('Both'))
    print(df_per.show(truncate=False))

    # write to pickle file
    # df_per.rdd.saveAsPickleFile('final_prediction_df.pkl')

    # Read from pickle file
    """for obj in sparkpickle.load_gen("final_prediction_df.pkl"):
Example #4
print("Reading our data...")

df = pd.read_csv("titles_and_imdb-id.csv")
df = df.drop(["Unnamed: 0"], axis=1)
reviews = df["First Review"]
reviews_list = list(reviews)

# # Sentimental Analysis from reviews (IMDb)

print("Let's see how Natural Language Processing can help us!")

list_of_results = []

# create the analyzer once instead of re-instantiating it on every iteration
sentiment_analyzer = SentimentIntensityAnalyzer()
for i in range(len(reviews_list)):
    scores = sentiment_analyzer.polarity_scores(reviews_list[i])

    list_of_results.append(scores)

# Let's create a new dataframe with the sentimental analysis information

sentiment_analyzer = pd.DataFrame.from_dict(list_of_results)
sentiment_analyzer = sentiment_analyzer.rename(columns={
    "compound": "Compound",
    "neg": "Negative",
    "neu": "Neutral",
    "pos": "Positive"
})

# # Concat both DataFrames
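The snippet cuts off at the concatenation step; one plausible way to finish it, keeping the names defined above (the renamed sentiment DataFrame is still called sentiment_analyzer here):

# join the original titles/reviews with the per-review sentiment columns, side by side
df_with_sentiment = pd.concat([df, sentiment_analyzer], axis=1)
print(df_with_sentiment.head())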
Example #5
    # predict on the test set; after prediction, the test set will have 3 cols: text, truth, pred
    df_test = logistic_regressor.predict_file(training_text_file)
    print(df_test)
    training_logger.info('prediction done by logistic_regressor')

    # to calculate & print the accuracy & F1 score on test set
    print("Logistic Regression model performance:")
    logistic_regressor.print_performance(df_test)

    # to save the pipeline as model
    model = logistic_regressor.pipeline

else:
    """
    Load the VADER sentiment model if no training text file is provided.
    """
    model = SentimentIntensityAnalyzer()
    training_logger.info('model: ' + str(model))


# save model files to disk for app.py to load
save_model_files('sentiment_model_pickle',                     # filename
                 model,                                        # model
                 type(model),                                  # model_type
                 'sentiment-analysis',                         # model_name
                 str(strftime('%Y%m%d-%H%M%S', localtime())),  # model_version
                 'train',                                      # train_pred
                 training_logger)                              # logger, not for saving


def get_distribution():

    all_data, all_labels = extract_csv('../data/comments.csv', LABELS,
                                       CATEGORY)
    all_data = np.asarray([[x] for x in all_data], dtype="S1000")
    all_labels = np.asarray(all_labels)

    num_trials = TRIALS
    accuracies = []

    for trial in range(num_trials):
        if (trial % 10) == 0:
            print("Trial:", trial)

        randomization_scheme = np.arange(len(all_data))
        np.random.shuffle(randomization_scheme)
        randomized_data = all_data[randomization_scheme]
        randomized_labels = all_labels[randomization_scheme]

        train_messages = randomized_data[len(all_data) // VAL_SPLIT:]
        train_labels = randomized_labels[len(all_data) // VAL_SPLIT:]
        val_messages = randomized_data[:len(all_data) // VAL_SPLIT]
        val_labels = randomized_labels[:len(all_data) // VAL_SPLIT]

        dictionary = create_dictionary(train_messages)
        train_matrix = transform_text(train_messages, dictionary)
        val_matrix = transform_text(val_messages, dictionary)

        if MODEL_CHOICE == "LOGREG":

            logreg = LogisticRegression()
            logreg.fit(train_matrix, train_labels)

            logistic_regresion_predictions = logreg.predict(val_matrix)
            logistic_regresion_accuracy = np.mean(
                logistic_regresion_predictions == val_labels)
            accuracies.append(logistic_regresion_accuracy)

        elif MODEL_CHOICE == "NAIVE":
            naive_bayes_model = fit_naive_bayes_model(train_matrix,
                                                      train_labels, LABELS)
            naive_bayes_predictions = predict_from_naive_bayes_model(
                naive_bayes_model, val_matrix)
            naive_bayes_accuracy = np.mean(
                naive_bayes_predictions == val_labels)
            accuracies.append(naive_bayes_accuracy)

        elif MODEL_CHOICE == "OFFSHELF":

            sid = SentimentIntensityAnalyzer()
            converted = [x[0].decode('utf-8') for x in val_messages]

            sid_predictions = predict_from_off_shelf_model(sid, converted)
            sid_accuracy = np.mean(sid_predictions == val_labels)
            accuracies.append(sid_accuracy)

    plt.figure()
    plt.hist(accuracies,
             bins=BINS,
             label='data',
             weights=np.ones(num_trials) / num_trials)
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
    plt.title("Accuracy Distribution for " + MODEL_CHOICE[0] +
              MODEL_CHOICE[1:].lower() + " Model")
    plt.xlabel("Accuracy")
    plt.ylabel("Percentage")
    accuracies = np.asarray(accuracies)
    plt.axvline(x=np.mean(accuracies),
                color='red',
                linestyle='--',
                label='mean')
    plt.savefig(MODEL_CHOICE.lower() + "_acc_dist.png")
Example #7
    def get(self, request, format=None):
        all_keys = request.query_params.get('keyword', None)
        start_date = request.query_params.get('start_date', None)
        end_date = request.query_params.get('end_date', None)
        if all_keys.startswith('http'):
            all_d = all_keys.split('/')
            all_keys = all_d[-2]
            if len(all_keys) == 1:
                all_keys = all_d[-1]
        print(all_keys)
        z = redditdata.objects.filter(keyword=all_keys).exists()
        if z:
            reddit = praw.Reddit(client_id='GZ4wXpp55Rzjqw',
                                 client_secret='nusWMTnlf0nLWOHWDcFcFi1RXQY',
                                 user_agent='data')
            print("fffsafasfasfsafasfasfsfasdsdasdasdasdasdsada", all_keys)
            data = redditdata.objects.filter(keyword=all_keys)
            print(data)
            global title, score, url, created, id_id, all_date, body
            for i in data:
                print("jai", i.score)
                print(i.keyword)
                print(i.all_date)
                print(i.body)

                title.append(i.title)
                score.append(i.score)
                url.append(i.url)
                created.append(i.created)
                id_id.append(i.id_id)
                all_date.append(i.all_date)
                body.append(i.body)

            print("scorescorescorescorescorescorescore", score, len(score),
                  len(url), len(created), len(id_id), len(body))

            topics_dict = {
                'Title': title,
                'Score': score,
                'id': id_id,
                'Url': url,
                'Created': created,
                'Date Time': all_date,
                'Body': body
            }
            df = pd.DataFrame(topics_dict)
            print(df)
            all_comments = []

            for ids in df.id:
                # print("idsidsidsidsidsidsidsidsidsidsids",ids)
                each_subreddit_comments = []
                for top_level_comment in reddit.submission(id=ids).comments:
                    each_subreddit_comments.append(top_level_comment.body)
                all_comments.append(each_subreddit_comments)
                each_subreddit_comments = []
            sid = SentimentIntensityAnalyzer()
            final_sentiments_list = []
            entered = 0
            for each_ in all_comments:
                sentiments_list = []
                for every_comment in each_:
                    entered = 1
                    polarity_dict = sid.polarity_scores(every_comment)
                    negative = polarity_dict['neg']
                    positive = polarity_dict['pos']
                    neutral = polarity_dict['neu']
                    if negative > positive and negative > neutral:
                        sentiments_list.append('negative')
                        continue
                    if positive > negative and positive > neutral:
                        sentiments_list.append('positive')
                        continue
                    if neutral > positive and neutral > negative:
                        sentiments_list.append('neutral')
                        continue
                    if positive == negative:
                        sentiments_list.append('neutral')
                        continue
                final_sentiments_list.append(sentiments_list)
            df['Comments'] = all_comments
            sen = []
            for i in final_sentiments_list:
                try:
                    sen.append(i[1])
                except IndexError:
                    sen.append("neutral")

            df['Sentiments'] = sen
            df2 = (df['Date Time'] > start_date) & (df['Date Time'] <=
                                                    end_date)
            df2 = df.loc[df2]
            print(df2)
            dic = {}
            li = []
            for Title, Score, i, Url, Cre, Body, all_date, Comments, Sentiments in zip(
                    df2["Title"], df2["Score"], df2["id"], df2["Url"],
                    df2["Created"], df2["Body"], df2["all_date"],
                    df2["Comments"], df2["Sentiments"]):
                li.append({
                    "Title": Title,
                    "Score": Score,
                    "id": i,
                    "Url": Url,
                    "Created": Cre,
                    "Body": Body,
                    "all_date": all_date,
                    "Comments": Comments,
                    "Sentiments": Sentiments
                })
            dic.update({"data": li})
            return Response(dic)
        else:
            print("This keyword is not exists in database please run post api")
            return Response(
                "This keyword(url) is not exists in database please run post api"
            )
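The per-comment labelling block above compares the three polarity components pairwise; a more compact sketch of the same idea (unlike the original, it always returns a label, and ties resolve by key order rather than explicitly falling back to 'neutral'):

def label_comment(polarity_dict):
    # pick the dominant component of the VADER score dict
    mapping = {'neu': 'neutral', 'pos': 'positive', 'neg': 'negative'}
    dominant = max(('neu', 'pos', 'neg'), key=lambda k: polarity_dict[k])
    return mapping[dominant]

label_comment({'neg': 0.1, 'neu': 0.7, 'pos': 0.2, 'compound': 0.3})  # -> 'neutral'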
Example #8
def compute_nltk_polarity(msg_body):
    nltk.data.path.append("./nltk_data.zip/nltk_data")
    sid = SentimentIntensityAnalyzer()
    msg_body = sid.polarity_scores(msg_body)
    return msg_body
def sentiment(sentence):
    analyser = SentimentIntensityAnalyzer()
    ss = analyser.polarity_scores(sentence)
    #print ss.keys()
    return ss['compound']
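A quick usage sketch for the two helpers above; the exact numbers printed are whatever VADER returns for the given text:

print(compute_nltk_polarity("I love this product"))   # full score dict: neg / neu / pos / compound
print(sentiment("This was a terrible experience"))    # just the compound score, in [-1, 1]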
Example #10
def get_sentiment(sentences: list):
    return round(
        SentimentIntensityAnalyzer().polarity_scores(
            '. '.join(sentences))['compound'], 4)
 def open_spider(self, spider):
     #initialize sentiment analyzer
     self.analyzer = SentimentIntensityAnalyzer()
     self.analyzer.lexicon.update(lm_positive)
     self.analyzer.lexicon.update(lm_negative)
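lm_positive and lm_negative are not shown in the snippet; SentimentIntensityAnalyzer.lexicon is a plain dict mapping words to valence, so they would be dicts of domain terms (for example, finance vocabulary). A minimal sketch of what the update expects, with made-up values:

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# hypothetical domain terms with hand-picked valence values
lm_positive = {"outperform": 2.0, "upgrade": 1.5}
lm_negative = {"impairment": -2.0, "downgrade": -1.5}

analyzer = SentimentIntensityAnalyzer()
analyzer.lexicon.update(lm_positive)
analyzer.lexicon.update(lm_negative)
print(analyzer.polarity_scores("Analysts upgrade the stock after strong results"))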
Example #12
    def get(self, request, format=None):
        id = request.query_params.get('id', None)
        data = custmers.objects.get(id=id)
        all_keys = data.name
        start_date = data.start_date
        end_date = data.end_date
        start_date = request.query_params.get('start_date', None)
        end_date = request.query_params.get('end_date', None)
        if all_keys.startswith('http'):
            all_d = all_keys.split('/')
            all_keys = all_d[-2]
            if len(all_keys) == 1:
                all_keys = all_d[-1]
        print(all_keys)
        z = redditdata.objects.filter(keyword=all_keys).exists()
        if z:
            reddit = praw.Reddit(client_id='GZ4wXpp55Rzjqw',
                                 client_secret='nusWMTnlf0nLWOHWDcFcFi1RXQY',
                                 user_agent='data')
            print("fffsafasfasfsafasfasfsfasdsdasdasdasdasdsada", all_keys)
            data = redditdata.objects.filter(keyword=all_keys)
            print(data)
            global title, score, url, created, id_id, all_date, body
            for i in data:
                print("jai", i.score)
                print(i.keyword)
                print(i.all_date)
                print(i.body)

                title.append(i.title)
                score.append(i.score)
                url.append(i.url)
                created.append(i.created)
                id_id.append(i.id_id)
                all_date.append(i.all_date)
                body.append(i.body)

            print("scorescorescorescorescorescorescore", score, len(score),
                  len(url), len(created), len(id_id), len(body))

            topics_dict = {
                'Title': title,
                'Score': score,
                'id': id_id,
                'Url': url,
                'Created': created,
                'Date Time': all_date,
                'Body': body
            }
            df = pd.DataFrame(topics_dict)
            print(df)
            all_comments = []

            for ids in df.id:
                # print("idsidsidsidsidsidsidsidsidsidsids",ids)
                each_subreddit_comments = []
                for top_level_comment in reddit.submission(id=ids).comments:
                    each_subreddit_comments.append(top_level_comment.body)
                all_comments.append(each_subreddit_comments)
                each_subreddit_comments = []
            sid = SentimentIntensityAnalyzer()
            final_sentiments_list = []
            entered = 0
            for each_ in all_comments:
                sentiments_list = []
                for every_comment in each_:
                    entered = 1
                    polarity_dict = sid.polarity_scores(every_comment)
                    negative = polarity_dict['neg']
                    positive = polarity_dict['pos']
                    neutral = polarity_dict['neu']
                    if negative > positive and negative > neutral:
                        sentiments_list.append('negative')
                        continue
                    if positive > negative and positive > neutral:
                        sentiments_list.append('positive')
                        continue
                    if neutral > positive and neutral > negative:
                        sentiments_list.append('neutral')
                        continue
                    if positive == negative:
                        sentiments_list.append('neutral')
                        continue
                final_sentiments_list.append(sentiments_list)
            df['Comments'] = all_comments
            sen = []
            for i in final_sentiments_list:
                try:
                    sen.append(i[1])
                except IndexError:
                    sen.append("neutral")

            df['Sentiments'] = sen
            df2 = (df['Date Time'] > start_date) & (df['Date Time'] <=
                                                    end_date)
            df2 = df.loc[df2]
            print(df2)
            dic = {}
            li = []
            li.append([
                "Title", "Score", "id", "Url", "Created", "Body", "Date Time",
                "Comments", "Sentiments"
            ])
            for Titlez, Scorez, iz, Urlz, Crez, Bodyz, all_datz, Commentsz, Sentimentsz in zip(
                    df2["Title"], df2["Score"], df2["id"], df2["Url"],
                    df2["Created"], df2["Body"], df2["Date Time"],
                    df2["Comments"], df2["Sentiments"]):
                li.append([
                    Titlez, Scorez, iz, Urlz, Crez, Bodyz, all_datz, Commentsz,
                    Sentimentsz
                ])

            print("rows to export:", len(li))
            filename = "reddit_data.csv"
            fp = StringIO()
            response = HttpResponse(content_type='text/csv')
            response[
                'Content-Disposition'] = 'attachment; filename="{}"'.format(
                    filename)
            writer = csv.writer(response)
            for row in li:
                writer.writerow(row)
            return response
        else:
            print("This keyword is not exists in database please run post api")
            return Response(
                "This keyword(url) is not exists in database please run post api"
            )
Example #13
 def __init__(self,text):
     self.text = text
     self.ytSenti = SentimentIntensityAnalyzer()
Example #14
    def textMinin(self):
        plt.clf()
        plik = p1.onOpen(self)
        review = pd.read_csv(plik)

        # create a marker for reviews with a negative tone (rating < 5)
        review["is_bad_review"] = review["recomend"].apply(
            lambda x: 1 if x == "Not Recommended" else 0)
        # keep only the needed columns
        review = review[["review", "is_bad_review"]]
        # convert the "review" column values to strings
        review['review'] = review['review'].astype(str)

        review.head()

        # split the data into a sample
        #review = review.sample(frac = 0.3, replace = False, random_state=42)

        # data preprocessing
        def get_wordnet_pos(pos_tag):
            if pos_tag.startswith('J'):
                return wordnet.ADJ
            elif pos_tag.startswith('V'):
                return wordnet.VERB
            elif pos_tag.startswith('N'):
                return wordnet.NOUN
            elif pos_tag.startswith('R'):
                return wordnet.ADV
            else:
                return wordnet.NOUN

        def clean_text(text):
            # lowercase
            text = text.lower()
            # tokenize and remove punctuation
            text = [word.strip(string.punctuation) for word in text.split(" ")]
            # remove words containing digits
            text = [
                word for word in text if not any(c.isdigit() for c in word)
            ]
            # usuwanie "stop" slow ('the', 'a' ,'this')
            stop = stopwords.words('english')
            text = [x for x in text if x not in stop]
            # remove empty tokens
            text = [t for t in text if len(t) > 0]
            # POS-tag the words (noun, adjective, etc.)
            pos_tags = pos_tag(text)
            # lemmatize the text (inflected forms reduced to the base form, if one exists)
            text = [
                WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1]))
                for t in pos_tags
            ]
            # remove one-letter words
            text = [t for t in text if len(t) > 1]
            # join the text back together
            text = " ".join(text)
            return (text)

        review["review_clean"] = review["review"].apply(
            lambda x: clean_text(x))

        # use VADER to score word sentiment and distinguish negative from positive reviews
        sid = SentimentIntensityAnalyzer()
        review["sentiments"] = review["review"].apply(
            lambda x: sid.polarity_scores(x))
        review = pd.concat([
            review.drop(['sentiments'], axis=1), review['sentiments'].apply(
                pd.Series)
        ],
                           axis=1)

        # number of characters
        review["nb_chars"] = review["review"].apply(lambda x: len(x))

        # number of words
        review["nb_words"] = review["review"].apply(
            lambda x: len(x.split(" ")))

        # vector representation of each review

        documents = [
            TaggedDocument(doc, [i]) for i, doc in enumerate(
                review["review_clean"].apply(lambda x: x.split(" ")))
        ]

        # train Doc2Vec
        model = Doc2Vec(documents,
                        vector_size=5,
                        window=2,
                        min_count=1,
                        workers=4)

        # convert the data into vector form (required by Doc2Vec)
        doc2vec_df = review["review_clean"].apply(
            lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
        doc2vec_df.columns = [
            "doc2vec_vector_" + str(x) for x in doc2vec_df.columns
        ]
        review = pd.concat([review, doc2vec_df], axis=1)

        # add TF-IDF values for each word
        tfidf = TfidfVectorizer(min_df=10)
        tfidf_result = tfidf.fit_transform(review["review_clean"]).toarray()
        tfidf_df = pd.DataFrame(tfidf_result,
                                columns=tfidf.get_feature_names_out())
        tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
        tfidf_df.index = review.index
        review = pd.concat([review, tfidf_df], axis=1)

        # show the percentage distribution of good vs. bad reviews
        f = open("wyniki recenzji.txt", "w")
        print('dystrybucja dobrych do zlych recenzji:')
        print(review["is_bad_review"].value_counts(normalize=True))
        f.write('dystrybucja dobrych do zlych recenzji:\n')
        f.write(str(review["is_bad_review"].value_counts(normalize=True)))
        f.close()

        def show_wordcloud(data, title=None):
            wordcloud = WordCloud(background_color='white',
                                  max_words=200,
                                  max_font_size=40,
                                  scale=3,
                                  random_state=42).generate(str(data))

            fig = plt.figure(1, figsize=(20, 20))
            plt.axis('off')
            if title:
                fig.suptitle(title, fontsize=20)
                fig.subplots_adjust(top=2.3)
            plt.imshow(wordcloud)
            self.canvas.draw()

        show_wordcloud(review["review"])
        # print the 10 most positive reviews
        print('wypisanie 10 najbardziej pozytywnych recenzji:')

        print(review[review["nb_words"] >= 5].sort_values(
            "pos", ascending=False)[["review", "pos"]].head(10))
        f = open("wyniki recenzji.txt", "a")
        f.write('\nwypisanie 10 najbardziej pozytywnych recenzji:\n')
        f.write(
            str(review[review["nb_words"] >= 5].sort_values(
                "pos", ascending=False)[["review", "pos"]].head(10)))
        f.close()
Example #15
def sentiment_analysis(text, comma_threshold=1, interval_threshold=15):
    result = {}
    text = check_comma(text, comma_threshold, interval_threshold)
    text = text_preprocess(text)
    is_suggestion = False
    is_negative = False
    try:
        language = language_detect(text)
        if language == "en":
            sentences = sent_tokenize(text)
            analyzer = SentimentIntensityAnalyzer()
            for sentence in sentences:
                for keyword in suggestion_en:
                    is_suggestion = keyword_match(keyword, sentence)
                    is_suggestion = False
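                    # NOTE: the line above resets is_suggestion to False, so the SUGGESTION label is never applied to English text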
                    if is_suggestion:
                        result[sentence] = "SUGGESTION"
                        break
                if not is_suggestion:
                    score = analyzer.polarity_scores(sentence)
                    if (score['neg'] > 0.09 and score["compound"] < 0.1
                        ) or score["compound"] < -0.3:
                        result[
                            sentence] = "NEGATIVE,             neg: %s, pos: %s, neu: %s, compound: %s" % (
                                score["neg"], score["pos"], score["neu"],
                                score["compound"])
                    elif score["compound"] > 0.3 and score['pos'] > 0.17:
                        result[
                            sentence] = "POSITIVE,             neg: %s, pos: %s, neu: %s, compound: %s" % (
                                score["neg"], score["pos"], score["neu"],
                                score["compound"])
                    else:
                        result[
                            sentence] = "NEUTRAL,              neg: %s, pos: %s, neu: %s, compound: %s" % (
                                score["neg"], score["pos"], score["neu"],
                                score["compound"])
            return result
        if language == "fr":
            sentences = sent_tokenize(text, language="french")
            analyzer = Blobber(pos_tagger=PatternTagger(),
                               analyzer=PatternAnalyzer())
            for sentence in sentences:
                for keyword in suggestion_fr:
                    is_suggestion = keyword_match(keyword, sentence)
                    if is_suggestion:
                        result[sentence] = "SUGGESTION"
                        break
                for keyword in neg_fr:
                    is_negative = keyword_match(keyword, sentence)
                    if is_negative:
                        result[sentence] = "NEGATIVE"
                        break
                if not is_suggestion and not is_negative:
                    score = analyzer(sentence).sentiment
                    if (score[0] < 0
                            and score[1] > 0) or (score[0] < 0.1
                                                  and score[1] > 0.25):
                        result[
                            sentence] = "NEGATIVE              sentiment: %s, subjectivity %s" % (
                                score[0], score[1])
                    elif score[0] > 0.2:
                        result[
                            sentence] = "POSITIVE              sentiment: %s, subjectivity %s" % (
                                score[0], score[1])
                    else:
                        result[
                            sentence] = "NEUTRAL               sentiment: %s, subjectivity %s" % (
                                score[0], score[1])
            return result
    except:
        result["Message"] = "MEANINGLESS"
        return result
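# Usage sketch for sentiment_analysis() above; assumes the helpers it calls
# (check_comma, text_preprocess, language_detect, keyword_match) and the
# suggestion/negative keyword lists exist in the surrounding module.
example_feedback = "The interface is confusing. It would be nice to have a dark mode."
for example_sentence, example_label in sentiment_analysis(example_feedback).items():
    print(example_label, '->', example_sentence)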
def score():

    cursor.execute(updateNulls)  #safety check to catch bad text scraping
    sid = SentimentIntensityAnalyzer()  #create sentiment analyzer object

    numTopics = 3  #how many topics would you like the model to find
    numWords = 7  #how many words would you like to view
    passes = 20  #how many times do you want to go over the data

    #a list of hardcoded words to ignore - created from analyzing previous trials
    stopWords = [
        'the', 'like', 'ya', 'wanna', 'know', 'let', 'got', '4', 'yeah', 'ooh',
        'yo', 'went', 'ric', '2', 'need', 'seen', 'word', 'huh', 'said',
        'big', 'whatchu', 'el', 'gonna', 'cause', 'things', 'gon', 'thing',
        'letting', 'goes', 'tell', 'gave', 'great', '10', 'uh', '25', 'said',
        'stuff', 'tho', 'gotta', '100', 'al', 'lot', 'bout', 'boi', 'dem',
        'oh', 'ooooahahh', '80', 'ig', 'ev', 'ayy', '85', 'vro', 'ok', 'ha',
        'tings', 'nah', 'em', 'wit', 'mi', '6', '21', 'la', 'x2', 'ay', 'du',
        'ba', 'im', 'ahhhh', '7', '12', 'yaaaaa', 'ee', 'waaaaaaa', 'mmm',
        'na', 'buh', 'ga', 'da', 'iii', '47', 'ol', 'une', '0', '1', '2015'
    ]

    cursor.execute(queryGrabSongs)
    data = cursor.fetchall()  #grab all of the song data with lyrics
    print("Percent of total saved songs analyzed: ",
          str(round((len(data) / data[-1][0]), 3) * 100) + '%')

    #for each song with lyrics, individually score and update the vader column then grab the topics using an LDA model for each unique corpora
    for row in data:
        #safety check to make sure there are lyrics
        if row[5]:
            lyrics = row[5]
            rowId = row[0]
            #score the song lyrics grabbed and update the vader score column in the main table
            ss = sid.polarity_scores(lyrics)
            cursor.execute(updateSongVaders, (ss.get('compound'), rowId))

            # create a list of lines in the song lyrics for the bag of words
            lines = lyrics.split('\n')

            # makes a list of the individual words for the dict and corpus
            words = [[
                word.strip() for word in line.lower().split()
                if word not in STOPWORDS and word not in stopWords
                and word.isalnum()
            ] for line in lines]

            #dict and corpus from list of words
            dictionary = corpora.Dictionary(words)
            corpus = [dictionary.doc2bow(text) for text in words]

            try:
                #use lda on bag of words to find topics
                lda = LdaModel(corpus,
                               id2word=dictionary,
                               num_topics=numTopics,
                               passes=passes)

                #for each of the expected 3 topics, score the topic words as well as insert the topic words and vader score
                for topic in lda.print_topics(num_words=numWords):
                    listOfTerms = topic[1].split('+')
                    wordList = []
                    for term in listOfTerms:
                        listItems = term.split('*')
                        wordList.append(listItems[1].replace('"', ''))
                    ss2 = sid.polarity_scores(' '.join(wordList))
                    cursor.execute(
                        insertTopics,
                        (row[2], wordList[0], wordList[1], wordList[2],
                         wordList[3], wordList[4], wordList[5], wordList[6],
                         ss2.get('compound')))
            except:  # safety check so techno and EDM songs with overly repetitive lyrics are not added
                print('Bag of Words too small')

    print('-----DONE!-----')

    print(len(data), 'Scores and Topics Added')
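# Alternative to string-parsing print_topics(): gensim's show_topics(formatted=False)
# returns (topic_id, [(word, weight), ...]) pairs directly. A sketch of that variant,
# assuming an already-trained LdaModel and a SentimentIntensityAnalyzer are passed in:
def topic_words_and_scores(lda_model, analyzer, num_topics=3, num_words=7):
    results = []
    for topic_id, terms in lda_model.show_topics(num_topics=num_topics,
                                                 num_words=num_words,
                                                 formatted=False):
        words = [word for word, weight in terms]
        compound = analyzer.polarity_scores(' '.join(words))['compound']
        results.append((topic_id, words, compound))
    return results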
def train_test():

    all_data, all_labels = extract_csv('../data/comments.csv', LABELS,
                                       CATEGORY)
    all_data = np.asarray(all_data)
    all_labels = np.asarray(all_labels)

    # dictionary to count top instances of words
    aggregator = []
    for i in range(len(LABELS)):
        aggregator.append(dict())

    randomization_scheme = np.arange(len(all_data))
    np.random.shuffle(randomization_scheme)
    randomized_data = all_data[randomization_scheme]
    randomized_labels = all_labels[randomization_scheme]

    num_trials = 10

    X_trains, y_trains, X_vals, y_vals = split_folds(
        randomized_data.reshape((len(all_data), 1)), randomized_labels,
        num_trials)

    all_predictions = None

    for trial in range(num_trials):

        train_messages = X_trains[trial]
        train_labels = y_trains[trial]
        val_messages = X_vals[trial]
        val_labels = y_vals[trial]

        dictionary = create_dictionary(train_messages)
        train_matrix = transform_text(train_messages, dictionary)
        val_matrix = transform_text(val_messages, dictionary)

        if MODEL_CHOICE == "LOGREG":

            logreg = LogisticRegression()
            logreg.fit(train_matrix, train_labels)

            #logistic_regresion_model = fit_logistic_regresion_model(train_matrix, train_labels, labels)
            #logistic_regresion_predictions = predict_from_logistic_regresion_model(logistic_regresion_model, val_matrix, True, val_messages)
            #logistic_regresion_predictions = predict_from_logistic_regresion_model(logistic_regresion_model, val_matrix)
            logistic_regresion_predictions = logreg.predict(val_matrix)
            logistic_regresion_accuracy = np.mean(
                logistic_regresion_predictions == val_labels)
            print(
                'Logistic Regression had an accuracy of {} on the testing set'.
                format(logistic_regresion_accuracy))
            #get_top_five_logistic_regresion_words(logreg, dictionary, aggregator)
            #write_csv_mixed(val_messages, val_labels, logistic_regresion_predictions, trial, CONFUSION_MATRIX_OPTION)

        elif MODEL_CHOICE == "NAIVE":
            naive_bayes_model = fit_naive_bayes_model(train_matrix,
                                                      train_labels, LABELS)
            naive_bayes_predictions = predict_from_naive_bayes_model(
                naive_bayes_model, val_matrix)
            naive_bayes_accuracy = np.mean(
                naive_bayes_predictions == val_labels)
            print(
                'Naive Bayes had an accuracy of {} on the testing set'.format(
                    naive_bayes_accuracy))
            get_top_five_logistic_regresion_words(naive_bayes_model,
                                                  dictionary, aggregator)
            #write_csv_mixed(val_messages, val_labels, naive_bayes_predictions, trial, CONFUSION_MATRIX_OPTION)

        elif MODEL_CHOICE == "OFFSHELF":

            sid = SentimentIntensityAnalyzer()
            converted = [x[0].decode('utf-8') for x in val_messages]

            sid_predictions = predict_from_off_shelf_model(sid, converted)
            sid_accuracy = np.mean(sid_predictions == val_labels)
            print('Off The Shelf had an accuracy of {} on the testing set'.
                  format(sid_accuracy))
            #write_csv_mixed(val_messages, val_labels, sid_predictions, trial, CONFUSION_MATRIX_OPTION)

        # Just do this once
        if trial == 0:
            all_matrix = transform_text(all_data, dictionary)
            if MODEL_CHOICE == "LOGREG":
                all_predictions = logreg.predict(all_matrix)
            elif MODEL_CHOICE == "NAIVE":
                all_predictions = predict_from_naive_bayes_model(
                    naive_bayes_model, all_matrix)
            elif MODEL_CHOICE == "OFFSHELF":
                all_predictions = predict_from_off_shelf_model(
                    sid, randomized_data)

    plot_frequent_words(LABELS, aggregator)
def calculate_sentiment(phrase):

    # Worst hotel ive stayed in. - The lock housing was exposed meaning it wasnt difficult to break into our room. - No safety deposit boxes in rooms. - Hot water constantly running out. -  - Virtually no cooking utensils, making a basic task such as hard boiling an egg extremely difficult.

    df = pd.read_csv(f'{main_path}/data/adjectives_lexicon.csv',
                     delimiter=",",
                     usecols=['word', 'sentiment'])
    print(df)

    tokens = nltk.word_tokenize(phrase.lower())
    print(tokens)

    print()
    print('=====================')
    print('sentiment lexicon - word by word')
    print('=====================')

    sentim_scores = []
    for key in tokens:
        try:
            x = df[df.word.str.contains(r'\b{}\b'.format(key), na=False)]
            print(x)
            print(x.iloc[0])
            sentim_scores.append(list(x.values[0])[1])
        except:
            sentim_scores.append(0.1)
    #
    print(sentim_scores)

    mean_sent_score = mean(sentim_scores)
    print(mean_sent_score)

    if mean_sent_score > 0:
        overall_sentiment = 1
    else:
        overall_sentiment = -1

    print(overall_sentiment)

    tknzr = TweetTokenizer()

    def lemmatize(text):
        '''lemmatizes text for the given pos tags - NN,VB,JJ'''

        wnl = WordNetLemmatizer()
        for word, tag in pos_tag(tknzr.tokenize(text)):

            if tag.startswith("NN"):
                yield wnl.lemmatize(word, pos='n')
            elif tag.startswith('VB'):
                yield wnl.lemmatize(word, pos='v')
            elif tag.startswith('JJ'):
                yield wnl.lemmatize(word, pos='a')
            else:
                yield word

    print()
    print('=====================')
    print('Textblob')
    print('=====================')

    from textblob.sentiments import NaiveBayesAnalyzer
    textblob_sent_score = TextBlob(phrase, analyzer=NaiveBayesAnalyzer()).polarity
    print(textblob_sent_score)

    if textblob_sent_score > 0:
        overall_textblob_sentiment = 1
    elif textblob_sent_score < 0:
        overall_textblob_sentiment = -1
    else:
        overall_textblob_sentiment = 0

    print(overall_textblob_sentiment)

    print()
    print('=====================')
    print('Vader Sentiment Intensity Analyzer')
    print('=====================')

    sid = SentimentIntensityAnalyzer()

    vader_sent_score = sid.polarity_scores(phrase)
    print(vader_sent_score)

    if vader_sent_score['compound'] > 0:
        overall_vader_sentiment = 1
    elif vader_sent_score['compound'] < 0:
        overall_vader_sentiment = -1
    else:
        overall_vader_sentiment = 0

    print(overall_vader_sentiment)

    print()
    print('=====================')
    print('Afinn Analyzer')
    print('=====================')

    # https://github.com/fnielsen/afinn
    afinn = Afinn()
    afinn_sent_score = afinn.score(phrase)
    print(afinn_sent_score)

    if afinn_sent_score > 0:
        overall_afinn_sentiment = 1
    else:
        overall_afinn_sentiment = -1

    print(overall_afinn_sentiment)

    print()
    print('=====================')
    print('ML model')
    print('=====================')

    trained_model_score = trained_sentiment_model([phrase])

    if trained_model_score == 'positive':
        overall_model_sentiment = 1
    else:
        overall_model_sentiment = -1

    print('overall_model_sentiment')
    print(overall_model_sentiment)

    prob_sentiment = mean([
        overall_sentiment, overall_vader_sentiment, overall_afinn_sentiment,
        overall_textblob_sentiment, overall_model_sentiment
    ])

    print()
    print('prob_sentiment:')
    print(prob_sentiment)

    if prob_sentiment < 0:
        calculated_sentiment = 'neg'
    else:
        calculated_sentiment = 'pos'
    print(calculated_sentiment)
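# The final label is just the sign of the mean of the five +/-1 votes; for example,
# using the same mean() helper as above:
example_votes = [1, -1, -1, 1, -1]   # lexicon, TextBlob, VADER, AFINN, trained model (illustrative)
print('neg' if mean(example_votes) < 0 else 'pos')   # mean is -0.2, so this prints 'neg'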
def get_mode_sentiment():
    all_data, all_labels = extract_csv('../data/comments.csv', LABELS,
                                       CATEGORY)
    all_data = np.asarray([[x] for x in all_data], dtype="S1000")
    all_labels = np.asarray(all_labels)

    predictions = []
    for _ in range(len(all_labels)):
        predictions.append([])

    trial = 0

    not_done = True

    while not_done:

        randomization_scheme = np.arange(len(all_data))
        np.random.shuffle(randomization_scheme)
        useful = False
        for i in range(len(all_data) // VAL_SPLIT):
            if len(predictions[randomization_scheme[i]]) < MAX_MODE:
                useful = True
                break
        if useful:
            trial += 1
            if (trial % 10) == 0:
                print("Trial:", trial)

            randomized_data = all_data[randomization_scheme]
            randomized_labels = all_labels[randomization_scheme]

            train_messages = randomized_data[len(all_data) // VAL_SPLIT:]
            train_labels = randomized_labels[len(all_data) // VAL_SPLIT:]
            val_messages = randomized_data[:len(all_data) // VAL_SPLIT]
            val_labels = randomized_labels[:len(all_data) // VAL_SPLIT]

            dictionary = create_dictionary(train_messages)
            train_matrix = transform_text(train_messages, dictionary)
            val_matrix = transform_text(val_messages, dictionary)

            guesses = None

            if MODEL_CHOICE == "LOGREG":

                logreg = LogisticRegression()
                logreg.fit(train_matrix, train_labels)

                guesses = logreg.predict(val_matrix)

            elif MODEL_CHOICE == "NAIVE":
                naive_bayes_model = fit_naive_bayes_model(
                    train_matrix, train_labels, LABELS)
                guesses = predict_from_naive_bayes_model(
                    naive_bayes_model, val_matrix)

            elif MODEL_CHOICE == "OFFSHELF":

                sid = SentimentIntensityAnalyzer()
                converted = [x[0].decode('utf-8') for x in val_messages]

                guesses = predict_from_off_shelf_model(sid, converted)

            for i in range(len(guesses)):
                if len(predictions[randomization_scheme[i]]) < MAX_MODE:
                    predictions[randomization_scheme[i]].append(guesses[i])

            total = 0
            for i in range(len(predictions)):
                total += len(predictions[i])
            if total == len(all_data) * MAX_MODE:
                not_done = False

    total_all_sentiment = np.zeros(len(LABELS))
    total_mode_sentiment = np.zeros(len(LABELS))
    per_comment_proportion = np.zeros((len(predictions), len(LABELS)))
    mode_sentiment = np.zeros(len(predictions))
    for i in range(len(predictions)):
        curr_sentiment = np.zeros(len(LABELS))
        for label in predictions[i]:
            curr_sentiment[label - 1] += 1
            total_all_sentiment[label - 1] += 1
        mode_sentiment[i] = np.argmax(curr_sentiment) + 1
        per_comment_proportion[i] = curr_sentiment / sum(curr_sentiment)
        total_mode_sentiment[int(mode_sentiment[i]) - 1] += 1
    total_all_sentiment = total_all_sentiment / sum(total_all_sentiment)
    total_mode_sentiment = total_mode_sentiment / sum(total_mode_sentiment)
    print("Total proportion of all sentiments:", total_all_sentiment)
    print("Total proportion of sentiment by mode:", total_mode_sentiment)
    print("Total Distance:",
          np.sum(np.abs(total_all_sentiment - total_mode_sentiment)))

    my_accuracy = np.mean(mode_sentiment == all_labels)
    print("This is the accuracy:", my_accuracy)
    mimic_qualtrics("../results/mimic_results2.csv",
                    "../data/all_survey_responses.csv", "../data/comments.csv",
                    all_data.flatten(), all_labels, mode_sentiment.astype(int))
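The per-comment mode used in get_mode_sentiment can be illustrated in isolation; a small sketch, assuming labels run from 1 to num_labels as in the function above:

import numpy as np

def mode_label(votes, num_labels):
    # votes: repeated predictions for one comment, each in 1..num_labels
    counts = np.zeros(num_labels)
    for label in votes:
        counts[label - 1] += 1
    return int(np.argmax(counts)) + 1

# Five trials voted [2, 3, 2, 2, 1] -> mode label 2
print(mode_label([2, 3, 2, 2, 1], num_labels=3))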
def analyzeStocks():
    html_tables = {}

    # For every table in the datasets folder...
    for table_name in os.listdir('datasets'):
        #this is the path to the file.
        table_path = f'datasets/{table_name}'
        # Open the file and parse it, closing the handle when done
        with open(table_path, 'r') as table_file:
            html = BeautifulSoup(table_file, features="lxml")

        html_table = html.find(id="news-table")
        # Adding the table to our dictionary
        html_tables[table_name] = html_table

    # Read one single day of headlines
    tsla = html_tables['tsla_22sep.html']
    # Get all the table rows tagged in HTML with <tr> into 'tsla_tr'
    tsla_tr = tsla.findAll('tr')

    # For each row...
    for i, table_row in enumerate(tsla_tr):

        link_text = table_row.a.get_text()

        data_text = table_row.td.get_text()
        # Print the count
        print(f'{i}:')
        # Print the contents of 'link_text' and 'data_text'
        print(link_text)
        print(data_text)
        # Exit the loop after the first four rows (i == 0..3) to avoid flooding the output.
        if i == 3:
            break

    # Hold the parsed news into a list
    parsed_news = []
    # Iterate through the news
    for file_name, news_table in html_tables.items():
        # Iterate through all tr tags in 'news_table'
        for x in news_table.findAll('tr'):

            text = x.get_text()
            headline = x.a.get_text()

            date_scrape = x.td.text.split()

            if len(date_scrape) == 1:
                time = date_scrape[0]
            else:
                date = date_scrape[0]
                time = date_scrape[1]

            # Extract the ticker from the file name, get the string up to the 1st '_'
            ticker = file_name.split('_')[0]
            # Append ticker, date, time and headline as a list to the 'parsed_news' list
            parsed_news.append([ticker, date, time, headline])

    # New words and values
    new_words = {
        'crushes': 10,
        'beats': 5,
        'misses': -5,
        'trouble': -10,
        'falls': -100,
    }
    # Instantiate the sentiment intensity analyzer with the existing lexicon
    vader = SentimentIntensityAnalyzer()
    # Update the lexicon
    vader.lexicon.update(new_words)

    columns = ['ticker', 'date', 'time', 'headline']

    scored_news = pd.DataFrame(parsed_news, columns=columns)
    # Iterate through the headlines and get the polarity scores
    scores = [
        vader.polarity_scores(headline)
        for headline in scored_news.headline.values
    ]

    scores_df = pd.DataFrame(scores)
    # Join the DataFrames
    scored_news = pd.concat([scored_news, scores_df], axis=1)
    # Convert the date column from string to datetime
    scored_news['date'] = pd.to_datetime(scored_news.date).dt.date

    plt.style.use("fivethirtyeight")

    # Group by date and ticker columns from scored_news and calculate the mean
    mean_c = scored_news.groupby(['date', 'ticker']).mean()

    mean_c = mean_c.unstack(level=1)

    mean_c = mean_c.xs('compound', axis=1)

    mean_c.plot.bar()
    plt.savefig("plot1.png")

    # Analyzing just one day of stock trends

    # Set the index to ticker and date
    scored_news_clean = scored_news.drop_duplicates(
        subset=['ticker', 'headline'])
    single_day = scored_news_clean.set_index(['ticker', 'date'])

    single_day = single_day.loc['fb']
    # Selecting the 3rd of January of 2019
    single_day = single_day.loc['2019-01-03']
    # Convert the datetime string to just the time
    single_day['time'] = pd.to_datetime(single_day['time'])
    single_day['time'] = single_day.time.dt.time

    single_day = single_day.set_index('time')
    # Sort it
    single_day = single_day.sort_index(ascending=True)

    # Visualizing sentiment for that day
    TITLE = "Negative, neutral, and positive sentiment for FB on 2019-01-03"
    COLORS = ["red", "orange", "green"]
    # Drop the columns that aren't useful for the plot
    plot_day = single_day.drop(['headline', 'compound'], axis=1)
    # Rename the remaining score columns; VADER returns them in the order neg, neu, pos
    plot_day.columns = ["negative", "neutral", "positive"]

    plot_day.plot(kind='bar').legend(bbox_to_anchor=(1, 1))
    plt.savefig("plot2.png")
Beispiel #21
0
parsed_date = []

for ticker, news_table in news_tables.items():

    for row in news_table.findAll('tr'):

        title = row.a.text
        date_data = row.td.text.split(' ')

        if len(date_data) == 1:
            time = date_data[0]
        else:
            date = date_data[0]
            time = date_data[1]

        parsed_date.append([ticker, date, time, title])

df = pd.DataFrame(parsed_date, columns=['ticker', 'date', 'time', 'title'])

vader = SentimentIntensityAnalyzer()

f = lambda title: vader.polarity_scores(title)['compound']
df['compound'] = df['title'].apply(f)
df['date'] = pd.to_datetime(df.date).dt.date

plt.figure(figsize=(10, 8))

mean_df = df.groupby(['ticker', 'date']).mean()
mean_df = mean_df.unstack()
mean_df = mean_df.xs('compound', axis="columns").transpose()
mean_df.plot(kind='bar')
plt.show()
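The groupby/unstack/xs chain above turns the per-headline scores into a date-by-ticker table of mean compound scores; a small sketch with made-up numbers shows the reshaping:

import pandas as pd

df = pd.DataFrame({
    'ticker': ['amzn', 'amzn', 'tsla'],
    'date': ['2019-01-03', '2019-01-03', '2019-01-03'],
    'compound': [0.4, 0.2, -0.1],
})

mean_df = df.groupby(['ticker', 'date']).mean()
mean_df = mean_df.unstack()                       # the 'date' level becomes columns
mean_df = mean_df.xs('compound', axis="columns")  # keep only the compound scores
print(mean_df.transpose())                        # dates as rows, tickers as columns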
Beispiel #22
0
def news_sentiment(symbol):
    # Import libraries
    import pandas as pd
    from bs4 import BeautifulSoup
    import matplotlib.pyplot as plt
    from urllib.request import urlopen, Request
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    # Parameters
    n = 5  # number of article headlines displayed per ticker
    tickers = [symbol]

    # Get Data
    finwiz_url = 'https://finviz.com/quote.ashx?t='
    news_tables = {}

    for ticker in tickers:
        url = finwiz_url + ticker
        req = Request(url=url, headers={'user-agent': 'my-app/0.0.1'})
        resp = urlopen(req)
        html = BeautifulSoup(resp, features="lxml")
        news_table = html.find(id='news-table')
        news_tables[ticker] = news_table

    try:
        for ticker in tickers:
            df = news_tables[ticker]
            df_tr = df.findAll('tr')

            # print ('\n')
            # print ('Recent News Headlines for {}: '.format(ticker))

            for i, table_row in enumerate(df_tr):
                a_text = table_row.a.text
                td_text = table_row.td.text
                td_text = td_text.strip()
                # print(a_text,'(',td_text,')')
                if i == n - 1:
                    break
    except KeyError:
        pass

    # Iterate through the news
    parsed_news = []
    for file_name, news_table in news_tables.items():
        for x in news_table.findAll('tr'):
            text = x.a.get_text()
            date_scrape = x.td.text.split()

            if len(date_scrape) == 1:
                time = date_scrape[0]

            else:
                date = date_scrape[0]
                time = date_scrape[1]

            ticker = file_name.split('_')[0]

            parsed_news.append([ticker, date, time, text])

    # Sentiment Analysis
    analyzer = SentimentIntensityAnalyzer()

    columns = ['Ticker', 'Date', 'Time', 'Headline']
    news = pd.DataFrame(parsed_news, columns=columns)
    scores = news['Headline'].apply(analyzer.polarity_scores).tolist()

    df_scores = pd.DataFrame(scores)
    news = news.join(df_scores, rsuffix='_right')

    # View Data
    news['Date'] = pd.to_datetime(news.Date).dt.date

    unique_ticker = news['Ticker'].unique().tolist()
    news_dict = {
        name: news.loc[news['Ticker'] == name]
        for name in unique_ticker
    }

    values = []
    for ticker in tickers:
        dataframe = news_dict[ticker]
        dataframe = dataframe.set_index('Ticker')
        # dataframe = dataframe.drop(columns = ['Headline'])
        # print ('\n')
        # print (dataframe.head())
        mean = round(dataframe['compound'].mean(), 2)
        values.append(mean)

    # df = pd.DataFrame(list(zip(tickers, values)), columns =['Ticker', 'Mean Sentiment'])
    # df = df.set_index('Ticker')
    # df = df.sort_values('Mean Sentiment', ascending=False)

    return dataframe
Beispiel #23
0
 def __init__(self, response_time=False):
     if response_time:
         self.waiting_response = {}
     self.response_time = response_time
     self.sentiment_analyzer = SentimentIntensityAnalyzer()
Beispiel #24
0
def sentiment_of_lyrics(lyrics):
    if lyrics != "":
        sid = SentimentIntensityAnalyzer()
        return sid.polarity_scores(lyrics)
    else:
        return {'neg': 'none', 'neu': 'none', 'pos': 'none', 'compound': 'none' }
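A brief usage sketch for sentiment_of_lyrics, assuming the function above is in scope and the vader_lexicon has been downloaded (the lyrics strings are made up):

print(sentiment_of_lyrics("I feel good, so good today"))
# -> the usual VADER dict with 'neg', 'neu', 'pos' and 'compound' scores
print(sentiment_of_lyrics(""))
# -> {'neg': 'none', 'neu': 'none', 'pos': 'none', 'compound': 'none'}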
Beispiel #25
0
 def __init__(self, model_file: str = None) -> None:
     super().__init__()
     self.vader_sia = SentimentIntensityAnalyzer()
def make_graphs():
    fs = FileSystemStorage()
    nlp = spacy.load("en_core_web_sm")
    path = settings.MEDIA_ROOT
    mentions = pd.read_csv((path + "/csv/comments_and_mentions.csv"), encoding="utf8")
    mentions.head()
    text = mentions['content.text'].unique()
    text_string = np.array2string(text)

    nltk.download('punkt')
    nltk.download('vader_lexicon')
    nltk.download('stopwords')

    words = nltk.tokenize.word_tokenize(text_string)
    wordList = []

    sentences = nltk.tokenize.sent_tokenize(text_string)

    stop_words = nltk.corpus.stopwords.words('english')

    punctuations = list(string.punctuation)
    # print(punctuations)
    for i in range(len(words)):
        words[i] = words[i].lower()

    # Remove stop words; iterate over a copy so removals don't skip items
    for word in list(words):
        if word in stop_words:
            try:
                while True:
                    words.remove(word)
            except ValueError:
                pass
            wordList.append(word)

    for punctuation in punctuations:
        if punctuation in words:
            try:
                while True:
                    words.remove(punctuation)
            except ValueError:
                pass
            wordList.append(punctuation)

    clean_text = []

    for i in text:
        doc = nlp(i)

        for token in doc:
            if token.is_alpha and not token.is_stop:
                clean = {
                    'text': token.text,
                    'lemma': token.lemma_,
                    'part_of_speech': token.pos_,
                    'pos_tag': token.tag_}
                clean_text.append(clean)

    df = pd.DataFrame(clean_text)

    # saving the dataframe
    df.to_csv(os.path.join(path, 'csv/nlu-text.csv'))

    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    sid = SentimentIntensityAnalyzer()

    text_df = pd.DataFrame(data=text, columns=['text'])

    text_df['scores'] = text_df['text'].apply(lambda comment: sid.polarity_scores(comment))

    text_df['scores'].head()

    text_df['compound'] = text_df['scores'].apply(lambda score_dict: score_dict['compound'])

    text_df.head()

    text_df['comp_score'] = text_df['compound'].apply(lambda c: 'pos' if c >= 0 else 'neg')

    text_df.head()

    neg_text = text_df.where(text_df['comp_score'] == 'neg').dropna()
    neg_text.head()

    pos_text = text_df.where(text_df['comp_score'] == 'pos').dropna()
    pos_text.head()

    text_df.to_csv(os.path.join(path, 'csv/sentiment-text.csv'), encoding='utf8')

    stopwords = set(STOPWORDS)
    stopwords.add("Philip")

    # Generate a word cloud image
    wordcloud = WordCloud(background_color="white", stopwords=stopwords).generate(df['text'].to_string())

    # Display the generated image:
    # the matplotlib way:
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title("All Conversation Words")
    plt.axis("off")
    if fs.exists(os.path.join(path, "graphs/sent_analysis_all_words.png")):  # if file exists, overwrite with new file
        os.remove(os.path.join(settings.MEDIA_ROOT, os.path.join(path, "graphs/sent_analysis_all_words.png")))
    plt.savefig(os.path.join(path, "graphs/sent_analysis_all_words.png"))

    # lower max_font_size
    wordcloud = WordCloud(background_color="white", max_font_size=40, stopwords=stopwords, max_words=50).generate(
        df['text'].to_string())
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.title("All Conversation Words (Smaller Font)")
    plt.axis("off")
    if fs.exists(os.path.join(path, "graphs/sent_analysis_all_words_small.png")):
        os.remove(os.path.join(settings.MEDIA_ROOT, os.path.join(path, "graphs/sent_analysis_all_words_small.png")))
    plt.savefig(os.path.join(path, "graphs/sent_analysis_all_words_small.png"))

    wordcloud = WordCloud(background_color="white", max_words=50).generate(
        df.where(df['part_of_speech'] == 'PROPN').dropna()['text'].to_string())
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.title("Pronouns Used")
    plt.axis("off")
    if fs.exists(os.path.join(path, "graphs/sent_analysis_pronouns.png")):
        os.remove(os.path.join(settings.MEDIA_ROOT, os.path.join(path, "graphs/sent_analysis_pronouns.png")))
    plt.savefig(os.path.join(path, "graphs/sent_analysis_pronouns.png"))

    clean_neg_text = []

    # Tokenize only the comments classified as negative
    for i in neg_text['text']:
        doc = nlp(i)

        for token in doc:
            if token.is_alpha and not token.is_stop:
                clean = {
                    'text': token.text,
                    'lemma': token.lemma_,
                    'part_of_speech': token.pos_,
                    'pos_tag': token.tag_}
                clean_neg_text.append(clean)

    neg_text = pd.DataFrame(clean_neg_text)
    neg_text.to_csv(os.path.join(path, 'csv/neg_text_nlp.csv'), encoding='utf8')
    wordcloud = WordCloud(background_color="white", stopwords=stopwords, max_words=50).generate(
        neg_text['text'].to_string())
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.title("Negative Words Used")
    plt.axis("off")
    if fs.exists(os.path.join(path, "graphs/sent_analysis_neg.png")):
        os.remove(os.path.join(settings.MEDIA_ROOT, os.path.join(path, "graphs/sent_analysis_neg.png")))
    plt.savefig(os.path.join(path, "graphs/sent_analysis_neg.png"))

    clean_pos_text = []

    # Tokenize only the comments classified as positive
    for i in pos_text['text']:
        doc = nlp(i)

        for token in doc:
            if token.is_alpha and not token.is_stop:
                clean = {
                    'text': token.text,
                    'lemma': token.lemma_,
                    'part_of_speech': token.pos_,
                    'pos_tag': token.tag_}
                clean_pos_text.append(clean)

    pos_text = pd.DataFrame(clean_pos_text)
    pos_text.to_csv(os.path.join(path, 'csv/pos_text_nlp.csv'), encoding='utf8')
    wordcloud = WordCloud(background_color="white", stopwords=stopwords, max_words=50).generate(
        pos_text['text'].to_string())
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.title("Positive Words Used")
    plt.axis("off")
    if fs.exists(os.path.join(path, "graphs/sent_analysis_pos.png")):
        os.remove(os.path.join(settings.MEDIA_ROOT, os.path.join(path, "graphs/sent_analysis_pos.png")))
    plt.savefig(os.path.join(path, "graphs/sent_analysis_pos.png"))
time.sleep(1)
        
search = UTILITY_COMPANY
numberOfUsers = 1

# find and respond to tweets
for tweet in tweepy.Cursor(api.search, search, lang='en').items(numberOfUsers):
    tweetId = tweet.user.id
    username = tweet.user.screen_name
    phrase = 'Ever thought about renewable energy? Check out my website'
    print(tweet.text)
    para = nltk.sent_tokenize(tweet.text)  # this gives us a list of sentences

# now loop over each sentence and tokenize it separately
    for sentence in para:
        tokenized_para = nltk.word_tokenize(sentence)
        tags = nltk.pos_tag(tokenized_para)
        print(tags)
        tree = chunk.ne_chunk(tags)
        tree.draw()

#analyze sentiment in tweet
tool = SentimentIntensityAnalyzer()
for sentence in para:
    print(sentence)
    ss = tool.polarity_scores(sentence)
    for i in sorted(ss):
        print('{0}: {1}, '.format(i, ss[i]), end='') 
        if ss["compound"] < 0.0:
Beispiel #28
0
class VaderServiceEn:
    analyzer = SentimentIntensityAnalyzer()

    def getScore(self, review):
        return self.analyzer.polarity_scores(unidecode(review.lower()))
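A hedged usage sketch for VaderServiceEn, assuming the class definition above (with its unidecode and SentimentIntensityAnalyzer imports) is in scope and the vader_lexicon has been downloaded; the review string is made up:

service = VaderServiceEn()
scores = service.getScore("Tres bon produit, works flawlessly!")
# The usual VADER dict: 'neg', 'neu', 'pos' and 'compound'
print(scores)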
Beispiel #29
0
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg,
                                                       min_freq=4)
    len(unigram_feats)

    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)

    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)

    for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key, value))

    sid = SentimentIntensityAnalyzer()

    auth = OAuthHandler(ckey, csecret)
    auth.set_access_token(atoken, asecret)
    twitterStream = Stream(auth, listener())
    #twitterStream.filter(track=["googl", "google", "goog"])
    twitterStream.filter(track=["aapl", "apple", "iphone", "itunes"])
    #ply.show()

except Exception as e:
    print(e)
    print("3")
    pass
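Beispiel #29 above trains an NLTK SentimentAnalyzer before handing real-time scoring to VADER and the Twitter stream; a self-contained toy version of that training/evaluation pipeline (the tiny corpus is made up) could look like this:

from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, mark_negation

# Tiny hand-made corpus: (token list, label) pairs
training_docs = [
    ("this movie was great and fun".split(), 'pos'),
    ("what a wonderful happy story".split(), 'pos'),
    ("this movie was boring and bad".split(), 'neg'),
    ("a terrible sad waste of time".split(), 'neg'),
]
testing_docs = [
    ("a great and happy movie".split(), 'pos'),
    ("boring terrible story".split(), 'neg'),
]

sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=1)
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)

classifier = sentim_analyzer.train(NaiveBayesClassifier.train, training_set)
for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))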
Beispiel #30
0
''' read saved tweets per company '''

companies = ['Microsoft', 'Apple', 'Mastercard', 'Intel', 
             'Cisco_Systems', 'GDP', 'Dow_Jones']
path = os.path.abspath(os.getcwd())+"/"          
file_end = "_clean_20170101_20200401.csv"

companylist = {}
for company in companies:
    filename = path + company + file_end
    companylist[company] = pd.read_csv(filename, sep='|', index_col=0)

''' Sentiment Analysis '''
# Sentiment for each Tweet ----------------------------------------------------

sentiment_pt = SentimentIntensityAnalyzer()
companylist_sentiment = {}
for company, df in companylist.items():
    df["Comp"] = ''
    df["Negative"] = ''
    df["Neutral"] = ''
    df["Positive"] = ''
    df = df.dropna(subset=['Tweets']).reset_index(drop=True)
    for indexx, row in tqdm(df.iterrows()):
        try:            
            sentence_pt=unicodedata.normalize('NFKD', df.loc[indexx, 'Tweets'])
            sentence_pt_sentiment=sentiment_pt.polarity_scores(sentence_pt)
            df.at[indexx, 'Comp'] = sentence_pt_sentiment['compound']
            df.at[indexx, 'Negative'] = sentence_pt_sentiment['neg']
            df.at[indexx, 'Neutral'] = sentence_pt_sentiment['neu']
            df.at[indexx, 'Positive'] = sentence_pt_sentiment['pos']