Example #1
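Note: every snippet on this page calls sia() without showing its import. The assumed common setup, which Example #12 spells out explicitly, is:

# One-time setup assumed by all examples below
import nltk
nltk.download('vader_lexicon')  # fetch the VADER lexicon once
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia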
    def vader_analysis(self, conn_, c, date_, delta_day, index):
        """Method that performs Sentiment analysis on news headlines. For a given day, the analysis is made from 9:30 am
        until 9:30 am (EST) the next day.

        For example, February 25th 2021 in the table would be analysis from news from 9:30am this morning until
        9:29 am February 26th 2021.
        """

        isValid = self.check_size(conn_, c, date_, delta_day, index)

        # if the sample size is >= `self.min_sample`, run VADER sentiment analysis
        if isValid:
            list_results = []
            c.execute(
                f"SELECT {self.news_header[2]}  from {self.ticker} where {self.news_header[1]} >= "
                f"{self.start_debut_tempo} and {self.news_header[1]} < {self.end_date_tempo}"
            )
            rows = c.fetchall()
            for row in rows:
                list_results.append(sia().polarity_scores(row[0])['compound'])

            # for convenience, the previous day's sentiment score sits on the same
            # row as the current day's return (index + 1)
            self.pd_data.loc[index + 1,
                             self.sentiment_name] = mean(list_results)

        return self.pd_data
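A note on the query above: the date bounds are interpolated straight into the SQL string. A sketch of the same statement with bound parameters (assuming the sqlite3 driver's `?` placeholders; identifiers such as the table and column names cannot be parameterized, so they still come from the object's attributes):

            # Sketch only: pass the date bounds as placeholders instead of
            # f-string interpolation
            c.execute(
                f"SELECT {self.news_header[2]} FROM {self.ticker} "
                f"WHERE {self.news_header[1]} >= ? AND {self.news_header[1]} < ?",
                (self.start_debut_tempo, self.end_date_tempo),
            )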
Example #2
    def on_status(self, status):
        """
        When the client receives a tweet
            :param status:  the tweet status
        """
        try:
            if status._json["user"]["name"] not in company_username :
                if "extended_tweet" in status._json and "full_text" in status._json["extended_tweet"]:
                    status._json["text"] = status._json["extended_tweet"]["full_text"]
                text = status._json["text"]
                ps = sia().polarity_scores(text)
                score = ps["compound"]
                logging.info(f"tweet : {score}")
                status._json['score'] = score

                # Finding the company the tweet is about
                status._json["company"] = "Unknown"
                if status._json["entities"]["user_mentions"] in company_username :
                    status._json["company"] = dict_username[status._json["entities"]["user_mentions"]]
                else : 
                    for company in TARGET :
                        if company in text :
                            status._json["company"] = company

                self.producer.send(
                    self.topic,
                    value=status._json,
                )

        except StopIteration as e:
            self.producer.close()
            running = False
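For reference, polarity_scores returns four fields; 'compound' is the normalized score in [-1, 1] that these examples key on, with a ±0.05 band around zero conventionally read as neutral:

scores = sia().polarity_scores("Great quarter, revenue climbed sharply!")
print(scores)  # dict with keys 'neg', 'neu', 'pos', 'compound'
# compound > 0.05 is commonly read as positive, < -0.05 as negative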
Example #3
def calculate_sentiment():
    with open("jpmorgan.json") as json_file:
        for line in json_file:
            tweet = json.loads(line)
            if tweet['lang'] == 'en':
                tweet = tweet['full_text'].lower()
                current_sentiment = sia().polarity_scores(tweet)
                all_sentiment.append(current_sentiment['compound'])
                if current_sentiment['compound'] < 0:
                    negative_tweets.append(tweet)
                elif current_sentiment['compound'] > 0:
                    positive_tweets.append(tweet)
                if re.search('jpmcoin', tweet, flags=re.IGNORECASE):
                    product_sentiment.append(current_sentiment['compound'])
    company_sentiment_mean = statistics.mean(all_sentiment)
    product_sentiment_mean = statistics.mean(product_sentiment)
    print('\nJP Morgan\'s mean sentiment score: ', company_sentiment_mean)
    print('JPM Coin\'s mean sentiment score: ', product_sentiment_mean)
    if product_sentiment_mean > company_sentiment_mean:
        print(
            "As you can see, the product's mean sentiment score is higher than the company's score.\n"
        )
    else:
        print(
            "As you can see, the company's mean sentiment score is higher than the product's score.\n"
        )
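calculate_sentiment reads and mutates module-level state that the excerpt doesn't show; the assumed surrounding setup would be something like:

# Assumed module-level setup for calculate_sentiment (not in the excerpt)
import json
import re
import statistics
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia

all_sentiment, positive_tweets, negative_tweets, product_sentiment = [], [], [], []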
Example #4
    def on_status(self, status):

        try:
            if status.coordinates is not None and status.place.country_code == 'US' and not status.retweeted:
                id_str = status.id_str  # ID given to specific tweet
                name = status.user.screen_name  # Name of Tweeter
                created = status.created_at  # When the tweet was sent
                text = status.text  # The tweet

                coords = status.coordinates  # Coordinates from where the tweet was sent
                coords = json.dumps(
                    coords
                )  # serialize the coords dict to a JSON string so it can be stored in SQL
                # Add sentiment analysis
                sid = sia()
                polarity = sid.polarity_scores(text)[
                    "compound"]  # Set sentiment score to "polarity" object
                if polarity != 0.0:

                    #Store Tweets into SQLite db
                    table = db["tweets"]
                    table.insert(
                        dict(
                            tweet_id=id_str,
                            user=name,
                            tweet_datetime=created,
                            text=text,
                            sentement_score=polarity,
                            coords=coords,
                        ))
                    print("Tweet Added", text)
        except TypeError:
            # tweets missing the expected fields are skipped
            print('e')
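The db object above behaves like the `dataset` library's table interface (db["tweets"], table.insert(dict(...))); an inferred setup, not shown in the excerpt, would be:

# Inferred setup (assumption): the `dataset` library over SQLite
import dataset
db = dataset.connect("sqlite:///tweets.db")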
Example #5
def sentiment_processing(DataSet, senti_shape):
    split_article_content = []

    for element in DataSet['plot']:
        split_article_content.append(re.split("(?<=[.!?])\s+", element))

    sid = sia()
    senti_list = []

    for i in range(len(split_article_content)):
        words = split_article_content[i]
        sentiment_com, sentiment_pos, sentiment_neg, sentiment_neu = [], [], [], []
        script = []

        for word in words:  # each "word" here is actually one sentence of the plot
            ss = sid.polarity_scores(word)
            sentiment_com.append(ss['compound'])
            sentiment_pos.append(ss['pos'])
            sentiment_neg.append(ss['neg'])
            sentiment_neu.append(ss['neu'])
            script.append(word)

        percentile_list = pd.DataFrame({
            'sentiment_sc': sentiment_com,
            'sentiment_pos': sentiment_pos,
            'sentiment_neg': sentiment_neg,
            'sentiment_neu': sentiment_neu,
            'script': script
        })
        senti_list.append(percentile_list)

    sentiment_sc__ = []
    for i in range(len(senti_list)):
        temp = []

        for a in range(len(senti_list[i]["sentiment_sc"])):
            temp.append(senti_list[i]["sentiment_sc"][a])

        sentiment_sc__.append(temp)

    def pad(l, content, width):
        # left-pad the list `l` with `content` until its length reaches `width`
        zero_ = [content] * (width - len(l))
        zero_.extend(l)
        return zero_

    padding_ = []
    for i in range(len(senti_list)):
        padding_.append(pad(sentiment_sc__[i], 0, senti_shape))

    for i in range(0, len(padding_)):
        if len(padding_[i]) != senti_shape:
            print(len(padding_[i]))

    second_x = np.array(padding_)
    sentiment = second_x.reshape(len(padding_), senti_shape)

    Sentiment_DataSet = {'sentiment': sentiment}

    return Sentiment_DataSet
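A hypothetical call to make the shapes concrete (assumes pandas, numpy, re, and the sia alias are already imported):

# Illustrative only: two short "plots", per-sentence scores padded to length 8
demo = {'plot': ["What a great twist!", "Slow start. Strong finish."]}
out = sentiment_processing(demo, senti_shape=8)
print(out['sentiment'].shape)  # (2, 8)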
Example #6
def sentiment(in_doc):
    """
    Compute the VADER sentiment (compound) score for the input text.

    in_doc -- cleaned text
    """
    sent_ana = sia()
    ps = sent_ana.polarity_scores(in_doc)['compound']
    return ps
Example #7
    def __init__(self):
        self.dv = Doc2Vec.load("./models/doc2vec_model")
        self.tf = pickle.load(open("models/tfidf_model.pkl", "rb"))
        self.svd = pickle.load(open("models/svd_model.pkl", "rb"))
        self.svd_feature_matrix = pickle.load(
            open("models/lsa_embeddings.pkl", "rb"))
        self.doctovec_feature_matrix = pickle.load(
            open("models/doctovec_embeddings.pkl", "rb"))
        self.df = pd.read_pickle("perfume_data.pkl")
        self.hal = sia()
Example #8
def featurizer(texts, company_name='apple'):
    '''
    :param texts: news items as a list of strings
    :param company_name: company name as a string
    :return: featurized dictionary
    '''
    text = []
    for news in texts:
        text += word_tokenize(news)
    company_name = company_name.lower()
    increase = {
        'increase', 'up', 'rise', 'jump', 'rose', 'high', 'beating',
        'positive', 'gained', 'climbed', 'jumped', 'surged', 'rising',
        'increased', 'soared', 'surging', 'skyrocketed', 'climb', 'climbing',
        'gains', 'surge', 'grew', 'jumping'
    }
    decrease = {
        'decrease', 'down', 'fall', 'plunge', 'low', 'negative', 'fell',
        'lost', 'dropped', 'declined', 'tumbled', 'slipped', 'slumped',
        'dipped', 'plunged', 'falling', 'slid', 'plummeted', 'sank', 'decline',
        'dropping', 'tumbling'
    }
    # increase, decrease word mentioned in specific sentences or the entire text
    feature = {
        'mentioned vs not mentioned': 0,
        'company mentioned polarity scores': 0,
        'total polarity scores': 0,
        'word related to increase': 0,
        'word related to decrease': 0,
        'number of news': 0
    }
    sid = sia()
    mentioned_sentences = keyword_mentioned_sentence(text, company_name)
    mentioned_sentence_scores = [
        float(sid.polarity_scores(sent)['compound'])
        for sent in mentioned_sentences
    ]  # polarity score list for mentioned sentences.
    txt = [word.lower() for word in text]
    fd = nltk.FreqDist(txt)
    company_mentioned = fd[company_name]
    company_not_mentioned = len(sent_tokenize(
        ' '.join(txt))) - company_mentioned
    increase_related_freq = [fd[word] / fd.N() for word in increase]
    decrease_related_freq = [fd[word] / fd.N() for word in decrease]

    # placeholder value; the intended ratio company_mentioned/company_not_mentioned
    # is left disabled
    feature['mentioned vs not mentioned'] = 1
    feature['company mentioned polarity scores'] = (
        sum(mentioned_sentence_scores) / len(mentioned_sentences)
        if mentioned_sentences else 0.0
    )
    feature['total polarity scores'] = float(
        sid.polarity_scores(' '.join(text))['compound'])
    feature['word related to increase'] = sum(increase_related_freq)
    feature['word related to decrease'] = sum(decrease_related_freq)

    return feature
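featurizer calls keyword_mentioned_sentence, which the excerpt doesn't define. A plausible stand-in, an assumption rather than the original helper, returns the sentences that mention the keyword:

from nltk.tokenize import sent_tokenize

def keyword_mentioned_sentence(text, keyword):
    # `text` arrives as a flat token list in featurizer, so re-join it
    # before sentence-splitting
    sentences = sent_tokenize(' '.join(text))
    return [s for s in sentences if keyword in s.lower()]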
Example #9
def SA(text):
    score = sia().polarity_scores(text)
    print(score)
    neg = score['neg']
    pos = score['pos']
    if neg > pos:
        print("Negative sentiment")
    elif pos > neg:
        print("Positive sentiment")
    else:
        print("Neutral vibe")
Example #10
def adj_intensifier(tagged_text, adj_candid_index):
    intensifier_list = []
    pairlist = []
    sid = sia()
    for i in adj_candid_index:
        intensifier = tagged_text[i][0].strip('[\'\"]')
        word = tagged_text[i + 1][0].strip('[\'\"]')
        pair = word + ' ' + intensifier
        word_intensity = sid.polarity_scores(word)['compound']
        pair_intensity = sid.polarity_scores(pair)['compound']
        # If the condition below holds, the adjective in the current loop
        # iteration acts as an intensifier.
        if abs(word_intensity) > 0 and abs(word_intensity) < abs(pair_intensity) and\
                sign(word_intensity) == sign(pair_intensity):
            if tagged_text[i + 1][1] == 'NOUN':
                intensifier_list.append(intensifier.lower())
                pairlist.append((intensifier.lower(), word.lower()))

    return intensifier_list, pairlist
Example #11
def adv_intensifier(tagged_text, adv_candid_index):
    intensifier_list = []
    pairlist = []
    sid = sia()
    for i in adv_candid_index:
        intensifier = tagged_text[i][0].strip('[\'\"]')
        word_one = tagged_text[i - 1][0].strip('[\'\"]')
        word_two = tagged_text[i + 1][0].strip('[\'\"]')
        pair_one = word_one + ' ' + intensifier
        pair_two = intensifier + ' ' + word_two
        word_intensity = sid.polarity_scores(word_two)['compound']
        pair_intensity = sid.polarity_scores(pair_two)['compound']
        # If the condition below holds, the adverb in the current loop
        # iteration acts as an intensifier.
        if abs(word_intensity) > 0 and abs(word_intensity) < abs(pair_intensity) and\
            sign(word_intensity) == sign(pair_intensity):
            if tagged_text[i + 1][1] == 'VERB' or tagged_text[i + 1][1] == 'ADJ':
                intensifier_list.append(intensifier.lower())
                pairlist.append((intensifier.lower(), word_two.lower()))

    return intensifier_list, pairlist
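Both intensifier functions depend on a sign() helper that isn't shown. A minimal stand-in consistent with how it is used (comparing polarity directions) might be:

def sign(x):
    # returns 1 for positive, -1 for negative, 0 for zero
    return (x > 0) - (x < 0)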
Example #12
import nltk
from nltk.probability import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia
from glob import glob
import re
from string import punctuation as p
import numpy


def clean(words):
    # Strip metadata, tags, and headers: keep only text inside <p> tags
    clean_re = r'<p>(.*)'  # matches the text after each <p> tag; header text is excluded
    clean_text = re.findall(clean_re, words)
    join_clean = ' '.join(clean_text)  # findall returns a list of strings, so re-join them
    return join_clean

hal = sia()

with open('Mini-CORE/1+IN+EN+IN-IN-IN-IN+EN-EN-EN-EN+WIKI+9992596.txt', 'r') as my_file:  # noqa: E501
    text = my_file.read().lower()
    clean_text = clean(text)
    sentences = nltk.sent_tokenize(clean_text)
    tokens = nltk.word_tokenize(clean_text)
    tokens_fd = FreqDist(tokens)
    # print(tokens[0:5])
    # print(tokens_fd)
    # print(sentences[2])
    word_len = []
    for word in tokens:
        word_len.append(len(word))
    average_length = numpy.mean(word_len)
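The excerpt builds the analyzer hal and the sentence list but stops before scoring; a natural continuation (a sketch, not the original code) would be:

    # sketch: score each sentence and average the compound values
    sentence_scores = [hal.polarity_scores(s)['compound'] for s in sentences]
    average_sentiment = numpy.mean(sentence_scores)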
Example #13
# assumed setup (the excerpt starts mid-statement; sourceFileName is hypothetical):
count = countPos = countNeg = countNeu = 0
with open(sourceFileName, 'r',
          encoding='utf8') as csvFile, open(destinationFileName,
                                            'w',
                                            encoding='utf8') as reviewFile:
    line = csvFile.readline()
    line = csvFile.readline().replace('\n', '')
    header = 'tweets \n'
    reviewFile.write(header)
    while line != '':
        try:
            lineList = line.split(',')
            tweets = lineList[10]
            outputline = tweets + '\n'
            reviewFile.write(outputline)
            line = csvFile.readline().replace('\n', '')
            comment = tweets
            sid = sia()
            sentimentScores = sid.polarity_scores(comment)
            count += 1
        except SyntaxError:
            print('No review found')
        if sentimentScores['compound'] > 0:
            countPos += 1
        elif sentimentScores['compound'] < 0:
            countNeg += 1
        else:
            countNeu += 1
    print('count =', count)
    print('Positive = ', countPos)
    print('Negative = ', countNeg)
    print('Neutral = ', countNeu)
posPer = polarPecent(countPos, count)
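polarPecent is never defined in the excerpt (the spelling is the original's). From its call site it presumably converts a class count into a percentage; an assumed stand-in:

def polarPecent(subcount, total):
    # assumed helper: share of `subcount` in `total`, as a percentage
    return 100.0 * subcount / total if total else 0.0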
Example #14
    label.append('Topic {}'.format(i))
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=cc, cmap=cm, marker='o', s=100)
    h1, = plt.plot(1, 1, color=colors[i], linewidth=3)
    h.append(h1)
plt.legend(h, label, loc="upper left")
plt.show()
model = models.LdaModel(corpus, id2word=dictionary, num_topics=4)
model.print_topics(4)

### ACCUMULATE FEELINGS

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia

sentim = sia()

cc0 = []
for sentence in documents:
    cc0.append(sentim.polarity_scores(sentence))

# note: despite the names, this loop appends every score value to `neg`
# and every key name to `neu`
neu = []
neg = []
for sentence in documents:
    ss = sentim.polarity_scores(sentence)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
        neg.append(ss[k])
        neu.append(k)
    print()
    print('\n')
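A clearer per-key accumulation, as a sketch rather than the original code, avoids the misleading neg/neu names:

from collections import defaultdict

totals = defaultdict(list)  # e.g. totals['compound'] collects every compound score
for sentence in documents:
    for k, v in sentim.polarity_scores(sentence).items():
        totals[k].append(v)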
Example #15
# -*- coding: utf-8 -*-
"""
Created on Thu Dec  6 10:02:20 2018

@author: Ben
"""
import numpy as np
import matplotlib.pyplot as plt
from glob import glob

from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia

analyzer = sia()
files = glob('C:\\Users\\Ben\\Desktop\\F18_DIGHT360\\final-corpus\\*')
pscore = []
stars = []
rank = ['1', '2', '3', '4', '5']
xs = range(len(rank))
for name in files:
    with open(name, encoding='utf8') as reviews:
        for review in reviews:
            star = review[0]
            if star in rank:
                stars.append(star)
                polarity = analyzer.polarity_scores(review)
                pscore.append(polarity['compound'])
#                print('.', end=' ', flush=True)
            else:
                continue
index = np.arange(5)
print('done processing files')
Example #16
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia
import MySQLdb
import sys

exclude = [
    'Basilica', 'basilica', 'France', 'Paris', 'Montreal', ' de ', ' dans ',
    ' le '
]
analyzer = sia(
    lexicon_file='/var/www/html/cse30246/twinsight/project/vader_lexicon.txt')

db = MySQLdb.connect("localhost", "mstaines", "#1Smarty", "twinsight")

#Instantiate cursor
c = db.cursor()
w = db.cursor()
#Run query, return 1 result
c.execute("""SELECT tweetID, text FROM tweets2""")

count = 0
row = c.fetchone()
while row:
    sys.stdout.write("Row " + str(count) + " of less than 140000\r")
    text = row[1]
    if not any(ex in text for ex in exclude):
        sent = analyzer.polarity_scores(text)
        w.execute("""UPDATE tweets2 SET sentiment = %s WHERE tweetID = %s""",
                  (sent['compound'], row[0]))
    # assumed continuation: advance the counter and fetch the next row
    count += 1
    row = c.fetchone()
Example #17
# These bits of data will be in this format ---- ['sentiment score of paragraph', paragraph number]
data_2018 = []
data_2017 = []
current_sentiment = []
all_sentiment = []
# Reset this variable every time a letter is done
paragraph_count = 1

# Loop through the paragraphs in 2017
for paragraph in letter2017.split("\n"):
    if paragraph == '':
        continue
    all_sentiment = []
    # Loop through the words in the paragraph
    for word in paragraph.split():
        current_sentiment = sia().polarity_scores(word)
        all_sentiment.append(current_sentiment['compound'])
    # add this paragraph's mean score to the 2017 data
    data_2017.append([statistics.mean(all_sentiment), paragraph_count])
    # This keeps track of where you are in the letter
    paragraph_count += 1
    print(paragraph)
# create a dataframe so we can plot the data later on
df_2017 = pd.DataFrame(data_2017, columns=['Sentiment', 'Paragraph'])
paragraph_count = 1

# Loop through the paragraphs in 2018
for paragraph in letter2018.split("\n"):
    if paragraph == '':
        continue
    all_sentiment = []
    # assumed continuation, mirroring the 2017 loop above:
    for word in paragraph.split():
        current_sentiment = sia().polarity_scores(word)
        all_sentiment.append(current_sentiment['compound'])
    data_2018.append([statistics.mean(all_sentiment), paragraph_count])
    paragraph_count += 1
text

chars_to_remove = ["\t", "\n"]
sc = set(chars_to_remove)
text = ''.join([c for c in text if c not in sc])
text

sentences = sent_tokenize(text)
sentences2 = sentences
sentences2

tokens = word_tokenize(text)
tokens

from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia
sentim = sia()

cc = []
for sentence in sentences2:
    cc.append(sentim.polarity_scores(sentence))
len(cc)
len(sentences2)
cc[0]

neu = []
neg = []
for sentence in sentences2:
    ss = sentim.polarity_scores(sentence)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
        neg.append(ss[k])