def main():
    start = time.time()

    # If output file does not exist, create it and provide it with the column names
    if not os.path.exists(file_output):
        with open(file_output, mode="w", newline='') as output:
            writer = csv.writer(output)
            writer.writerow(new_column_names)

    # Score each show title once and record whether it is sentiment-neutral
    neutral_title_mapping = {}
    sia = SIA()
    with open('identifiers.csv', mode='r') as show_file:
        reader = csv.reader(show_file)
        for row in reader:
            show = row[0]
            pol_score = sia.polarity_scores(show)
            # A title is neutral when VADER assigns all of its weight to 'neu'
            neutral_title_mapping[show] = pol_score['neu'] == 1.0

    # Read the input file in 10,000-record chunks; for each tweet, calculate
    # the sentiment and append the results to the output file
    with open(file_input, encoding="utf8") as csv_file, \
            open(file_output, 'a', newline='') as output:
        writer = csv.writer(output)
        for data_chunk in pd.read_csv(csv_file, chunksize=10000):
            for record in data_chunk.itertuples(index=True, name='Pandas'):
                show = getattr(record, 'category')
                text = getattr(record, 'text')

                # Mask non-neutral titles so the title's own sentiment does
                # not leak into the tweet's score
                if not neutral_title_mapping[show]:
                    text = re.sub(re.escape(show), 'PLACEHOLDER', text,
                                  flags=re.IGNORECASE)

                results = sia.polarity_scores(text)
                columns = [
                    show,
                    getattr(record, 'status_id'),
                    getattr(record, 'in_reply_to_id'),
                    getattr(record, 'user_id'),
                    getattr(record, 'text'),
                    getattr(record, 'language'),
                    getattr(record, 'created_at'),
                    getattr(record, 'location'),
                    getattr(record, 'verified'), "Strong"
                ]
                writer.writerow(columns + [
                    results['neg'], results['neu'], results['pos'],
                    results['compound']
                ])

    end = time.time()
    print("Finished in %d seconds" % (end - start))
Example #2
def vader(sentence):
    # VADER (Valence Aware Dictionary and sEntiment Reasoner) - https://github.com/cjhutto/vaderSentiment
    # Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media
    # Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
    sid = SIA()
    ss = sid.polarity_scores(sentence)
    return [ss['neg'], ss['neu'], ss['pos'], ss['compound']]
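
A quick check of the return shape, assuming the snippet's SIA import is in scope; the sample sentence and its approximate scores come from the vaderSentiment README:

# vader() returns [neg, neu, pos, compound]; neg + neu + pos sums to ~1.0
print(vader("VADER is smart, handsome, and funny."))
# [0.0, 0.254, 0.746, 0.8316]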
Example #3
    def sentiment_analyser(self, search_query):
        """
        Runs a Google News Search on the input string and then uses VADER sentiment analysis engine on each returned headline.
        Input: Search Query String
        Output: DataFrame with compound sentiment score for each news article
        """
        # Create a Covid-19 News DataFrame for each organization of interest
        news_df = self.covid19_news_scraper(search_query)
        # Initialize VADER Sentiment Intensity Analyzer 
        sia = SIA()
        results = []

        # Calculate the polarity score for each headline associated with the organization
        for row in news_df['Headline']:
            pol_score = sia.polarity_scores(row)
            pol_score['Headline'] = row
            results.append(pol_score)
        
        # Create the Sentiment DataFrame
        sent_df = pd.DataFrame.from_records(results)
        # Merge the two dataframes together on the 'Headline' column
        merge_df = news_df.merge(sent_df, on='Headline')
        # Re-order and Rename the columns
        merge_df = merge_df.rename(columns={'compound':'VADER Score'})
        col_order = ['Client','Date','Headline','Source','VADER Score','neg','neu','pos']
        print('Completed processing %s ...' % search_query)
        return merge_df[col_order]
Example #4
def get_sentimentSubreddit(subreddit):

    subreddit = reddit.subreddit(subreddit)
    hot_posts = subreddit.hot(limit=None)
    headlines = set()

    for submission in hot_posts:
        headlines.add(submission.title)

    sia = SIA()
    results = []

    for line in headlines:
        pol_score = sia.polarity_scores(line)
        pol_score['headline'] = line
        results.append(pol_score)

    df = pd.DataFrame.from_records(results)

    df['label'] = 0
    df.loc[df['compound'] > 0.2, 'label'] = 1
    df.loc[df['compound'] < -0.2, 'label'] = -1

    # print(df.head(5))
    return df
Example #5
def Attribute(textList):
    attJson = readJson('./attribute.json')
    attDict = {}     # attribute -> number of matching texts
    resultDict = {}  # attribute -> summed polarity scores plus a 'Total' count
    sia = SIA()
    for text in textList:
        for attribute in attJson:
            # attJson maps each attribute to a list of keyword strings
            if any(keyword in text for keyword in attJson[attribute]):
                scores = sia.polarity_scores(text)
                if attribute in resultDict:
                    for key in scores:
                        resultDict[attribute][key] += scores[key]
                    resultDict[attribute]['Total'] += 1
                else:
                    resultDict[attribute] = scores
                    resultDict[attribute]['Total'] = 1
                attDict[attribute] = attDict.get(attribute, 0) + 1
    # Convert the summed neg/neu/pos scores into per-text averages
    for attribute in resultDict:
        for key in ('neg', 'neu', 'pos'):
            resultDict[attribute][key] /= resultDict[attribute]['Total']
    return attDict, resultDict
Example #6
def output_csv_files(output_dfs, output_df_strs):
    assert len(output_dfs) == len(output_df_strs)
    sia = SIA()
    preprocsplit = lambda rev: preprocess_string(str(rev))
    for output_df, stem_name in zip(output_dfs, output_df_strs):
        output_df[2] = output_df[1].astype(str).apply(preprocsplit)
        # Keep only reviews whose preprocessed tokens mention professionalism
        relevant = [row for _, row in output_df.iterrows()
                    if 'unprofession' in list(row[2]) or 'profession' in list(row[2])]
        new_output_df = pd.DataFrame(relevant)
        # A review is misclassified when its token label disagrees with the
        # sign of its VADER compound score (DataFrame.append was removed in
        # pandas 2.0, so rows are collected in lists instead)
        misclassified = [
            row for _, row in new_output_df.iterrows()
            if ('unprofession' in list(row[2])
                and sia.polarity_scores(row[0])['compound'] > 0.0)
            or ('profession' in list(row[2])
                and sia.polarity_scores(row[0])['compound'] < 0.0)
        ]
        misclassified_df = pd.DataFrame(misclassified)
        f_name = 'misclassified_' + stem_name + '.csv'
        print(f_name)
        misclassified_df.to_csv(f_name)
Example #7
def reddit_analysis():
    # nltk.download('vader_lexicon')
    from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

    # Credentials redacted; supply your own Reddit API keys
    reddit = praw.Reddit(client_id='YOUR_CLIENT_ID',
                         client_secret='YOUR_CLIENT_SECRET',
                         user_agent='cyberdhirendra')
    headlines = set()
    for submission in reddit.subreddit('Amazon').new(limit=None):
        headlines.add(submission.title)
        display.clear_output()

    sia = SIA()
    results = []

    for line in headlines:
        pol_score = sia.polarity_scores(line)
        pol_score['headline'] = line
        results.append(pol_score)

    dtf = pd.DataFrame.from_records(results)
    dtf['label'] = 0
    dtf.loc[dtf['compound'] > 0.2, 'label'] = 1
    dtf.loc[dtf['compound'] < -0.2, 'label'] = -1
    counts = dtf.label.value_counts(normalize=True) * 100
    return counts[-1]
Example #8
    def derive_columns(data_frame):  # based on cols from data after cleaning

        data_frame['Feedback'] = data_frame['Positive Feedback'].map(
            str) + data_frame['Negative Feedback'].map(str)
        print('Finding mentioned sites and brands...')
        data_frame['Sites'] = data_frame.apply(mentioned_site, axis=1)
        data_frame['Sites2'] = data_frame.apply(convert_to_list,
                                                axis=1)  # TEMPORARY COLUMN
        data_frame['Brands'] = data_frame.apply(mentioned_brand, axis=1)
        print('Categorizing feedback into issue types...')
        data_frame = data_frame.merge(data_frame['Feedback'].apply(lambda s: pd.Series(
            {'Issues': [k for k, v in WTI.items() if v.search(s)]})),
                                      left_index=True,
                                      right_index=True)
        print('Categorizing feedback into component types...')
        data_frame = data_frame.merge(data_frame['Feedback'].apply(lambda s: pd.Series(
            {'Components': [k for k, v in WTC.items() if v.search(s)]})),
                                      left_index=True,
                                      right_index=True)

        sid = SIA()
        print('Applying sentiment analysis to feedback...')
        data_frame = data_frame.merge(data_frame['Feedback'].apply(
            lambda s: pd.Series({'compound': evalSentences(sid, s)[0]})),
                                      left_index=True,
                                      right_index=True)

        return data_frame
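
derive_columns assumes WTI and WTC are module-level dicts mapping an issue or component name to a compiled regex, plus an external evalSentences helper. A hypothetical sketch of the regex dicts' shape:

import re

# Hypothetical patterns; the real WTI/WTC live elsewhere in the module
WTI = {
    'Crash': re.compile(r'\bcrash(es|ed|ing)?\b', re.IGNORECASE),
    'Slow': re.compile(r'\b(slow|lag(gy|ging)?)\b', re.IGNORECASE),
}
WTC = {
    'Checkout': re.compile(r'\b(cart|checkout|payment)\b', re.IGNORECASE),
}

# Each feedback string maps to every key whose regex matches it
print([k for k, v in WTI.items() if v.search('App crashes at checkout')])
# ['Crash']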
Example #9
    def sent_n(type_t):
        # Collect the "Negative" interviews for this interview type

        ter = pd.DataFrame()

        ter["int"] = type_df[type_t]["Interview"]
        ter["Interview Date"] = type_df[type_t]["Interview Date"]

        ter["positive"] = 0
        ter["negative"] = 0
        ter["easy"] = 0
        ter["neutral"] = 0
        ter["difficult"] = 0
        ter["compound"] = 0.0

        # Drop the last five rows, then score each interview
        ter = ter.iloc[:-5, :].reset_index(drop=True)
        analyzer = SIA()

        for row, sentence in enumerate(ter["int"]):
            vs = analyzer.polarity_scores(sentence)
            # .loc avoids pandas chained-assignment warnings
            ter.loc[row, "compound"] = float(vs["compound"])
            if vs["compound"] < 0.2:
                ter.loc[row, "negative"] = 1
            elif vs["compound"] > 0.5:
                ter.loc[row, "positive"] = 1
            else:
                ter.loc[row, "neutral"] = 1
                # print("{:-<65} {}".format(sentence, str(vs)))
                # print("{:-<65} {}".format(sentence, str(vs)))

        bad = ter[ter["negative"] == 1]
        bad = bad.sort_values("Interview Date", ascending=False)

        return bad
Example #10
def acquire_feed_data(feed_name, url, hash_key_array):
    ''' This method acquires data from an RSS URL link. New items are inserted into the database '''
    # get the feed data from the url
    feed = feedparser.parse(url)

    for post in feed.entries:
        # Skip posts already in the database; get post attributes
        # (title, pubdate, description, etc.) from the news feed
        title = post.title  #.encode('utf-8')
        comment = "COMMENTS ARE SUPPRESSED"  #post.comments
        link = post.link  #.encode('utf-8')
        pubdate = str(parse(post.published).strftime('%Y-%m-%d %H:%M:%S'))
        description = str(post.description.encode("utf-8"))
        hash_key = hashlib.md5(''.join((title, pubdate)).encode()).hexdigest()

        if hash_key not in hash_key_array:
            # Initialize the sentiment analyzer and score the news title
            sia = SIA()
            sentimental_score = sia.polarity_scores(title)
            pos_score = str(sentimental_score["pos"])
            neg_score = str(sentimental_score["neg"])

            # Prepare the row to be inserted into the DB
            row = (u' '.join(
                (hash_key, "|", feed_name, "|", title, "|", link, "|", comment,
                 "|", description, "|", pubdate, "|", pos_score, "|",
                 neg_score, "\n"))).encode('utf-8')
            #insert row into database
            log_news_in_db(hash_key, feed_name, title.replace("'", ""),
                           link.replace("'", ""), comment.replace("'", ""),
                           description, pubdate, pos_score, neg_score)
Example #11
    def __init__(self):

        self._sent_analyzer = SIA()
        self._word_tokenizer = WordPunctTokenizer().tokenize
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle').tokenize
        self._ids = []
Example #12
def get_rss(ticker):
    feed = f'https://www.nasdaq.com/feed/rssoutbound?symbol={ticker}'

    rss = feedparser.parse(feed)

    articles = rss.entries

    summaries = list()
    for article in articles:
        entry = dict()
        text = strip_html(article['summary']).replace('\n', '')
        if len(text) > 65:
            entry['text'] = text
            entry['time'] = article['published']
            entry['symbols'] = article['nasdaq_tickers'].split(',')
            summaries.append(entry)

    sia = SIA()
    sentiments = list()
    for summary in summaries:
        score = sia.polarity_scores(summary['text'])
        score['text'] = summary['text']
        time = ' '.join(summary['time'].split(' ')[1:4])
        score['time'] = datetime.strptime(time, '%d %b %Y')
        score['tickers'] = summary['symbols']
        sentiments.append(score)

    sentiment_df = pd.DataFrame.from_records(sentiments)
    return sentiment_df
Example #13
def ColumnScoring(column):
    # Share one analyzer across rows instead of constructing one per entry
    sia = SIA()
    sentiments = []
    for entry in column:
        sentiments.append(sia.polarity_scores(entry)['pos'])
    return sentiments
Example #14
    def get_sentiment_score(self, comment_text):
        # Use sentiment intensity analyzer and store value in sentiment_score
        sia = SIA()

        sentiment_score = sia.polarity_scores(comment_text)['compound']

        return sentiment_score
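
Compound scores run from -1 (most negative) to +1 (most positive); a quick sanity check, assuming an instance of the enclosing class named scorer (hypothetical):

print(scorer.get_sentiment_score("This update is fantastic!"))   # positive, roughly 0.55
print(scorer.get_sentiment_score("The update broke everything."))  # negative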
Example #15
def main():
    test_df = load_test_data()

    test_data = test_df['HEADLINE'].tolist()
    check_test = test_df['SENTIMENT'].tolist()

    #nltk.download('vader_lexicon')
    sia = SIA()
    results = []

    for line in test_data:
        pol_score = sia.polarity_scores(line)
        results.append(pol_score)

    nltk_pred = []
    for line in results:
        if line['compound'] > 0:
            nltk_pred.append(1)
        else:
            nltk_pred.append(-1)

    print('NLTK F1 Score:',
          round(f1_score(check_test, nltk_pred, average='micro'), 3))

    final1 = nltk_pred
    correct = 0
    total = len(final1)
    for i in range(len(final1)):
        if final1[i] == check_test[i]:
            correct += 1

    print('Acc:', correct / total)
Example #16
def analyze_sentiment(comment_body):
    """
    Calculate the average sentiment of one comment's body text.

    The function calculates the compound polarity score using VADER for each
    sentence in the comment text and finds the average score for the comment.

    Args:
        comment_body: List of strings representing the text of the comment.

    Returns:
        A float representing the average compound polarity score for the
        comment.
    """
    sia = SIA()
    results = []

    for sentence in comment_body:
        pol_score = sia.polarity_scores(sentence)
        results.append(pol_score['compound'])

    try:
        return sum(results) / len(results)
    except ZeroDivisionError:
        return 0
Example #17
def listToSentiment(theList):
    sia = SIA()
    sentimentList = []
    for text in theList:
        pol_score = sia.polarity_scores(text)
        sentimentList.append(pol_score["compound"])
    return sentimentList
Example #18
def get_sentiment_of_sentence(sentence):
    # Run sentiment analysis; the input can be one or more sentences
    sia = SIA()
    sentiment_scores = sia.polarity_scores(sentence.lower())
    # The surrounding app caches this score in MongoDB and returns the stored
    # value when present; that lookup is omitted here (a sketch follows below)
    return sentiment_scores["compound"]  # only get the compound score
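
A minimal sketch of the MongoDB cache described in the comment, assuming a pymongo collection named scores (the database and collection names are hypothetical):

from pymongo import MongoClient
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

scores = MongoClient()['sentiment']['scores']  # hypothetical names

def get_sentiment_cached(sentence):
    # Return the stored compound score if this sentence was already analyzed
    cached = scores.find_one({'sentence': sentence})
    if cached is not None:
        return cached['compound']
    compound = SIA().polarity_scores(sentence.lower())['compound']
    scores.insert_one({'sentence': sentence, 'compound': compound})
    return compound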
Example #19
def ticker_sentiment(ticker_name):
    sent_dic = {}
    sia = SIA()
    results = []

    ticker = yf.Ticker(ticker_name)
    tickername = ticker.info['longName']

    page = requests.get('https://news.google.com/rss/search?q=' + tickername)
    tree = html.fromstring(page.content) 

    date = tree.xpath('//pubdate/text()') 
    title = tree.xpath('//title/text()') 

    for line in title:
        pol_score = sia.polarity_scores(line)
        pol_score['headline'] = line
        results.append(pol_score)

    sent_df = pd.DataFrame.from_records(results)
    sent_df['polarity_score'] = sent_df['headline'].apply(lambda tweet: TextBlob(tweet).sentiment)
    sent_dic['Negative Score'] = sent_df.sum(axis = 0, skipna = True)['neg']
    sent_dic['Positive Score'] = sent_df.sum(axis = 0, skipna = True)['pos']

    return jsonify(sent_dic)
Example #20
def ingest_news(insource, inlink, inheader):
    '''
    Ingest news articles into DB

    parameter(s): news article source name (str), news site URL (str), HTML header type for article titles (str)

    returns: void.

    '''
    conn = sqlite3.connect('../data/sentiment.db')
    cursor = conn.cursor()

    #Setup sentiment capture.
    sia = SIA()
    senti = []

    #Call scraper.
    all_articles = get_news(inlink, inheader)

    #Insert scraped data and associated sentiment.
    for article in all_articles:
        score = sia.polarity_scores(article)
        cursor.execute("INSERT INTO Generalnews(source, title, date, sentiment) VALUES(?,?,?,?)", (insource, article, timestamp, float(score.get('compound'))))


    #Commit db changes and close db.
    conn.commit()
    conn.close()
Example #21
def polarity(row):
    # Score one row with a locally bundled copy of the VADER lexicon
    vader_path = (os.getcwd() +
                  '/nltk_data/sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt')
    sia = SIA(lexicon_file=vader_path)
    return sia.polarity_scores(row)['compound']
Example #22
def get_sentiment_analysis(cursor: list):
    if not cursor:
        # Keep the (breakdown, overall) return shape even for empty input
        return [], 0

    sia = SIA()
    positive = 0
    negative = 0
    neutral = 0
    overall = 0

    for tweet in cursor:
        res = sia.polarity_scores(tweet["semi_processed_text"])

        if res["compound"] < -0.25:
            negative += 1
        elif res["compound"] > 0.25:
            positive += 1
        else:
            neutral += 1

    # Majority vote; ties break toward positive first, then neutral
    if positive >= negative and positive >= neutral:
        overall = 1
    elif neutral >= positive and neutral >= negative:
        overall = 0
    else:
        overall = -1

    return [("Positive", positive * 100 / len(cursor)),
            ("Neutral", neutral * 100 / len(cursor)),
            ("Negative", negative * 100 / len(cursor))], overall
Example #23
def analyze_sentiment(tweet):
    '''Classify the polarity of a tweet with VADER plus a custom domain lexicon'''
    tweet = tweet.replace('#', '')
    tweet = tweet.replace('-', ' ')
    negative_words = [
        'genocide', 'apartheid', 'warcrimes', 'occupier', 'occupied',
        'occupation', 'boycott', 'freepalestine', 'jewhate', 'suppression',
        'oppression', 'settler', 'settlers', 'settlements', 'settlement',
        'boycottisrael', 'justice', 'infiltrated', 'justice4palestine',
        'stopisraeliapartheid', 'robbers', 'stole', 'stoparmingsrael',
        'preach', 'sanction', 'sanctions'
    ]
    positive_words = [
        'loveisrael', 'supportisrael', 'rockets', 'rocket', 'innovation',
        'startupnation', 'support', 'aid'
    ]
    sid = SIA()
    for word in negative_words:
        if word == 'genocide' or word == 'apartheid':
            sid.lexicon[word] = -2
        else:
            sid.lexicon[word] = -1
    for word in positive_words:
        sid.lexicon[word] = 1
    ss = sid.polarity_scores(tweet.lower())
    # taking the polarity
    return ss['compound']
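
Seeding sid.lexicon this way changes scores immediately; a small before/after sketch (whether 'boycott' appears in the stock lexicon depends on the VADER version, hence the hedged comments):

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

plain = SIA()
tuned = SIA()
tuned.lexicon['boycott'] = -1  # same idea as the word lists above

text = 'boycott announced today'
print(plain.polarity_scores(text)['compound'])  # likely ~0.0 if the word is unknown
print(tuned.polarity_scores(text)['compound'])  # negative once the entry exists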
Example #24
    def get_sentiment(self, col="processed_text"):
        """Find the compound sentiment score for each row of a text column"""
        sia = SIA()
        self.df["score"] = self.df[col].apply(
            lambda x: sia.polarity_scores(str(x)))
        self.df["score"] = self.df["score"].apply(lambda x: x["compound"])
        return self
Example #25
def get_sentiment(formatted_text):
    # Extend the stock VADER lexicon with crypto-specific terms; the context
    # manager ensures the JSON file is closed after loading
    with open("../SentimentTrader/Words/crypto_lexicon.json") as f:
        CRYPTO_LEXICON = json.load(f)
    sia = SIA()
    sia.lexicon.update(CRYPTO_LEXICON)

    return sia.polarity_scores(formatted_text)
Example #26
def process_tweets_from_file(fin, fout):
    valid_count = 0
    invalid_count = 0
    sia = SIA()

    list_of_tweets = []
    with open(fin) as f:
        for line in f:
            j = json.loads(line)
            try:
                # validate date format
                created_at = datetime.strptime(j['created_at'],
                                               '%Y-%m-%dT%H:%M:%S')

                # text formatting
                formatted_text = format.format_full(j['text'])
                j['formatted_text'] = formatted_text

                # sentiment
                sent = sia.polarity_scores(formatted_text)
                j['sentiment'] = sent

                list_of_tweets.append(json.dumps(j))
                valid_count += 1
                if (valid_count % 25000 == 0):
                    print(valid_count)
            except ValueError:
                invalid_count += 1
                if (invalid_count % 100 == 0):
                    # created_at may be unbound when strptime fails, so only
                    # report the raw field
                    print('Invalid:', j['created_at'])
                continue

    with open(fout, 'w') as f:
        for tweet in list_of_tweets:
            f.write(tweet + '\n')
Example #27
def process_tweets_from_file(fin, fout):
    valid_count = 0
    invalid_count = 0
    sia = SIA()
    list_of_tweets = []
    
    # Build a date -> closing-price lookup from the CoinDesk BPI export
    reader = csv.reader(open('/home/amit/Desktop/Sentiment_analysis/Bitcoin-price-prediction-model-master/coindesk-bpi-USD-close_data-2010-07-18_2018-06-07.csv', 'r'))
    d = {}
    for row in reader:
        k, v = row
        k = datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S').date()
        d[k] = v
        
    with open(fin) as f:
        for line in f:
            j = json.loads(line)
            try:
                # validate date format
                created_at = datetime.datetime.strptime(j['created_at'], '%Y-%m-%dT%H:%M:%S')
                this_date = created_at.date()
                next_date = this_date + datetime.timedelta(days=1)

                # Label each tweet with the next day's closing price, falling
                # back to the same day when tomorrow's close does not exist yet
                if next_date > datetime.datetime.today().date():
                    j['price'] = d[this_date]
                else:
                    j['price'] = d[next_date]

                # text formatting
                formatted_text = format.format_full(j['text'])
                j['formatted_text'] = formatted_text

                # sentiment
                sent = sia.polarity_scores(formatted_text)
                j['sentiment'] = sent

                list_of_tweets.append(json.dumps(j))
                valid_count += 1
                if (valid_count % 25000 == 0):
                    print(valid_count)
            except ValueError:
                invalid_count += 1
                if (invalid_count % 100 == 0):
                    print('Invalid:', j['created_at'])
                continue

            
    
    with open(fout, 'w') as f:
        for tweet in list_of_tweets:
            f.write(tweet+'\n')
    
    print("Successfully processed", len(list_of_tweets), "tweets")
Example #28
def sentiment_analysis(text, full_score=False):
    sia = SIA()
    score = sia.polarity_scores(text)
    if full_score:
        return score
    else:
        # Squash into (0, 1): high neutrality pushes the value toward 1, a
        # strong positive/negative imbalance pushes it toward 0
        x = score['neu'] * 5 - abs(score['neg'] - score['pos'])
        return 1 / (1 + math.exp(-x))
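
A worked check of the squash: a fully neutral text has neu = 1 and neg = pos = 0, so x = 1 * 5 - |0 - 0| = 5 and 1 / (1 + e^-5) ≈ 0.993, while a maximally one-sided text (neu = 0, |neg - pos| = 1) gives x = -1 and ≈ 0.269:

import math

score = {'neg': 0.0, 'neu': 1.0, 'pos': 0.0}  # perfectly neutral dict
x = score['neu'] * 5 - abs(score['neg'] - score['pos'])
print(round(1 / (1 + math.exp(-x)), 3))  # 0.993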
Example #29
def nltk_sentiment(text):
    """
    Run VADER sentiment analysis on a string of text and return its polarity scores.

    pol_score is a dict with neg, neu, pos, and compound scores.
    """
    pol_score = SIA().polarity_scores(text)
    return pol_score
Example #30
    def __init__(self, filePath):
        # Inherits from HTMLParser; initialize parser state and load the data
        self.filePath = filePath
        self.dataFrame = pd.read_excel(self.filePath)
        self.stem = SnowballStemmer("english")
        self.sia = SIA()
        self.strict = False
        self.convert_charrefs = True
        self.fed = []