Example #1
import pysentiment as ps

def get_score_LM(html):
    """
    Uses the Loughran-McDonald dictionary for sentiment analysis.
    """
    lm = ps.LM()
    tokens = lm.tokenize(html)
    score = lm.get_score(tokens)
    return score
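A minimal usage sketch for the function above (the sample string is hypothetical). In pysentiment, get_score returns a dict with 'Positive', 'Negative', 'Polarity' and 'Subjectivity' entries, which the later examples index into:

score = get_score_LM("Profits rose sharply and the outlook improved.")
print(score['Polarity'])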
Example #2
import pysentiment as ps

def get_score_LM(html):
    """
    Uses the Loughran-McDonald dictionary for sentiment analysis.
    """
    # print "getting sentiment"
    lm = ps.LM()
    tokens = lm.tokenize(html)
    # print tokens
    score = lm.get_score(tokens)
    # print "returning sentiment"
    return score
Example #3
def Sentiment(a):
    import pysentiment as ps
    lm = ps.LM()
    tokens = lm.tokenize(a)
    polarity = lm.get_score(tokens)['Polarity']
    if polarity > 0:
        return "Positive"
    elif polarity == 0:
        return "Neutral"
    else:
        return "Negative"
Example #4
import pysentiment as ps

def Sentiment(a):
    try:
        lm = ps.LM()
        tokens = lm.tokenize(a)
        polarity = lm.get_score(tokens)['Polarity']
        if polarity > 0:
            return "Positive"
        elif polarity == 0:
            return "Neutral"
        else:
            return "Negative"
    except UnicodeDecodeError:
        # Undecodable input is scored as neutral rather than crashing the caller.
        return "Neutral"
Example #5
def get_score_LM(html):
    """
    Uses the Loughran-McDonald dictionary for sentiment analysis.
    """
    lm = ps.LM()
    tokens = lm.tokenize(html)
    tlock.acquire()
    print("num tokens is: " + str(len(tokens)))
    tlock.release()
    score = lm.get_score(tokens)
    if debugger:
        tlock.acquire()
        print("getting sentiment")
        print(tokens)
        tlock.release()
    return score
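Example #5 relies on a module-level lock and debug flag that the snippet omits; a minimal setup sketch, assuming a plain threading.Lock and a boolean toggle:

import threading
import pysentiment as ps

tlock = threading.Lock()  # serialises prints across worker threads (assumed usage)
debugger = False          # set True to dump the token list for inspection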
Example #6
# coding: utf-8

import pysentiment as ps
from collections import Counter
import re
import random
from operator import truediv
import matplotlib.pyplot as plt
import operator
import pandas as pd
import datetime
from datetime import timedelta
lm = ps.LM()


#lm = ps.HIV4()
def generateDate(initialYear, initialMonth, initialDay, endY, endM, endD):
    initial = datetime.datetime(initialYear, initialMonth, initialDay)
    duration = (datetime.datetime(endY, endM, endD) - initial).days
    dates = [initial]
    for i in range(1, duration + 1):
        dates.append(initial + datetime.timedelta(days=i))
    return dates
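For example, a hypothetical three-day range:

generateDate(2020, 1, 1, 2020, 1, 3)
# -> [datetime.datetime(2020, 1, 1, 0, 0), datetime.datetime(2020, 1, 2, 0, 0),
#     datetime.datetime(2020, 1, 3, 0, 0)]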
import graphlab as gl
import pysentiment as py
from nltk.tokenize import sent_tokenize
from textstat.textstat import textstat
from gensim import corpora, models
import data_clean as dc
import numpy as np

#Loading data into SFrame
df = pd.read_csv('key_dev_news.txt', sep='\t', encoding='latin-1')
sf = gl.SFrame(data=df)

#Loading LDA model for topic modeling and pysentiment module for financial sentiment analysis
lm = py.LM()
lda = models.ldamodel.LdaModel.load('lda1.model')

#Building the LDA model using news articles
sf['tokens'] = sf['content'].apply(lambda x: dc.tokenize_doc(x, 'STEM'))
tokens_text = [
    unicode('|'.join(i), errors='replace').split('|') for i in sf['tokens']
]
dictionary = corpora.Dictionary(tokens_text)
corpus = [dictionary.doc2bow(text) for text in tokens_text]
ldamat = lda[corpus]

#Building LDA topic arrays per topic
topic_arrays = np.zeros((30, len(ldamat)))
for i, x in enumerate(ldamat):
    for topic_no, contrib in x:
        topic_arrays[topic_no, i] = contrib

def predict_relevance(df):

    #Loading data into SFrame
    df = df.astype(str)
    tf = gl.SFrame(data=df)
    tf = tf.unique()

    #Loading LDA model for topic modeling, pysentiment module for financial sentiment analysis and the relevance prediction model
    lda = models.ldamodel.LdaModel.load('lda1.model')
    lm = py.LM()
    model = gl.load_model('relevance_model_64feat')

    #Building the LDA model using news articles
    tf['tokens'] = tf['content'].apply(lambda x: dc.tokenize_doc(x, 'STEM'))
    tokens_text = [
        unicode('|'.join(i), errors='replace').split('|') for i in tf['tokens']
    ]
    dictionary = corpora.Dictionary(tokens_text)
    corpus = [dictionary.doc2bow(text) for text in tokens_text]
    ldamat = lda[corpus]

    #Building LDA topic arrays per topic
    topic_arrays = np.zeros((30, len(ldamat)))
    for i, x in enumerate(ldamat):
        for topic_no, contrib in x:
            topic_arrays[topic_no, i] = contrib

    #Adding LDA topic arrays as feature columns as 'Tx'
    for i, x in enumerate(topic_arrays):
        tf['T' + str(i)] = gl.SArray(data=x, dtype=float)

    #Polarity feature extraction from content of news articles
    tf['Polarity_text'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Polarity'])
    tf['Subjectivity_text'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Subjectivity'])
    tf['Positive_text_wc'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Positive'])
    tf['Negative_text_wc'] = tf['content'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Negative'])
    tf['Total_text_wc'] = tf['content'].apply(lambda x: len(lm.tokenize(x)))
    tf['Negative_text_rate'] = tf['Negative_text_wc'] / tf['Total_text_wc']
    tf['Positive_text_rate'] = tf['Positive_text_wc'] / tf['Total_text_wc']
    tf['Max_Polarity'] = tf['content'].apply(lambda x: max(
        [lm.get_score(lm.tokenize(y))['Polarity'] for y in sent_tokenize(x)]))
    tf['Min_Polarity'] = tf['content'].apply(lambda x: min(
        [lm.get_score(lm.tokenize(y))['Polarity'] for y in sent_tokenize(x)]))
    tf['Sentences_wc'] = tf['content'].apply(lambda x: len(sent_tokenize(x)))
    tf['Positive_sentrate'] = tf['Positive_text_wc'] / tf['Sentences_wc']
    tf['Negative_sentrate'] = tf['Negative_text_wc'] / tf['Sentences_wc']

    #Readability feature extraction from content of news articles
    tf['FRE_text'] = tf['content'].apply(
        lambda x: textstat.flesch_reading_ease(x))
    tf['FRE_tagged_text'] = tf['FRE_text'].apply(
        lambda x: 1 if x < 100 and x >= 90 else 2 if x < 90 and x >= 80 else 3
        if x < 80 and x >= 70 else 4 if x < 70 and x >= 60 else 5
        if x < 60 and x >= 50 else 6 if x < 50 and x >= 30 else 7)
    tf['FK_text'] = tf['content'].apply(
        lambda x: int(textstat.flesch_kincaid_grade(x)))
    tf['GFI_text'] = tf['content'].apply(lambda x: textstat.gunning_fog(x))
    tf['SMI_text'] = tf['content'].apply(lambda x: textstat.smog_index(x))
    tf['CLI_text'] = tf['content'].apply(
        lambda x: textstat.coleman_liau_index(x))
    tf['ARI_text'] = tf['content'].apply(
        lambda x: int(textstat.automated_readability_index(x)))
    tf['DC_text'] = tf['content'].apply(
        lambda x: textstat.dale_chall_readability_score(x))
    tf['Difficult_text_wc'] = tf['content'].apply(
        lambda x: textstat.difficult_words(x))

    #Hand-picked quantitative feature: count of percentage mentions (0-999, up to 3 decimals)
    percent_pattern = re.compile(r'((?:0|[1-9]\d{0,2})(?:\.\d{1,3})?)%')
    tf['Percent_occurrences'] = tf['content'].apply(
        lambda x: len(percent_pattern.findall(x)))

    #Polarity feature extraction from news headlines
    tf['Polarity_head'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Polarity'])
    tf['Subjectivity_head'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Subjectivity'])
    tf['Positive_head_wc'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Positive'])
    tf['Negative_head_wc'] = tf['title'].apply(
        lambda x: lm.get_score(lm.tokenize(x))['Negative'])
    tf['Total_head_wc'] = tf['title'].apply(lambda x: len(lm.tokenize(x)))
    tf['Negative_head_rate'] = tf['Negative_head_wc'] / tf['Total_head_wc']
    tf['Positive_head_rate'] = tf['Positive_head_wc'] / tf['Total_head_wc']

    #Readability feature extraction from news headlines
    tf['FRE_head'] = tf['title'].apply(
        lambda x: textstat.flesch_reading_ease(x))
    tf['FRE_tagged_head'] = tf['FRE_head'].apply(
        lambda x: 1 if x < 100 and x >= 90 else 2 if x < 90 and x >= 80 else 3
        if x < 80 and x >= 70 else 4 if x < 70 and x >= 60 else 5
        if x < 60 and x >= 50 else 6 if x < 50 and x >= 30 else 7)
    tf['FK_head'] = tf['title'].apply(
        lambda x: int(textstat.flesch_kincaid_grade(x)))
    tf['GFI_head'] = tf['title'].apply(lambda x: textstat.gunning_fog(x))
    tf['SMI_head'] = tf['title'].apply(lambda x: textstat.smog_index(x))
    tf['CLI_head'] = tf['title'].apply(
        lambda x: textstat.coleman_liau_index(x))
    tf['ARI_head'] = tf['title'].apply(
        lambda x: int(textstat.automated_readability_index(x)))
    tf['DC_head'] = tf['title'].apply(
        lambda x: textstat.dale_chall_readability_score(x))
    tf['Difficult_head_wc'] = tf['title'].apply(
        lambda x: textstat.difficult_words(x))

    #Predicting relevance class using these features in sorted order of confidence
    tf = tf.add_row_number()
    pred = model.classify(tf)
    pred = pred.add_row_number()
    relevant = pred[pred['class'] == 1]
    non_relevant = pred[pred['class'] == 0]
    if relevant.num_rows() > 10:
        relevant_news_out = tf.join(relevant).sort('probability',
                                                   ascending=False)[:10]
    else:
        relevant_news = relevant.sort('probability', ascending=False)
        req_num_non_relevant_news = 10 - relevant.num_rows()
        non_relevant_news = non_relevant.sort(
            'probability')[:req_num_non_relevant_news]
        relevant_news = relevant_news.append(non_relevant_news)
        relevant_news_out = tf.join(relevant_news)

    return relevant_news_out
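predict_relevance expects a pandas DataFrame with at least 'title' and 'content' columns, and needs 'lda1.model' and 'relevance_model_64feat' on disk; a hypothetical invocation mirroring the loading code above:

df = pd.read_csv('key_dev_news.txt', sep='\t', encoding='latin-1')
top_news = predict_relevance(df)  # SFrame of up to 10 articles, most confident first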
import pysentiment as ps

def sa_mcdlou(txt):
    lm = ps.LM()
    tokens = lm.tokenize(txt)
    score = lm.get_score(tokens)
    print(f"{txt[:50]}...{txt[-50:]}\n{score}")
    return score
def Score(a):
    import pysentiment as ps
    lm = ps.LM()
    tokens = lm.tokenize(a)
    return lm.get_score(tokens)['Polarity']
Example #11
import pysentiment as ps

def Score(a):
    lm = ps.LM()
    tokens = lm.tokenize(a)
    polScore = lm.get_score(tokens)['Polarity']
    return polScore
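The polarity score is a float roughly in [-1, 1] (positive minus negative dictionary hits over their total); a hypothetical call:

print(Score("Losses widened amid weak demand."))  # a negative value is expected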