Example #1
import csv

import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def update_lex():
    sia = SentimentIntensityAnalyzer()

    stock_lex = pd.read_csv("data/stock_lex.csv")
    stock_lex["sentiment"] = (stock_lex["Aff_Score"] + stock_lex["Neg_Score"]) / 2
    stock_lex = dict(zip(stock_lex.Item, stock_lex.sentiment))
    stock_lex = {k: v for k, v in stock_lex.items() if len(k.split(" ")) == 1}
    # Rescale the scores into VADER's typical [-4, 4] range; hoisting the
    # max/min out of the loop avoids recomputing them for every word.
    max_score = max(stock_lex.values())
    min_score = min(stock_lex.values())
    stock_lex_scaled = {}
    for k, v in stock_lex.items():
        if v > 0:
            stock_lex_scaled[k] = v / max_score * 4
        else:
            stock_lex_scaled[k] = v / min_score * -4

    positive = []
    with open("data/positive.csv", "r") as f:
        reader = csv.reader(f)
        for row in reader:
            positive.append(row[0].strip())

    negative = []
    with open("data/negative.csv", "r") as f:
        reader = csv.reader(f)
        for row in reader:
            negative.append(row[0].strip())

    # Merge: the scaled stock-lexicon scores take precedence, and the word
    # lists only fill in entries the stock lexicon does not cover. Note that
    # the default VADER lexicon is replaced entirely here.
    final_lex = {}
    final_lex.update(stock_lex_scaled)
    final_lex.update({word.lower(): 2.0 for word in positive if word.lower() not in final_lex})
    final_lex.update({word.lower(): -2.0 for word in negative if word.lower() not in final_lex})
    sia.lexicon = final_lex
    return sia
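
A minimal usage sketch for the helper above (it assumes the three CSV files
exist under data/ and that the NLTK vader_lexicon resource has been
downloaded; the sample headline is made up):

sia = update_lex()
print(sia.polarity_scores("Shares rallied after the upbeat earnings report"))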

Example #2

import csv

import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def get_sentiment_analyzer():
    sia = SentimentIntensityAnalyzer()

    # stock market lexicon
    stock_lex = pd.read_csv('stock_lex.csv')
    stock_lex['sentiment'] = (stock_lex['Aff_Score'] +
                              stock_lex['Neg_Score']) / 2
    stock_lex = dict(zip(stock_lex.Item, stock_lex.sentiment))
    stock_lex = {k: v for k, v in stock_lex.items() if len(k.split(' ')) == 1}
    # Hoist the max/min out of the loop to avoid recomputing them per word.
    max_score = max(stock_lex.values())
    min_score = min(stock_lex.values())
    stock_lex_scaled = {}
    for k, v in stock_lex.items():
        if v > 0:
            stock_lex_scaled[k] = v / max_score * 4
        else:
            stock_lex_scaled[k] = v / min_score * -4

    # Loughran-McDonald financial sentiment word lists
    positive = []
    with open('lm_positive.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            positive.append(row[0].strip())

    negative = []
    with open('lm_negative.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            entry = row[0].strip().split(" ")
            if len(entry) > 1:
                negative.extend(entry)
            else:
                negative.append(entry[0])

    # Merge order matters: later updates win, so the scaled stock lexicon
    # overrides the word lists, and the default VADER lexicon overrides both.
    final_lex = {}
    final_lex.update({word: 2.0 for word in positive})
    final_lex.update({word: -2.0 for word in negative})
    final_lex.update(stock_lex_scaled)
    final_lex.update(sia.lexicon)
    sia.lexicon = final_lex

    return sia
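
Unlike Example #1, this version keeps the default VADER lexicon and lets it
win on conflicts, because later dict.update calls overwrite earlier keys. A
quick demonstration of that precedence:

d = {"bull": 2.0}
d.update({"bull": 3.1})
print(d["bull"])  # prints 3.1 -- the later update wins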
Example #3
# Fragment of a Spark job: selectDF and the get_date / get_datetime UDFs are
# defined earlier in the script.
selectDF_hdfs = selectDF.withColumn("date",
                                    get_date("datetime").cast("Timestamp"))

selectDF_es = selectDF.withColumn("date", get_datetime("col.datetime").cast("String")) \
        .withColumnRenamed("related", "symbol")

######################## sentiment analysis of news ############################
sia = SentimentIntensityAnalyzer()

#update lexicon
with open('lexicon_data/final_lex.json', 'r') as fp:
    final_lex = json.load(fp)

final_lex.update(sia.lexicon)
sia.lexicon = final_lex


def sentiment_analysis(txt1, txt2):
    # Null columns arrive as None inside a UDF; guard before concatenating.
    text = (txt1 or '') + ' ' + (txt2 or '')
    return sia.polarity_scores(text)['compound']


sentiment_analysis_udf = udf(sentiment_analysis, FloatType())

selectDF_es = selectDF_es.withColumn(
    "sentiment_score",
    sentiment_analysis_udf(selectDF_es['headline'], selectDF_es['summary']))

###############################################################################
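
A minimal local sketch for sanity-checking the UDF outside the streaming job;
the SparkSession setup and the sample rows are assumptions, not part of the
original pipeline:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("sentiment-test").getOrCreate()
# Hypothetical sample rows with the two columns the UDF expects.
df = spark.createDataFrame(
    [("Stocks surge on earnings beat", "Guidance raised for the year"),
     ("Shares slump after downgrade", "Analysts cut price targets")],
    ["headline", "summary"])
df.withColumn("sentiment_score",
              sentiment_analysis_udf(df["headline"], df["summary"])).show(truncate=False)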
Example #4
# (positive, stock_lex_scaled and vader are built earlier in the same script,
# much as in Example #2.)
negative = []
with open('lm_negative.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        entry = row[0].strip().split(" ")
        if len(entry) > 1:
            negative.extend(entry)
        else:
            negative.append(entry[0])

final_lex = {}
final_lex.update({word: 2.0 for word in positive})
final_lex.update({word: -2.0 for word in negative})
final_lex.update(stock_lex_scaled)
final_lex.update(vader.lexicon)
vader.lexicon = final_lex

scores = result['headline'].apply(vader.polarity_scores).tolist()
scores_df = pd.DataFrame(scores)
result = result.join(scores_df, rsuffix='_right')
result = result.drop(['neg', 'neu', 'pos', 'date', 'ticker'], axis=1)

# Map compound scores to class labels using the usual VADER thresholds:
# compound >= 0.05 is positive, <= -0.05 is negative, otherwise neutral.
result["label"] = 'Neutral'
result.loc[result.compound <= -0.05, "label"] = 'Negative'
result.loc[result.compound >= 0.05, "label"] = 'Positive'
result = result.drop(columns=["compound"])
result.label = result.label.astype(str)

with open('dataset.txt', 'w') as f:
    result.to_string(f, col_space=4, index=False)

# After this step, fix the dataset's text layout in a text editor so that each
# line has the structure: headline -> tab -> headline label (pos, neg, neu).
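
The manual post-editing step can be avoided by writing the tab-separated
layout directly; a sketch using the columns produced above:

result[["headline", "label"]].to_csv("dataset.txt", sep="\t", index=False, header=False)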
Example #5
positive_words = pd.read_csv(url_pos,
                             names=colnames,
                             header=None,
                             delim_whitespace=True)
negative_words = pd.read_csv(url_neg,
                             names=colnames,
                             header=None,
                             delim_whitespace=True)

positive = positive_words['word'].to_list()
negative = negative_words['word'].to_list()

custom_lexicon = {}
custom_lexicon.update({word: 2.0 for word in positive})
custom_lexicon.update({word: -2.0 for word in negative})
custom_lexicon.update(analyzer.lexicon)
analyzer.lexicon = custom_lexicon
# End VADER Lexicon customization

# Dictionary to hold score and date results

scores = {}

# Iterate through the search result pages.
# The minimum is range(1, 2), which only goes back to late Feb;
# range(1, 10) gives the most useful analysis with a still-reasonable runtime.
for i in range(1, 10):
    page = urlopen('https://www.businesstimes.com.sg/search/microsoft?page=' +
                   str(i)).read()
    soup = BeautifulSoup(page, features="html.parser")
    # Find the html tag matching <div class="media-body">
    # This is the Headline, Paragraph, and Link for search results aka 'post'
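
A sketch of how the loop body might continue; the media-body selector comes
from the comment above, while the headline extraction and the use of the
analyzer and scores names from earlier in the example are assumptions:

    # Score each search-result headline and collect it in the scores dict.
    for post in soup.find_all('div', class_='media-body'):
        headline = post.get_text(strip=True)
        scores[headline] = analyzer.polarity_scores(headline)['compound']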
Example #6
# (Body of an outer tuning loop that sweeps the scaling factors test_p and
# test_n; sid, positive, negative, av_p, av_n, contents, comp, value_n and
# value_p are initialised before the loop.)
    for word in positive:
        check = sid.polarity_scores(word)['compound']
        if check >= 0.1:
            pos.update({word: (check * test_p)})
        else:
            pos.update({word: (av_p * test_p)})

    for word in negative:
        check = sid.polarity_scores(word)['compound']
        if check <= -0.1:
            neg.update({word: (check * test_n)})
        else:
            neg.update({word: (av_n * test_n)})

    final_Lex = {}
    final_Lex.update(sid.lexicon)
    final_Lex.update(pos)
    final_Lex.update(neg)
    sid.lexicon = final_Lex
    scores = sid.polarity_scores(contents)
    if (scores['compound'] > comp) and (scores['compound'] > 0):
        comp = scores['compound']
        value_n = test_n
        value_p = test_p
    test_p = test_p + 0.001
    test_n = test_n - 0.001

print(comp)
print(value_n)
print(value_p)
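
The outer loop header is missing from this excerpt; judging by the updates at
the bottom of the body, it is a brute-force sweep over candidate scaling
factors. A plausible scaffold, in which the initial values and the step count
are pure assumptions:

test_p, test_n = 1.0, -1.0   # assumed starting scaling factors
comp, value_p, value_n = 0.0, None, None
for step in range(1000):     # assumed number of sweep steps
    # ... loop body from the snippet above ...
    pass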

Example #7

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from finnews.config import config
import pickle
import os


PROJECT_DIR = os.path.join(os.path.dirname(__file__), '..', '..')


# Load a pre-built lexicon from disk and install it on the analyzer.
vader_sia = SentimentIntensityAnalyzer()
with open(os.path.join(config['data_dir'], 'vader_lexicon.pkl'), 'rb') as lex_file:
    vader_sia.lexicon = pickle.load(lex_file)


def score_vader(text):
    return vader_sia.polarity_scores(text)['compound']
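
A minimal usage sketch (the sample headline is made up):

print(score_vader("Shares plunged after the company cut its full-year guidance"))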