def update_lex():
    """Build a SentimentIntensityAnalyzer with a finance-specific lexicon.

    The lexicon is assembled from:
      * data/stock_lex.csv -- scored stock-market terms (Aff_Score/Neg_Score),
      * data/positive.csv  -- generic positive words (score +2.0),
      * data/negative.csv  -- generic negative words (score -2.0).

    Returns:
        SentimentIntensityAnalyzer: analyzer whose ``lexicon`` attribute has
        been replaced entirely by the merged finance lexicon (VADER's
        built-in lexicon is NOT retained -- matches the original behavior).
    """
    sia = SentimentIntensityAnalyzer()

    # Stock-lexicon sentiment = mean of the affirmative and negative scores.
    stock_lex = pd.read_csv("data/stock_lex.csv")
    stock_lex["sentiment"] = (stock_lex["Aff_Score"] + stock_lex["Neg_Score"]) / 2
    stock_lex = dict(zip(stock_lex.Item, stock_lex.sentiment))
    # VADER matches single tokens, so keep single-word entries only.
    stock_lex = {k: v for k, v in stock_lex.items() if len(k.split(" ")) == 1}

    # Rescale scores into VADER's [-4, 4] range.  max/min are hoisted out of
    # the loop: the original recomputed them for every entry (accidental
    # O(n^2) over the lexicon size).
    max_score = max(stock_lex.values())
    min_score = min(stock_lex.values())
    stock_lex_scaled = {
        k: (v / max_score * 4 if v > 0 else v / min_score * -4)
        for k, v in stock_lex.items()
    }

    positive = _read_word_list("data/positive.csv")
    negative = _read_word_list("data/negative.csv")

    # Stock-lexicon scores take precedence over the generic +/-2.0 lists
    # (words already present are skipped by the `not in final_lex` guards).
    final_lex = {}
    final_lex.update(stock_lex_scaled)
    final_lex.update({word.lower(): 2.0 for word in positive
                      if word.lower() not in final_lex})
    final_lex.update({word.lower(): -2.0 for word in negative
                      if word.lower() not in final_lex})
    sia.lexicon = final_lex
    return sia


def _read_word_list(path):
    """Return the stripped first column of a one-word-per-row CSV file."""
    words = []
    with open(path, "r") as f:
        for row in csv.reader(f):
            words.append(row[0].strip())
    return words
def get_sentiment_analyzer():
    """Build a SentimentIntensityAnalyzer with a finance-augmented lexicon.

    Merges, in increasing order of precedence:
      1. Loughran-McDonald positive words (+2.0) and negative words (-2.0),
      2. stock_lex.csv single-word terms, rescaled into VADER's [-4, 4],
      3. VADER's own built-in lexicon (merged last, so it always wins).

    Returns:
        SentimentIntensityAnalyzer: analyzer with the merged lexicon installed.
    """
    sia = SentimentIntensityAnalyzer()

    # Stock market lexicon: sentiment = mean of affirmative/negative scores.
    stock_lex = pd.read_csv('stock_lex.csv')
    stock_lex['sentiment'] = (stock_lex['Aff_Score'] + stock_lex['Neg_Score']) / 2
    stock_lex = dict(zip(stock_lex.Item, stock_lex.sentiment))
    # VADER matches single tokens, so drop multi-word entries.
    stock_lex = {k: v for k, v in stock_lex.items() if len(k.split(' ')) == 1}

    # Rescale into VADER's [-4, 4] range.  max/min are hoisted out of the
    # loop: the original recomputed them per entry (accidental O(n^2)).
    max_score = max(stock_lex.values())
    min_score = min(stock_lex.values())
    stock_lex_scaled = {
        k: (v / max_score * 4 if v > 0 else v / min_score * -4)
        for k, v in stock_lex.items()
    }

    # Loughran and McDonald word lists.
    positive = []
    with open('lm_positive.csv', 'r') as f:
        for row in csv.reader(f):
            positive.append(row[0].strip())

    negative = []
    with open('lm_negative.csv', 'r') as f:
        for row in csv.reader(f):
            # Some rows pack several space-separated words; extend() flattens
            # them and handles the single-word case identically.
            negative.extend(row[0].strip().split(" "))

    # Later updates win: word lists < stock lexicon < built-in VADER lexicon.
    final_lex = {}
    final_lex.update({word: 2.0 for word in positive})
    final_lex.update({word: -2.0 for word in negative})
    final_lex.update(stock_lex_scaled)
    final_lex.update(sia.lexicon)
    sia.lexicon = final_lex
    return sia
# Derive two views of the upstream DataFrame: a Timestamp-dated copy for HDFS
# and a String-dated, symbol-renamed copy for Elasticsearch.
# NOTE(review): selectDF, get_date, get_datetime, udf and FloatType are all
# defined outside this view (presumably PySpark + project UDF helpers).
selectDF_hdfs = selectDF.withColumn("date", get_date("datetime").cast("Timestamp"))
selectDF_es = selectDF.withColumn("date",get_datetime("col.datetime").cast("String")) \
    .withColumnRenamed("related","symbol")

######################## sentiment analysis of news ############################
sia = SentimentIntensityAnalyzer()
# update lexicon: load the pre-built finance lexicon and let VADER's built-in
# entries override it (built-in lexicon is merged last, so it takes precedence)
with open('lexicon_data/final_lex.json', 'r') as fp:
    final_lex = json.load(fp)
final_lex.update(sia.lexicon)
sia.lexicon = final_lex

def sentiment_analysis(txt1, txt2):
    """Return VADER's compound score for the concatenation of two texts.

    Used as a Spark UDF over (headline, summary) pairs; closes over the
    module-level `sia` analyzer configured above.
    """
    text = txt1 + ' ' + txt2
    return sia.polarity_scores(text)['compound']

sentiment_analysis_udf = udf(sentiment_analysis, FloatType())

# Append a per-row sentiment score computed from headline + summary.
selectDF_es = selectDF_es.withColumn(
    "sentiment_score",
    sentiment_analysis_udf(selectDF_es['headline'], selectDF_es['summary']))
###############################################################################
# Loughran-McDonald negative word list.  Some rows contain several
# space-separated words; extend() flattens them and handles single-word rows
# identically to the original if/else.
# NOTE(review): `positive`, `stock_lex_scaled`, `vader` and `result` are
# defined earlier, outside this view.
negative = []
with open('lm_negative.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        negative.extend(row[0].strip().split(" "))

# Merge lexicons; later updates win: word lists < stock lexicon < VADER's
# built-in lexicon.
final_lex = {}
final_lex.update({word: 2.0 for word in positive})
final_lex.update({word: -2.0 for word in negative})
final_lex.update(stock_lex_scaled)
final_lex.update(vader.lexicon)
vader.lexicon = final_lex

# Score every headline and join the score columns onto the DataFrame, then
# drop everything except the compound score and the headline text.
scores = result['headline'].apply(vader.polarity_scores).tolist()
scores_df = pd.DataFrame(scores)
result = result.join(scores_df, rsuffix='_right')
result = result.drop(['neg', 'neu', 'pos', 'date', 'ticker'], axis=1)


def _label(compound):
    """Map a numeric compound score to a class using VADER's +/-0.05 cutoffs."""
    if compound <= -0.05:
        return 'Negative'
    if compound >= 0.05:
        return 'Positive'
    return 'Neutral'


# BUG FIX: the original marked "neutral" rows with the mask
#     (result.compound != 1) & (result.compound != 0)
# which is also True for the 'Negative'/'Positive' strings it had just
# assigned, so every row was overwritten with 'Neutral'.  Computing the label
# directly from the numeric score avoids the string/number mixing entirely.
result['compound'] = result['compound'].apply(_label)
result = result.rename(columns={"compound": "label"})
# BUG FIX: `astype(string)` raised NameError -- the builtin type is `str`.
result.label = result.label.astype(str)

with open('dataset.txt', 'w') as f:
    result.to_string(f, col_space=4, index=None)
# After this step, fix the dataset's text layout in a text editor so each line
# has the structure: headline -> tab -> headline label (pos, neg, neu).
# NOTE(review): this chunk begins mid-statement -- the first line below is the
# tail of a pd.read_csv(...) call (positive word list) whose opening lies
# outside this view; `colnames`, `url_neg` and `analyzer` are also defined
# outside this view.
    names=colnames, header=None, delim_whitespace=True)
negative_words = pd.read_csv(url_neg, names=colnames, header=None, delim_whitespace=True)

# Flatten the word columns into plain Python lists.
positive = positive_words['word'].to_list()
negative = negative_words['word'].to_list()

# Customize the VADER lexicon: +/-2.0 for the word lists; the built-in
# lexicon is merged last, so existing VADER entries keep their own scores.
custom_lexicon = {}
custom_lexicon.update({word: 2.0 for word in positive})
custom_lexicon.update({word: -2.0 for word in negative})
custom_lexicon.update(analyzer.lexicon)
analyzer.lexicon = custom_lexicon
# End VADER Lexicon customization

# Dictionary to hold score and date results
scores = {}

# Iterate through search results pages (network I/O: one HTTP request per
# page, each parsed with BeautifulSoup).
# Minimum value is "(1,'2')" this will go back to late Feb
# For most useful analysis with a still reasonable runtime "range(1,10)"
for i in range(1, 10):
    page = urlopen('https://www.businesstimes.com.sg/search/microsoft?page=' + str(i)).read()
    soup = BeautifulSoup(page, features="html.parser")
    # Find the html tag matching <div class="media-body">
    # This is the Headline, Paragraph, and Link for search results aka 'post'
# One iteration of a search over lexicon scaling factors: rebuild the
# positive/negative word scores with the current factors (test_p, test_n) and
# keep the pair that maximizes the compound score of `contents`.
# NOTE(review): positive, negative, pos, neg, sid, test_p, test_n, av_p, av_n,
# contents, comp, value_n and value_p are all defined outside this view; the
# prints at the end suggest this whole section sits inside an enclosing loop.
for word in positive:
    # Words VADER already rates >= 0.1 keep their own (scaled) score; weakly
    # rated words fall back to the average positive score av_p instead.
    check = sid.polarity_scores(word)['compound']
    if (check) >= 0.1:
        pos.update({word: (check * test_p)})
    else:
        pos.update({word: (av_p * test_p)})

for word in negative:
    # Mirror of the positive case with a -0.1 threshold and av_n fallback.
    check = sid.polarity_scores(word)['compound']
    if check <= -0.1:
        neg.update({word: (check * test_n)})
    else:
        neg.update({word: (av_n * test_n)})

# Install the candidate lexicon: built-in VADER entries first, then the
# rescaled positive and negative words override them.
final_Lex = {}
final_Lex.update(sid.lexicon)
final_Lex.update(pos)
final_Lex.update(neg)
sid.lexicon = final_Lex

# Score the document; remember the factors whenever the (positive) compound
# score improves on the best seen so far.
scores = sid.polarity_scores(contents)
if (scores['compound'] > comp) and (scores['compound'] > 0):
    comp = scores['compound']
    value_n = test_n
    value_p = test_p

# Step the factors for the next iteration: positive weight up, negative down.
test_p = test_p + 0.001
test_n = test_n - 0.001

print(comp)
print(value_n)
print(value_p)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from finnews.config import config
import pickle
import os

# Repository root, two levels above this module's directory.
PROJECT_DIR = os.path.join(os.path.dirname(__file__), '..', '..')

# Module-level analyzer, configured once at import time with a pre-built
# custom lexicon shipped in the project's data directory.
# NOTE(review): pickle.load executes arbitrary code when fed untrusted data;
# this is acceptable only because the lexicon file is a trusted project asset.
vader_sia = SentimentIntensityAnalyzer()
with open(os.path.join(config['data_dir'], 'vader_lexicon.pkl'), 'rb') as lex_file:
    vader_sia.lexicon = pickle.load(lex_file)


def score_vader(text):
    """Return VADER's compound polarity score for *text* using the
    module-level analyzer with the custom lexicon loaded above."""
    return vader_sia.polarity_scores(text)['compound']