def main():
    myData = Data()
    evaluate = Sentiment()
    companies = myData.getCompanies()
    myScraper = Scraper(companies)
    url = str(input("Input a newspaper url: "))
    #url = "https://www.marketwatch.com/"
    #url = "https://www.cnbc.com/investing/"
    #url = "https://www.ccn.com/"
    myScraper.getArticleLinks(url)
    occurances = myScraper.filterByCompany()
    occurances = occurances[:5]
    while True:
        for occurance in occurances:
            name = occurance[0]
            num = occurance[1]
            if num > 0:
                print(name + " has " + str(num) + " articles")
        company = str(input("Which company are you interested in: "))
        headlines = myScraper.findRelatedArticles(company)
        print("We found these articles from the past 7 days: ")
        i = 1
        for headline in headlines:
            print(str(i) + " - " + headline)
            i += 1
        interest = int(input("Which article would you like to analyze? "))
        article = myScraper.relatedArticles[interest - 1]
        cleanArticle = myScraper.parseArticle(article.url)
        print("The stats of the article are: ")
        evaluate.rankSentenceScores(cleanArticle, myData.getBasicDictionary())
def test():
    s = Sentiment()
    r = Region(
        "/Users/aaronhe/Documents/NutStore/Aaron He/FDU/Big-Data-Communication/Stereotype-Analysis-in-NetEase-News-Comments/Dict/region_dict/region.txt"
    )

    # Build the sample input data
    text = [
        ["潮汕人很帅,湖北人挺会做生意的!", "上海"],
        ["老铁牛逼!", "重庆"],
        ["我觉得很好吃啊", "北京"],
    ]
    df = pd.DataFrame(text, columns=["text", "src"])
    print(df.head())

    df = r.region_detect(df, on=["text"])  # add region fields to the DataFrame in batch
    print(
        s.sentiment_detect(df,
                           on=["text"],
                           srcs=["src"],
                           dists=["region_1", "region_2", "region_3"]))
    print(s.output_record(src="北京"))
def CompareAngels(angel1, angel2, sentences):
    if angel1 is None or angel2 is None:
        return
    label1 = Sentiment.GetSentimentClass(angel1.PredictReviewScore(sentences))
    label2 = Sentiment.GetSentimentClass(angel2.PredictReviewScore(sentences))
    if label1 != label2:
        angel1.DumpDetails(sentences)
        angel2.DumpDetails(sentences)
class Conversation:
    def __init__(self, path, interval="month"):
        with open(path, 'r') as f:
            self.__soup = BeautifulSoup(f.read(), "html.parser")
        self.messages = []
        self.name = self.__soup.find("title").text.replace("Conversation with ", "")
        message_headers = self.__soup.find_all("div", class_="message_header")
        self.__span_meta = [
            m.find("span", class_="meta").text for m in message_headers
        ]
        self.__fbt = FbTime(self.__span_meta)
        for m in self.__soup.find_all("div", class_="message"):
            span = m.find("span", class_="meta")
            self.messages.append(
                Message(
                    m.find("span", class_="user").text,
                    self.__fbt.span_meta_to_date(span.text, interval),
                    span.text,
                    m.next_sibling.text))
        self.__sent = Sentiment(self.messages, self.__fbt)
        self.participants = self.__scrape_participants()

    def interaction_freq(self):
        times = self.__fbt.generate_time_dict()
        for date_str in self.__span_meta:
            time = date_str.split("at ")[1][:5]
            hour = time.split(":")[0]
            times[hour + ":00"] += 1
        return times

    def interaction_timeline(self, name):
        dates = self.__fbt.generate_date_dict()
        for message in self.messages:
            if message.name == name:
                dates[message.date] += 1
        return dates

    def sentiment_timeline(self, name):
        return self.__sent.sentiment_timeline(name)

    def avg_sentiment(self, name):
        return self.__sent.avg_sentiment(name)

    def __scrape_participants(self):
        users = []
        for user_span in self.__soup.find_all("span", "user"):
            user_name = user_span.text
            if user_name not in users:
                users.append(user_name)
        return users
def text_api_doc(request):
    text = str(request.GET['text'])
    usecase = str(request.GET['usecase'])
    modelId = request.GET['modelId']
    modelId = eval(modelId)
    clientId = str(request.GET['clientId'])
    params = str(request.GET['params'])
    modelMap_df = pd.read_csv(projectDir + "/config/modelMapping.csv")
    if modelId != None:
        usecaseMap = ''.join(modelMap_df.ix[modelMap_df['modelId'] == modelId, 'usecase'].tolist())
        modelName = modelMap_df.ix[modelMap_df['modelId'] == modelId, 'modelName'].tolist()
        if text == '':
            results = {'Predicted Category': None, 'Prediction Score': None}
            return HttpResponse(json.dumps(results))
        if usecase.lower() != usecaseMap.lower():
            return HttpResponse("<h1 style='color: red'>Please provide the correct usecase for the modelId:%s<h1>" % modelId)
        if usecase.lower() not in ['sentiment', 'survey']:
            folderName = modelName
            folderName = baseDir + "/" + ''.join(folderName) + "/out"
            try:
                skl = SklCat()
                transformedXTest = skl.run_text_vectorizeTransform([text], folderName=folderName,
                                                                   vectorizerFile='FeatureTransformer.p', pickler=None)
                y_pred, y_prob_max = skl.run_text_execute_api_doc(transformedXTest, folderName=folderName,
                                                                  modelFile='Classifier.p', pickler=None,
                                                                  modelType='classification')
                results = {'Predicted Category': list(y_pred), 'Prediction Score': list(y_prob_max)}
            except Exception as e:
                return HttpResponse("<h1 style='color: red'>Error Occurred:%s</h1>" % e)
        elif usecase.lower() == 'sentiment':
            sentimentPattern = modelName
            sentimentPattern = ''.join(sentimentPattern)
            try:
                emo = Sentiment()
                results = emo.run(text, modelType=sentimentPattern, args=[], kwargs={})
            except Exception as e:
                return HttpResponse("<h1 style='color: red'>Error Occurred:%s</h1>" % e)
        elif usecase.lower() == 'survey':
            ruleBaseFile = projectDir + "/config/ruleBase_v3.csv"
            try:
                sa = SurveyAnalysis()
                results = sa.run(text, ruleBaseFile=ruleBaseFile)
            except Exception as e:
                return HttpResponse("<h1 style='color: red'>Error Occurred:%s</h1>" % e)
        else:
            return HttpResponse("<h1 style='color: red'>Please provide the proper usecase..</h1>")
        return HttpResponse(json.dumps(results))
    else:
        return HttpResponse("<h1>Please provide the modelId in URL</h1>")
class Conversation:
    def __init__(self, path, interval="month"):
        with open(path, 'r') as f:
            self.__soup = BeautifulSoup(f.read(), "html.parser")
        self.messages = []
        self.name = self.__soup.find("title").text.replace("Conversation with ", "")
        message_headers = self.__soup.find_all("div", class_="message_header")
        self.__span_meta = [
            m.find("span", class_="meta").text for m in message_headers
        ]
        self.__fbt = FbTime(self.__span_meta)
        for m in self.__soup.find_all("div", class_="message"):
            span = m.find("span", class_="meta")
            self.messages.append(
                Message(
                    m.find("span", class_="user").text,
                    self.__fbt.span_meta_to_date(span.text, interval),
                    span.text,
                    m.next_sibling.text))
        self.__sent = Sentiment(self.messages, self.__fbt)
        self.participants = self.__scrape_participants()

    def interaction_freq(self):
        return self.__fbt.interaction_freq()

    def interaction_timeline(self, name):
        return self.__fbt.interaction_timeline(name, self.messages)

    def sentiment_timeline(self, name, interval):
        return self.__sent.sentiment_timeline(name, interval)

    def avg_sentiment(self, name):
        return self.__sent.avg_sentiment(name)

    def get24HourTime(self, elem):
        return self.__fbt.get24HourTime(elem)

    # Returns a list of participants in the conversation.
    def __scrape_participants(self):
        users = []
        for user_span in self.__soup.find_all("span", "user"):
            user_name = user_span.text
            if user_name not in users:
                users.append(user_name)
        return users
def main():
    # Load the data
    date = sys.argv[1]
    path_prefix = "./new_data"
    df = pd.read_pickle(os.path.join(path_prefix, "%s_select_comments.p" % date))

    # Load the model
    s = Sentiment()
    df = s.sentiment_detect(df,
                            on=["content"],
                            srcs=["province"],
                            dists=["region_1", "region_2", "region_3"])
    df_freq = s.table_record()

    # Save the results
    df.to_pickle(os.path.join(path_prefix, "%s_sentiment.p" % date))
    df_freq.to_pickle(os.path.join(path_prefix, "%s_senti_freq.p" % date))
    print(df)
    print(df_freq)
def cut(self, infile, word_outile=None, nominal_outfile=None):
    '''
    infile: input file
    word_outfile: output file for the segmented words
    nominal_outfile: output file for the part-of-speech results
    '''
    try:
        contents = self.cut_to_sentence(infile)
        writer_word = None
        writer_nominal = None
        if word_outile:
            writer_word = open(word_outile, 'w')
        if nominal_outfile:
            writer_nominal = open(nominal_outfile, 'w')
        sentiment = [0, 0, 0]
        if writer_word:
            for content in contents:
                words = list(set(jieba.cut(content, cut_all=True)) - self.stop_word)
                words = filter(lambda x: len(x) > 0, words)
                if len(words) > 0:
                    tmp = reduce(
                        lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]),
                        map(lambda x: Sentiment.get_sentiment(x), words))
                    sentiment[0] += tmp[0]
                    sentiment[1] += tmp[1]
                    sentiment[2] += tmp[2]
                    writer_word.write(' '.join(
                        map(lambda w: w.encode('utf8', 'ignore'), words)) + '\n')
            writer_word.close()
        stop_flag = set(['p', 'x', 'm', 'd', 'c', 'a'])
        if writer_nominal:
            for content in contents:
                words = pseg.cut(content)
                for w in words:
                    if (w.word not in self.stop_word) and (w.flag not in stop_flag):
                        writer_nominal.write(
                            w.word.encode('utf8', 'ignore') + ' ' +
                            w.flag.encode('utf8', 'ignore') + ' ' +
                            unicode(hash(content)).encode('utf8', 'ignore') + '\n')
            writer_nominal.close()
        return sentiment
    except Exception, e:
        traceback.print_exc()
        print 'cut error: %s !' % e
def main():
    ticker = raw_input("\n\n\n----------------------------------------------\nWelcome. Ready to trade? Pick a stock ticker: ")

    reuterObj = ReutersQuery()
    reuterVector = reuterObj.getQuery(ticker)

    sentimentObj = Sentiment()
    sentiments = sentimentObj.sentimentVectorize(reuterVector)

    yahooObj = YahooQuery()
    yahooVector = yahooObj.doYahooQuery(ticker, reuterVector)

    reuterDates = DateFormat()
    dates = reuterDates.fixDates(reuterVector)

    mergeObj = Merge()
    merged = mergeObj.mergeEverything(sentiments, yahooVector, dates)

    strategyObj = Strategy()
    metrics = strategyObj.runStrategy(ticker, merged)

    outputObj = Output()
    outputObj.putOutput(ticker, metrics, yahooVector, merged)

    print '\nThanks for trading with Vivek! Get money, get paid!'
def predict(self, filePath):
    # PREDICT
    lexicon = util.LoadLexiconFromCSV(
        "../files/lexicons/SentiWordNet_Lexicon_concise.csv")
    angel = Angel(lexicon, True)
    parsedReviewsPath = os.path.join(os.path.dirname(filePath), "YelpParsedReviews.json")
    with open(parsedReviewsPath, 'r') as file:
        TrainingFile = file.read()
    classificationData = json.loads(TrainingFile)
    for k in range(len(classificationData["ClassificationModel"])):
        current = classificationData["ClassificationModel"][str(k + 1)]
        notCount = current["NotCount"]
        if "Sentences" in current:
            if not isinstance(current["Sentences"], list):
                current["Sentences"] = [current["Sentences"]]
            sentences = current["Sentences"]
        else:
            continue
        current["Label"] = Sentiment.GetSentimentClass(
            angel.PredictReviewScore(sentences, notCount), 1)
        angel.DumpDetails(sentences, current["Label"])
    return classificationData
def __init__(self):
    Sentiment.__init__(self, testMode=False)
"Alcoa upgraded to Buy from Hold at Deutsche Bank", 'Peter Thiel reports 6.6% passive stake in Palantir', 'Senvest Management reports 5.54% passive stake in GameStop', 'Citron shorting Palantir, sees $20 stock by end of 2020', 'Amazon.com assumed with an Outperform at Wolfe Research', 'Alcoa initiated with a Sell at Goldman Sachs', 'Palantir initiated with a Market Perform at William Blair', 'Alcoa options imply 10.0% move in share price post-earnings', 'GameStop believes it has sufficient liquidity to fund operations', 'GameStop says \'Reboot\' is delivering lower costs, reduced debt', 'Palantir, Rio Tinto sign multi-year enterprise partnership', 'PG&E begins deployment of Palantir\'s Foundry Software', 'Citi says sell Palantir on deceleration in growth, upcoming lockup expiry', 'Pentagon cybersecurity project slowed by flaws, Bloomberg says', 'Palantir awarded contract from Army worth up to $250M', 'Palantir awarded $44.4M contract from FDA', 'Fujitsu signs $8M contract as Palantir Foundry customer', 'Army Vantage reaffirms Palantir partnership with $114M agreement', 'Palantir provides update on partnership with Greece to support COVID-19 response', 'Palantir receives Army prototype contract to support network modernization', 'Soros takes stake in Palantir, exits TransDigm position', "AOC flags \'material risks\' to Palantir investors in SEC letter, TechCrunch says", "PetIQ \'a smart way\' to play pet care boom, Barron's says", ] for i in infos: print('') print(i) s = Sentiment() s.Parse(i, sid=sid, nlp=nlp) print(s)
def text_api(request):
    text = str(request.GET['text'])
    usecase = str(request.GET['usecase'])
    modelId = request.GET['modelId']
    modelId = eval(modelId)
    clientId = str(request.GET['clientId'])
    params = str(request.GET['params'])
    modelMap_df = pd.read_csv(projectDir + "/config/modelMapping.csv")
    results = []
    records = eval(text)
    if modelId != None:
        usecaseMap = ''.join(modelMap_df.ix[modelMap_df['modelId'] == modelId, 'usecase'].tolist())
        modelName = modelMap_df.ix[modelMap_df['modelId'] == modelId, 'modelName'].tolist()
        if text == '':
            results = {'Predicted Category': None, 'Prediction Score': None}
            return HttpResponse(json.dumps(results))
        if usecase.lower() != usecaseMap.lower():
            return HttpResponse("<h1 style='color: red'>Please provide the correct usecase for the modelId:%s<h1>" % modelId)
        if usecase.lower() not in ['sentiment', 'survey', 'ltv']:
            df = pd.read_json(json.dumps(records))  # converting json string into dataframe
            folderName = modelName
            folderName = baseDir + "/" + ''.join(folderName) + "/out"
            try:
                skl = SklCat()
                #df = df.ix[df['lineBy']==2,:]  # filtering customer lines
                df['chat_intent'] = df['lineText'].apply(
                    lambda x: skl.run_text_vectorizeTransform([str(x)], folderName=folderName,
                                                              vectorizerFile='FeatureTransformer.p', pickler=None)
                ).apply(
                    lambda x: skl.run_text_execute_api_bulkDoc(x, folderName=folderName, modelFile='Classifier.p',
                                                               pickler=None, modelType='classification'))
                dfResults = df.ix[:, ['sessionId', 'chat_intent']]
                results = dfResults.to_json(orient='records')
                '''
                for doc in records:
                    text = doc['chat_text']
                    transformedXTest = skl.run_text_vectorizeTransform([text], folderName=folderName, vectorizerFile='FeatureTransformer.p', pickler=None)
                    y_pred, y_prob_max = skl.run_text_execute(transformedXTest, folderName=folderName, modelFile='Classifier.p', pickler=None, modelType='classification')
                    output = {'session_id': doc['session_id'], 'chat_text': text, 'Predicted Category': list(y_pred), 'Prediction Score': list(y_prob_max)}
                    results.append(output)
                '''
                return HttpResponse(json.dumps(results))
            except Exception as e:
                return HttpResponse("<h1 style='color: red'>Error Occurred:%s</h1>" % e)
        elif usecase.lower() == 'sentiment':
            sentimentPattern = modelName
            sentimentPattern = ''.join(sentimentPattern)
            records = eval(text)
            try:
                emo = Sentiment()
                results = []
                jsonResults = {}
                for doc in records:
                    jsonTemp = {}
                    jsonTemp['sessionId'] = doc['sessionId']
                    df_chat_data = pd.read_json(json.dumps(doc['chat_data']))
                    df_chat_data['sentiment'] = df_chat_data['lineText'].apply(
                        lambda row: emo.run(row, modelType=sentimentPattern, args=[], kwargs={}))
                    df = df_chat_data.ix[:, ['lineNum', 'sentiment']]
                    jsonObj = df.to_json(orient='records')
                    each_line = json.loads(json.dumps(jsonObj))
                    jsonTemp['each_line'] = each_line
                    chat_sentiment = {}
                    chat_sentiment['overall'] = overallSentiment(df_chat_data['sentiment'])
                    lastlines = 2
                    chat_sentiment['last_line_sentiment'] = df_chat_data['lineText'].tail(lastlines).apply(
                        lambda row: emo.run(row, modelType=sentimentPattern, args=[], kwargs={})).to_json(orient='records')
                    chat_sentiment['switches_to_postive'] = getPositivieSwitches(df_chat_data['sentiment'].tolist())
                    jsonTemp['chat_sentiment'] = chat_sentiment
                    results.append(jsonTemp)
                '''
                for doc in records:
                    text = doc['lineText']
                    output = emo.run(text, modelType=sentimentPattern, args=[], kwargs={})
                    output['sessionId'] = doc['sessionId']
                    output['lineText'] = text
                    results.append(output)
                '''
                return HttpResponse(json.dumps(results, indent=5))
            except Exception as e:
                return HttpResponse("<h1 style='color: red'>Error Occurred:%s</h1>" % e)
        elif usecase.lower() == 'survey':
            ruleBaseFile = projectDir + "/config/ruleBase_v3.csv"
            try:
                sa = SurveyAnalysis()
                results = sa.run(text, ruleBaseFile=ruleBaseFile)
            except Exception as e:
                return HttpResponse("<h1 style='color: red'>Error Occurred:%s</h1>" % e)
        elif usecase.lower() == 'ltv':
            folderName = modelName
            folderName = baseDir + "/" + ''.join(folderName) + "/out"
            skl = SklCat()
            results = []
            jsonResults = {}
            try:
                for doc in records:
                    jsonTemp = {}
                    jsonTemp['sessionId'] = doc['sessionId']
                    df_chat_data = pd.read_json(json.dumps(doc['chat_data']))
                    if 'lineNum' not in df_chat_data.columns.tolist():
                        df_chat_data['lineNum'] = pd.Series(range(1, df_chat_data.shape[0] + 1))
                    df_chat_data = df_chat_data.ix[df_chat_data['lineBy'] == 1, :]  # filtering only agent lines
                    #ltv = df_chat_data['lineText'].apply(lambda x: skl.run_text_vectorizeTransform([str(x)], folderName=folderName, vectorizerFile='FeatureTransformer.p', pickler=None)).apply(lambda x: skl.run_text_execute_api_bulkDoc(x, folderName=folderName, modelFile='Classifier.p', pickler=None, modelType='classification', usecase='ltv'))
                    transformedXTest = skl.run_text_vectorizeTransform(df_chat_data['lineText'].tolist(),
                                                                       folderName=folderName,
                                                                       vectorizerFile='FeatureTransformer.p',
                                                                       pickler=None)
                    dfProb = skl.run_text_execute_api_bulkDoc(transformedXTest, folderName=folderName,
                                                              modelFile='Classifier.p', pickler=None,
                                                              modelType='classification', usecase='ltv')
                    #dfTest = df_chat_data.ix[df_chat_data['ltv'].apply(lambda x: len(x) != 0)]
                    if len(dfProb) != 0:
                        dfProb['lineNum'] = pd.Series(range(1, dfProb.shape[0] + 1))
                        threshold = 0.2
                        dfMerged = pd.merge(df_chat_data, dfProb, how='left', left_on='lineNum',
                                            right_on='lineNum', suffixes=('_x', '_y'))
                        dfMerged = dfMerged.ix[:, ['lineNum', 'y_pred', 'y_prob_max']]
                        dfMerged['flag'] = dfMerged.apply(
                            lambda row: row['y_pred'] == 1 and row['y_prob_max'] > threshold, axis=1)
                        ltv_agg = 1 if dfMerged['flag'].any() else 0
                        jsonTemp['chat_ltv'] = {'category': ltv_agg}
                        jsonObj = dfMerged.to_json(orient='records')
                        each_line = json.loads(json.dumps(jsonObj))
                        jsonTemp['each_line'] = each_line
                    results.append(jsonTemp)
                return HttpResponse(json.dumps(results))
            except Exception as e:
                return HttpResponse("<h1 style='color: red'>Error Occurred:%s</h1>" % e)
        else:
            return HttpResponse("<h1 style='color: red'>Please provide the proper usecase..</h1>")
        #return HttpResponse(json.dumps(results))
    else:
        return HttpResponse("<h1>Please provide the modelId in URL</h1>")
def get_city_thresholds(self):
    for city in self.senitiment_cities:
        city_sentiment = Sentiment(city, self.to_tokenize, self.document_type)
        city_sentiment.run()
def get_word_frequency(self):
    for city in self.senitiment_cities:
        city_sentiment = Sentiment(city, self.to_tokenize, self.document_type)
        city_sentiment.get_word_frequency('biomass')
def get_corpus_size(self):
    total_number_of_docs = 0
    for city in self.senitiment_cities:
        city_sentiment = Sentiment(city, self.to_tokenize, self.document_type)
        total_number_of_docs += len(city_sentiment.filenames)
    print(total_number_of_docs)
from twython import Twython
from twython import TwythonStreamer
from Sentiment import Sentiment
import time

# PLEASE REPLACE THESE WITH YOUR DEV ACCOUNT
APP_KEY = '5uPcZuLaqQed6cZ87HoJpQ'
APP_SECRET = 'khSFegHa2nIzsymI80x1UqlAUzE0tr9zbyYDq8U1eQ'
OAUTH_TOKEN = '322195502-UxtFta4eNP6g12ZKrnqIQvnn1tPf5wim0HvYGu1w'
OAUTH_TOKEN_SECRET = '19fQ7S3LDv6lkIDiGGMlBUeVvOP7Ct08QxhmpnELIorwU'

AFINN_FILE = "AFINN-111.txt"
happyCalc = Sentiment(AFINN_FILE)
sentimentScore = 0


class MyTwythonStreamer(TwythonStreamer):
    myTweets = []

    def on_success(self, data):
        if 'text' in data:
            tweet = data['text'].encode('utf-8')
            self.add_tweets(tweet)
            print tweet
            self.disconnect()

    def on_error(self, status_code, data):
        print status_code
        print data
from Sentiment import Sentiment

sentiment = Sentiment()
model = sentiment.train_model(load_data=True, nb_epoch=1, old_weight_path="output/weights.13-0.20.hdf5")
tests = ['发货太慢了,商家服务太差.', '脑白金强势登陆央视']
labels = sentiment.predict_label(model, tests)
print(labels)
# Add spacy word analyzer
import spacy
from spacy.tokens import Token
nlp = spacy.load('en_core_web_sm')
from Sentiment import Sentiment, News
import glob, os, sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from ReadData import ALPACA_REST, runTicker, ConfigTable, ALPHA_TIMESERIES, GetTimeSlot, SQL_CURSOR
from alpaca_trade_api.rest import TimeFrame
import pmdarima as pmd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

s = Sentiment()
debug = False

# create sentiment analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

import statsmodels.api as sm1
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# univariate stacked lstm example
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.utils import plot_model

api = ALPACA_REST()
inputTxt = 'Honest Company reports Q1 EPS (13c) vs. 1c last year'
inputTxt = 'Lennar reports Q2 adjusted EPS $2.95, consensus $2.36'
def ImpactTraining(docPath, lexPath, lexiconID):
    """
    Final score of the review is calculated as follows:
    (Score1*Multiplier1 + Score2*Multiplier2 ... ScoreN*MultiplierN) * BaseMultiplier = ReviewScore
    Ignoring BaseMultiplier for this training, assuming it has minimal impact (TODO: test this impact)
    ScoreK*MultiplierK/ReviewScore * 100 = PercentImpact (impact %age of word K on the final score)
    TotalAdjustment = Expected Score - FinalScore
    AdjustmentK = PercentImpact of TotalAdjustment (total adjustment needed for word K)
    Adjustment on word K for this review = AdjustmentK/MultiplierK
    Take the mean of all adjustments and apply it to the lexicon to get the new lexicon.
    Repeat the process until accuracy stops improving.
    """
    oldAccuracy = 0
    oldAngel = None
    se = PerformanceTest(lexPath, docPath)
    while True:
        adjustments = defaultdict(list)
        newAngel = Angel(se.lexicon, smallReviews=True)
        expectedSentiment, predictedOverall = [], []
        se.ResetIterator()
        while True:
            try:
                sentences, expectedLabel, notCount, docId = se.NextElement()
                expectedSentiment.append(expectedLabel)
                predictedScore = newAngel.PredictReviewScore(sentences, expectedLabel)
                predictedLabel = Sentiment.GetSentimentClass(predictedScore)
                predictedOverall.append(predictedLabel)
                if oldAngel is not None:
                    oldPredictedLabel = Sentiment.GetSentimentClass(
                        oldAngel.PredictReviewScore(sentences, expectedLabel))
                    if oldPredictedLabel != predictedLabel:
                        oldAngel.DumpDetails(sentences, expectedLabel)
                        newAngel.DumpDetails(sentences, expectedLabel)
                totalImpact, impactTable = newAngel.GetImpact(sentences)
                if totalImpact == 0:
                    continue
                totalAdjustment = expectedLabel * 10 - predictedScore
                for word, (wordScore, multiplier) in impactTable.iteritems():
                    if multiplier == 0:
                        continue
                    wordAdjustment = ((wordScore / totalImpact) * totalAdjustment) / multiplier
                    if wordAdjustment != 0:
                        adjustments[word].append(wordAdjustment)
            except StopIteration:
                break
        newAccuracy = util.accuracy(predictedOverall, expectedSentiment)
        print "Accuracy:", oldAccuracy, "--->", newAccuracy
        if newAccuracy <= oldAccuracy:
            break
        for word in adjustments:
            se.lexicon[word] = str(float(se.lexicon[word]) + numpy.mean(adjustments[word]))
        oldAngel = newAngel
        oldAccuracy = newAccuracy
    filename = "../files/lexicons/" + lexiconID + ".csv"
    handle = open(filename, 'wb')
    wr = csv.writer(handle)
    for key, value in sorted(oldAngel.lexicon.items()):
        row = [key, value]
        wr.writerow(row)
    handle.close()
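The adjustment rule in the docstring above is easiest to see on toy numbers. The sketch below only illustrates that arithmetic: the words, scores, multipliers, and expected score are invented, adjust_lexicon_once is a hypothetical helper rather than part of Angel or PerformanceTest, and the real GetImpact may compute impact shares slightly differently.

# Toy sketch of the per-word adjustment formula from the ImpactTraining docstring.
# All names and numbers here are illustrative assumptions, not project code.
def adjust_lexicon_once(impact_table, expected_score):
    # impact_table maps word -> (score, multiplier); the review score is the
    # sum of score * multiplier over all words (BaseMultiplier ignored).
    review_score = sum(score * mult for score, mult in impact_table.values())
    total_adjustment = expected_score - review_score
    adjustments = {}
    for word, (score, mult) in impact_table.items():
        if mult == 0:
            continue
        impact_share = (score * mult) / review_score  # fraction of the final score owed to this word
        adjustments[word] = impact_share * total_adjustment / mult
    return adjustments

# ReviewScore = 2.0*1.5 + (-1.0)*1.0 = 2.0; TotalAdjustment = 10 - 2 = 8
# "great" owns 150% of the score, so it gets 1.5 * 8 / 1.5 = +8.0; "slow" gets -0.5 * 8 / 1.0 = -4.0
print(adjust_lexicon_once({"great": (2.0, 1.5), "slow": (-1.0, 1.0)}, expected_score=10))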
def PerformTest(self):
    """
    This method loads the test data file, and tests how good the prediction is.
    It also prints the precision, recall and F1 scores.
    """
    angel = Angel(self.lexicon, True)
    angel.SetDumpParameters(7, -7)
    posx, negx, neutx, accx = 0, 0, 0, 0
    maxnegf1 = maxneutf1 = maxposf1 = maxacc = 0
    for threshold in range(1, 0, -1):
        predictedOverall = []
        expectedSentiment = []
        demons = TotPos = TotNeg = TotNeut = 0
        while True:
            try:
                sentences, label, notCount, docId = self.NextElement()
                if not sentences:
                    continue
                if label == 'NULL':
                    break
                label = int(label)
                expectedSentiment.append(label)
                predicted = angel.PredictReviewScore(sentences, label)
                predictedOverall.append(Sentiment.GetSentimentClass(predicted, threshold))
                if label == Sentiment.POSITIVE:
                    TotPos += 1
                elif label == Sentiment.NEGATIVE:
                    TotNeg += 1
                else:
                    TotNeut += 1
                if angel.DumpRequested(predicted, label):
                    print "ID", docId, "\n"
                    demons += 1
            except StopIteration:
                break
        print "Demons:", demons
        pos_prec = util.precision_with_class(predictedOverall, expectedSentiment, 1)
        neg_prec = util.precision_with_class(predictedOverall, expectedSentiment, -1)
        neut_prec = util.precision_with_class(predictedOverall, expectedSentiment, 0)
        pos_rec = util.recall_with_class(predictedOverall, expectedSentiment, 1)
        neg_rec = util.recall_with_class(predictedOverall, expectedSentiment, -1)
        neut_rec = util.recall_with_class(predictedOverall, expectedSentiment, 0)
        pos_f1 = util.f1_with_class(predictedOverall, expectedSentiment, 1)
        neg_f1 = util.f1_with_class(predictedOverall, expectedSentiment, -1)
        neut_f1 = util.f1_with_class(predictedOverall, expectedSentiment, 0)
        accuracy = util.accuracy(predictedOverall, expectedSentiment)
        print "Current Positive stats (", threshold, "): ", "\t", '{:.2%}'.format(pos_prec), \
            "\t", '{:.2%}'.format(pos_rec), "\t", '{:.2%}'.format(pos_f1)
        print "Current Negative stats (", threshold, "): ", "\t", '{:.2%}'.format(neg_prec), "\t", \
            '{:.2%}'.format(neg_rec), "\t", '{:.2%}'.format(neg_f1)
        print "Current Neutral stats (", threshold, "): ", "\t", '{:.2%}'.format(neut_prec), "\t", \
            '{:.2%}'.format(neut_rec), "\t", '{:.2%}'.format(neut_f1)
        cprint("Current Accuracy ( " + str(threshold) + " ):\t\t\t" + '{:.2%}'.format(accuracy), 'red')
        if pos_f1 > maxposf1:
            maxposf1 = pos_f1
            posx = threshold
        if neg_f1 > maxnegf1:
            maxnegf1 = neg_f1
            negx = threshold
        if neut_f1 > maxneutf1:
            maxneutf1 = neut_f1
            neutx = threshold
        if accuracy > maxacc:
            maxacc = accuracy
            accx = threshold
    print "Maximum Positive F1: ", '{:.2%}'.format(maxposf1), "at", posx
    print "Maximum Negative F1: ", '{:.2%}'.format(maxnegf1), "at", negx
    print "Maximum Neutral F1: ", '{:.2%}'.format(maxneutf1), "at", neutx
    cprint("Maximum Accuracy: " + '{:.2%}'.format(maxacc) + " at " + str(accx), 'red')
# Add spacy word analyzer
import spacy
from spacy.tokens import Token
nlp = spacy.load('en_core_web_sm')
from Sentiment import Sentiment, News
import glob, os, sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from ReadData import ALPACA_REST, runTicker, ConfigTable, ALPHA_TIMESERIES, GetTimeSlot, SQL_CURSOR
from alpaca_trade_api.rest import TimeFrame
from statsmodels.tsa.arima.model import ARIMA
import pmdarima as pmd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

s = Sentiment()
debug = False

# create sentiment analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

import statsmodels.api as sm1
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import AnaSignal

api = ALPACA_REST()
inputTxt = 'Honest Company reports Q1 EPS (13c) vs. 1c last year'
#inputTxt='Lennar reports Q2 adjusted EPS $2.95, consensus $2.36'
#inputTxt='Cognyte reports Q1 EPS (20c), consensus (15c)'
#inputTxt='Brookdale Senior Living resumed with a Buy at Stifel'
#inputTxt='Anglo American price target raised to 3,670 GBp from 3,500 GBp at Morgan Stanley'
#inputTxt='GMS Inc. reports Q4 adjusted EPS $1.07, consensus 82c'
##inputTxt='CalAmp reports Q1 adjusted EPS 8c, consensus 7c'
def getSentiments(self, path):
    dirs = os.listdir(path)
    for file in dirs:
        filename = path + file
        self.positive.clear()
        self.negative.clear()
        self.neutral.clear()
        self.compound.clear()
        self.type.clear()
        self.text.clear()
        self.score.clear()
        self.originalScore.clear()
        self.calculatedScore.clear()
        self.formatedText.clear()
        self.correctScore = 0
        print(filename)
        count = 0
        with open(filename) as csvfile:
            print("here")
            spamreader = csv.reader(csvfile, delimiter=',')
            for row in spamreader:
                count = count + 1
                if count > 1:
                    self.type.append(row[0])
                    self.text.append(row[1])
                    self.score.append((row[2]))
        for i in range(len(self.text)):
            findSentimentText = self.text[i]
            #print(findSentimentText)
            findSentimentText = encoding.smart_str(findSentimentText, encoding='ascii', errors='ignore')
            findSentimentText = findSentimentText.lower()
            findSentimentText = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', findSentimentText)
            findSentimentText = re.sub('@[^\s]+', 'AT_USER', findSentimentText)
            findSentimentText = re.sub('[\s]+', ' ', findSentimentText)
            findSentimentText = re.sub(r'#([^\s]+)', r'\1', findSentimentText)
            findSentimentText = findSentimentText.strip('\'"')
            findSentimentText = re.sub('\\\[^\s]+', 'special_symbol', findSentimentText)
            findSentimentText = re.sub('\\\[^\s]+', 'special_symbol', findSentimentText)
            sentiment = Sentiment()
            scoreCal = sentiment.getSentimentNLTK(findSentimentText)
            self.positive.append(scoreCal[2])
            self.negative.append(scoreCal[1])
            self.neutral.append(scoreCal[0])
            self.compound.append(scoreCal[3])
            if (scoreCal[3] > 0.5):
                self.calculatedScore.append(1)
            else:
                self.calculatedScore.append(-1)
        self.calculateSentiment()
        total = len(self.score)
        print(total)
        print(self.correctScore)
        accuracy = (self.correctScore / float(total))
        print(accuracy)