import pandas as pd

# Sentiment and Region are project-local classes (their imports were omitted
# in the original snippet).


def test():
    s = Sentiment()
    r = Region(
        "/Users/aaronhe/Documents/NutStore/Aaron He/FDU/Big-Data-Communication/Stereotype-Analysis-in-NetEase-News-Comments/Dict/region_dict/region.txt"
    )
    # Build sample input: [comment text, commenter's source region]
    text = [
        # "Chaoshan people are handsome, and Hubei people are good at
        # business!" -- posted from Shanghai
        ["潮汕人很帅,湖北人挺会做生意的!", "上海"],
        # "Dude, awesome!" -- posted from Chongqing
        ["老铁牛逼!", "重庆"],
        # "I think it tastes great" -- posted from Beijing
        ["我觉得很好吃啊", "北京"],
    ]
    df = pd.DataFrame(text, columns=["text", "src"])
    print(df.head())
    # Batch-add region columns to the DataFrame
    df = r.region_detect(df, on=["text"])
    print(
        s.sentiment_detect(df,
                           on=["text"],
                           srcs=["src"],
                           dists=["region_1", "region_2", "region_3"]))
    # Print the recorded sentiment entries for source region 北京 (Beijing)
    print(s.output_record(src="北京"))
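# Reading of the calls above (inferred from the argument names, not from the
# Sentiment source): region_detect fills region_1..region_3 with regions
# mentioned in "text", and sentiment_detect then records the sentiment that
# the commenter's source region ("src") expresses toward each detected region.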
def main():
    myData = Data()
    evaluate = Sentiment()
    companies = myData.getCompanies()
    myScraper = Scraper(companies)
    url = str(input("Input a newspaper url: "))
    # url = "https://www.marketwatch.com/"
    # url = "https://www.cnbc.com/investing/"
    # url = "https://www.ccn.com/"
    myScraper.getArticleLinks(url)
    # Keep the five companies with the most article mentions
    occurrences = myScraper.filterByCompany()
    occurrences = occurrences[:5]
    while True:
        for occurrence in occurrences:
            name = occurrence[0]
            num = occurrence[1]
            if num > 0:
                print(name + " has " + str(num) + " articles")
        company = str(input("Which company are you interested in: "))
        headlines = myScraper.findRelatedArticles(company)
        print("We found these articles from the past 7 days: ")
        i = 1
        for headline in headlines:
            print(str(i) + " - " + headline)
            i += 1
        interest = int(input("Which article would you like to analyze? "))
        article = myScraper.relatedArticles[interest - 1]
        cleanArticle = myScraper.parseArticle(article.url)
        print("The stats of the article are: ")
        evaluate.rankSentenceScores(cleanArticle, myData.getBasicDictionary())
import os
import sys

import pandas as pd

# Sentiment is the project-local analyzer class (import omitted in the
# original snippet).


def main():
    # Load the data
    date = sys.argv[1]
    path_prefix = "./new_data"
    df = pd.read_pickle(
        os.path.join(path_prefix, "%s_select_comments.p" % date))
    # Load the model
    s = Sentiment()
    df = s.sentiment_detect(df,
                            on=["content"],
                            srcs=["province"],
                            dists=["region_1", "region_2", "region_3"])
    df_freq = s.table_record()
    # Save the results
    df.to_pickle(os.path.join(path_prefix, "%s_sentiment.p" % date))
    df_freq.to_pickle(os.path.join(path_prefix, "%s_senti_freq.p" % date))
    print(df)
    print(df_freq)
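# Example invocation (the script name is hypothetical): assuming this main()
# is saved as run_sentiment.py,
#   python run_sentiment.py 2021-06-01
# reads ./new_data/2021-06-01_select_comments.p and writes
# ./new_data/2021-06-01_sentiment.p and ./new_data/2021-06-01_senti_freq.p.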
def __init__(self, path, interval="month"):
    # Parse the exported Facebook chat HTML
    with open(path, 'r') as f:
        self.__soup = BeautifulSoup(f.read(), "html.parser")
    self.messages = []
    self.name = self.__soup.find("title").text.replace(
        "Conversation with ", "")
    # Collect the timestamp metadata from every message header
    message_headers = self.__soup.find_all("div", class_="message_header")
    self.__span_meta = [
        m.find("span", class_="meta").text for m in message_headers
    ]
    self.__fbt = FbTime(self.__span_meta)
    # Build one Message per message div: sender, bucketed date, raw
    # timestamp, and the message body (the div's next sibling)
    for m in self.__soup.find_all("div", class_="message"):
        span = m.find("span", class_="meta")
        self.messages.append(
            Message(
                m.find("span", class_="user").text,
                self.__fbt.span_meta_to_date(span.text, interval),
                span.text, m.next_sibling.text))
    self.__sent = Sentiment(self.messages, self.__fbt)
    self.participants = self.__scrape_participants()
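# Hypothetical usage of the constructor above, assuming it belongs to a
# Conversation-style class that wraps an exported Facebook chat HTML file:
#   conv = Conversation("messages/conversation.html", interval="month")
#   print(conv.name, len(conv.messages), conv.participants)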
def getSentiments(self, path):
    dirs = os.listdir(path)
    for file in dirs:
        filename = path + file
        # Reset the per-file state
        self.positive.clear()
        self.negative.clear()
        self.neutral.clear()
        self.compound.clear()
        self.type.clear()
        self.text.clear()
        self.score.clear()
        self.originalScore.clear()
        self.calculatedScore.clear()
        self.formatedText.clear()
        self.correctScore = 0
        print(filename)
        count = 0
        with open(filename) as csvfile:
            spamreader = csv.reader(csvfile, delimiter=',')
            for row in spamreader:
                count += 1
                if count > 1:  # skip the header row
                    self.type.append(row[0])
                    self.text.append(row[1])
                    self.score.append(row[2])
        for i in range(len(self.text)):
            # Normalize the raw text before scoring: strip non-ASCII bytes
            # (encoding.smart_str presumably comes from django.utils.encoding),
            # lowercase, and collapse URLs, @-mentions, whitespace, hashtags,
            # and backslash escapes into placeholder tokens
            findSentimentText = self.text[i]
            findSentimentText = encoding.smart_str(findSentimentText,
                                                   encoding='ascii',
                                                   errors='ignore')
            findSentimentText = findSentimentText.lower()
            findSentimentText = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))',
                                       'URL', findSentimentText)
            findSentimentText = re.sub(r'@[^\s]+', 'AT_USER',
                                       findSentimentText)
            findSentimentText = re.sub(r'[\s]+', ' ', findSentimentText)
            findSentimentText = re.sub(r'#([^\s]+)', r'\1', findSentimentText)
            findSentimentText = findSentimentText.strip('\'"')
            findSentimentText = re.sub(r'\\[^\s]+', 'special_symbol',
                                       findSentimentText)
            sentiment = Sentiment()
            scoreCal = sentiment.getSentimentNLTK(findSentimentText)
            self.positive.append(scoreCal[2])
            self.negative.append(scoreCal[1])
            self.neutral.append(scoreCal[0])
            self.compound.append(scoreCal[3])
            # Label as positive when the compound score exceeds 0.5
            if scoreCal[3] > 0.5:
                self.calculatedScore.append(1)
            else:
                self.calculatedScore.append(-1)
        self.calculateSentiment()
        total = len(self.score)
        print(total)
        print(self.correctScore)
        accuracy = self.correctScore / float(total)
        print(accuracy)
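# A minimal sketch of what getSentimentNLTK above might look like, assuming
# the Sentiment class wraps NLTK's VADER analyzer. The (neutral, negative,
# positive, compound) tuple ordering is inferred from how scoreCal is indexed
# in getSentiments; the real implementation lives in the project's Sentiment
# module. Requires nltk.download('vader_lexicon') on first use.
from nltk.sentiment.vader import SentimentIntensityAnalyzer


class Sentiment:
    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()

    def getSentimentNLTK(self, text):
        # polarity_scores returns {'neg': ..., 'neu': ..., 'pos': ...,
        # 'compound': ...}
        scores = self.analyzer.polarity_scores(text)
        return (scores['neu'], scores['neg'], scores['pos'],
                scores['compound'])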
# Add spaCy word analyzer
import spacy
from spacy.tokens import Token

nlp = spacy.load('en_core_web_sm')

from Sentiment import Sentiment, News
import glob, os, sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from ReadData import (ALPACA_REST, runTicker, ConfigTable, ALPHA_TIMESERIES,
                      GetTimeSlot, SQL_CURSOR)
from alpaca_trade_api.rest import TimeFrame
import pmdarima as pmd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

s = Sentiment()
debug = False

# Create the VADER sentiment analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

import statsmodels.api as sm1
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Univariate stacked LSTM example
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.utils import plot_model

api = ALPACA_REST()
inputTxt = 'Honest Company reports Q1 EPS (13c) vs. 1c last year'
# The second sample headline overrides the first
inputTxt = 'Lennar reports Q2 adjusted EPS $2.95, consensus $2.36'
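# A quick check of the VADER analyzer set up above on the sample headline;
# polarity_scores is the standard NLTK API and returns neg/neu/pos/compound.
scores = sid.polarity_scores(inputTxt)
print(scores)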
def get_corpus_size(self):
    # Count documents across every city's corpus
    total_number_of_docs = 0
    for city in self.senitiment_cities:
        city_sentiment = Sentiment(city, self.to_tokenize,
                                   self.document_type)
        total_number_of_docs += len(city_sentiment.filenames)
    print(total_number_of_docs)
def get_word_frequency(self):
    # Report how often 'biomass' appears in each city's corpus
    for city in self.senitiment_cities:
        city_sentiment = Sentiment(city, self.to_tokenize,
                                   self.document_type)
        city_sentiment.get_word_frequency('biomass')
def get_city_thresholds(self):
    # Run the sentiment analysis for each city
    for city in self.senitiment_cities:
        city_sentiment = Sentiment(city, self.to_tokenize,
                                   self.document_type)
        city_sentiment.run()
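# Hypothetical driver for the three corpus methods above, assuming they belong
# to a class constructed with a city list, a tokenize flag, and a document
# type (names inferred from the attributes used above):
#   analysis = CorpusAnalysis(["chicago", "denver"], to_tokenize=True,
#                             document_type="news")
#   analysis.get_corpus_size()
#   analysis.get_word_frequency()
#   analysis.get_city_thresholds()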
from Sentiment import Sentiment

sentiment = Sentiment()
# Fine-tune for one epoch starting from previously saved weights
model = sentiment.train_model(load_data=True,
                              nb_epoch=1,
                              old_weight_path="output/weights.13-0.20.hdf5")
tests = [
    # "Shipping is too slow, and the seller's service is terrible."
    '发货太慢了,商家服务太差.',
    # "Naobaijin makes a strong debut on CCTV"
    '脑白金强势登陆央视',
]
labels = sentiment.predict_label(model, tests)
print(labels)
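# predict_label is expected to return one sentiment label per test string
# (negative for the first, positive for the second); the exact label format
# depends on the project's Sentiment implementation and the loaded weights.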