def sa_hiv4(txt):
    """Score *txt* with the Harvard IV-4 dictionary, echo a preview, return the score dict."""
    analyzer = ps.HIV4()
    # The text could be tokenized other ways, but the HIV4 dictionary is
    # preprocessed with the library's default tokenizer, so we use that one.
    score = analyzer.get_score(analyzer.tokenize(txt))
    print(f"{txt[:50]}...{txt[-50:]}\n{score}")
    return score
def get_score_HIV4(html):
    """ Uses the HIV4 dictionary for sentiment analysis """
    analyzer = ps.HIV4()
    tokens = analyzer.tokenize(html)
    return analyzer.get_score(tokens)
def mapper_init(self):
    """Set up per-mapper state: the HIV4 sentiment analyzer and an S3 bucket handle."""
    sys.stderr.write("Initializing module...\n")
    self.hiv4 = ps.HIV4()
    sys.stderr.write("--> Initialized sentimentor\n")
    # Open the S3 connection once per mapper; credentials/bucket come from module config.
    s3 = boto.connect_s3(AWS_ACCESS, AWS_SECRET)
    self.bucket = s3.get_bucket(BUCKET)
    sys.stderr.write("--> Created AWS connection\n")
def get_twitter_sentiment(ticker, company, auth_keys):
    """Collect the last week of tweets about *company* (falling back to
    *ticker* if the first search fails), clean them, and return the HIV4
    sentiment score dict.

    auth_keys: mapping holding 'ckey', 'csecret', 'atoken', 'asecret'
    (OAuth tokens kept in config.ini so they stay out of the source).
    """
    # Fetch OAuth tokens from config.ini to secure them.
    auth = tweepy.OAuthHandler(auth_keys.get('ckey'), auth_keys.get('csecret'))
    auth.set_access_token(auth_keys.get('atoken'), auth_keys.get('asecret'))
    api = tweepy.API(auth)

    today = datetime.today()
    until = today.strftime("%Y-%m-%d")
    since = (today - timedelta(days=7)).strftime("%Y-%m-%d")

    # Accumulate into a list so tweets gathered before an API error survive.
    lines = []
    try:
        _collect_tweets(api, company, since, until, lines)
    except Exception:
        # Best-effort: on rate limits / API errors back off, then retry with
        # the ticker symbol instead of the company name.
        time.sleep(10)
        try:
            _collect_tweets(api, ticker, since, until, lines)
        except Exception:
            time.sleep(10)

    hiv4 = ps.HIV4()
    tokens = hiv4.tokenize(''.join(lines))
    return hiv4.get_score(tokens)


def _collect_tweets(api, query, since, until, out):
    """Append the cleaned text of each English tweet matching *query* to *out*."""
    mark = '!!'  # cleaning mark: lets a leading 'RT ' be stripped exactly once
    for tweet in tweepy.Cursor(api.search, q=query, since=since, until=until,
                               lang="en").items():
        cleaned = (mark + tweet.text).replace(mark + 'RT ', '')
        # Drop URLs: "http" matches literal characters, \S+ matches all
        # non-whitespace characters (the rest of the url).
        out.append(re.sub(r"http\S+", "", cleaned) + '\n')
def getallsentiment(self, tweets):
    """Score each tweet with HIV4 and return the list of non-neutral score dicts.

    Fix: the HIV4 analyzer (which loads its dictionary) was being constructed
    once per tweet inside the loop; it is loop-invariant, so build it once.
    """
    analyzer = ps.HIV4()
    scores = []
    for tweet in tweets:
        score = analyzer.get_score(analyzer.tokenize(tweet.text))
        # Keep only tweets with some measurable polarity.
        if score['Polarity'] != 0:
            scores.append(score)
    return scores
def RunSentimentAnalysis():
    """Run HIV4 sentiment analysis over every property row in the database."""
    analyzer = ps.HIV4()
    database = Database()
    property_ids = database.runSqlQueryColumn(u"select id from property;")
    for property_id in property_ids:
        Property(database, property_id).analyseSentiment(analyzer)
def get_news_sentiment(ticker):
    """Scrape news summary bullets for *ticker* and return their HIV4 score dict.

    Fix: the Firefox instance leaked if fetching or parsing raised; the
    browser is now closed in a ``finally`` block.

    NOTE(review): the live URL is a hard-coded simulation page, so *ticker*
    currently only gets upper-cased — the commented seekingalpha URL below is
    where it was meant to be used.
    """
    ticker = ticker.upper()
    hiv4 = ps.HIV4()
    # -- Setup: headless Firefox session.
    options = Options()
    options.add_argument("--headless")
    browser = webdriver.Firefox(firefox_options=options)
    try:
        # -- Parse
        #browser.get("https://seekingalpha.com/symbol/" + ticker + "/analysis-and-news?analysis_tab=focus&news_tab=news-all")
        browser.get("https://simulationstock.000webhostapp.com/MSFT.html")
        soup = BeautifulSoup(browser.page_source, "html5lib")
        # Concatenate up to 7 news blocks' summary spans into one text blob.
        x = ''
        for div_tag in soup.find_all(
                'div', attrs={"class": "mc_list_texting right bullets"}, limit=7):
            for span_tag in div_tag.find_all(
                    'span', attrs={"class": "general_summary light_text bullets"}):
                x = x + span_tag.text
        print(x)
        tokens = hiv4.tokenize(x)
        return hiv4.get_score(tokens)
    finally:
        browser.close()
# Plot most frequent words.
dictshow = SequenceSelection(dictionary=dict2, length=7, startindex=0)
plt.figure()  # dedicated figure so later plots don't draw on top of this one
n = range(len(dictshow))
plt.bar(n, dictshow.values(), align='center')
plt.xticks(n, dictshow.keys())
plt.title("Most frequent Words")
plt.savefig("FrequentWords.png", transparent=True)

# Overview of the top-400 word frequencies.
overview = SequenceSelection(dictionary=dict2, length=400, startindex=0)
plt.figure()  # fix: without a new figure the overview bars overlaid the chart above
nOverview = range(len(overview.keys()))
plt.bar(nOverview, overview.values(), color="g", tick_label="")
plt.title("Word Frequency Overview")
plt.xticks([])
plt.savefig("overview.png", transparent=True)

# Sentiment analysis with the Harvard IV-4 dictionary.
hiv4 = ps.HIV4()
tokens = hiv4.tokenize(cleantext)
score = hiv4.get_score(tokens)
print(score)
# Polarity
# Formula: (Positive - Negative)/(Positive + Negative)
# Subjectivity
# Formula: (Positive + Negative)/N
def sentiment(text):
    """Return the HIV4 sentiment score dict for *text*.

    Fix: ``get_score`` expects a token list (every sibling call site
    tokenizes first); passing the raw string made it iterate characters.
    """
    hiv4 = ps.HIV4()
    tokens = hiv4.tokenize(text)
    return hiv4.get_score(tokens)
def mapper_init(self):
    """One-time mapper setup: build the HIV4 sentiment analyzer on self."""
    sys.stderr.write("Initializing module...\n")
    # Loading the HIV4 dictionary is expensive; do it once per mapper, not per record.
    self.hiv4 = ps.HIV4()
    sys.stderr.write("--> Initialized sentimentor\n")
def func():
    """Poll the Google News (PK edition) business RSS feed, clean and
    lemmatize each not-yet-seen article, classify it against stock keyword
    lists, and record an HIV4 polarity score in the module-level ``scores``
    dict keyed by article title.

    Relies on module-level state: ``titles``, ``titles_urls``, ``scores``.
    Python 2 code (print statements, ``ur""`` literals).
    """
    feed = feedparser.parse(
        "https://news.google.com/news/rss/headlines/section/topic/BUSINESS.en_pk/Business?ned=en_pk&hl=en&gl=PK"
    )
    # feed = feedparser.parse("http://feeds.feedburner.com/com/Yeor")
    wnl = WordNetLemmatizer()
    # feed_title = feed['feed']['title']
    # feed_entries = feed.entries
    for entry in feed.entries:
        article_title = (entry.title).encode("utf-8")
        article_link = (entry.link).encode("utf-8")
        article_published_at = entry.published
        article_title = article_title.lower()
        print article_title, article_link, article_published_at
        #### Title processing
        # title = [wnl.lemmatize(i, j[0].lower()) if j[0].lower() in ['a', 'n', 'v'] else wnl.lemmatize(i) for i, j in
        #          pos_tag(word_tokenize(article_title))]
        #
        # no_unicode = []
        #
        # for items in title:
        #     no_unicode.append(items.encode("utf-8"))
        # for words in title:
        #     no_unicode += " " + words.encode("utf-8")
        # print "Lemmatized"
        # print no_unicode
        # Skip articles that were already processed on an earlier pass.
        if article_title not in titles:
            # titles.append(article_title)
            # urls.append(article_link)
            titles.append(article_title)
            titles_urls.append(tuple([article_title, article_link]))
            # Download the article and extract its body text with Goose.
            extractor = Goose()
            article = extractor.extract(url=article_link)
            text = article.cleaned_text
            text = text.lower()
            stop_words = set(stopwords.words('english'))
            word_tokens = word_tokenize(text)
            filtered_sentence = [
                w for w in word_tokens if not w in stop_words
            ]
            # NOTE(review): this loop rebinds ``filtered_sentence`` from the
            # list above to the UTF-8 bytes of its *last* element only, so
            # the comprehension result is effectively discarded.  Looks
            # unintentional, but left untouched in this doc-only pass.
            for words in filtered_sentence:
                # filtered_sentence = words.encode('ascii', 'ignore')
                filtered_sentence = words.encode("utf-8")
                filtered_sentence = str(filtered_sentence)
            # Rebuild the cleaned text as one space-separated string of
            # non-stopword tokens.
            for w in word_tokens:
                if w not in stop_words:
                    # filtered_sentence.append(w)
                    filtered_sentence = filtered_sentence.encode("utf-8")
                    filtered_sentence += " " + w
            # words = set(nltk.corpus.words.words())
            # Strip everything except word characters, digits and whitespace.
            filtered_sentence = re.sub(ur"[^\w\d\s]+", '', filtered_sentence)
            print "Non-lemmatized News:"
            print filtered_sentence
            # POS-tag, then lemmatize with a POS hint for adjectives, nouns
            # and verbs; fall back to the default lemmatization otherwise.
            filtered_sentence = [
                wnl.lemmatize(i, j[0].lower())
                if j[0].lower() in ['a', 'n', 'v'] else wnl.lemmatize(i)
                for i, j in pos_tag(word_tokenize(filtered_sentence))
            ]
            no_unicode = []
            for items in filtered_sentence:
                no_unicode.append(items.encode("utf-8"))
            # for words in filtered_sentence:
            #     no_unicode += " " + words.encode("utf-8")
            hiv4 = ps.HIV4()
            # words = set(nltk.corpus.words.words())
            # all_clean = filtered_sentence
            all_clean = " ".join(x for x in no_unicode)
            # cleaned = " ".join(w for w in nltk.wordpunct_tokenize(all_clean) \
            #                    if w.lower() in words or not w.isalpha())
            # print all_clean
            # Keyword lists: general market terms plus one list per company.
            general = [
                " kse ", " psx ", "pakistan stock exchange",
                "karachi stock exchange", "pakistan stock market"
            ]
            keywords_pso = [" pso ", "pakistan state oil"]
            keywords_engro = [" engro ", "engro fertilizer"]
            keywords_hbl = [" hbl ", "habib bank"]
            keywords_ubl = [" ubl ", "unite bank"]
            keywords_ogdcl = [
                " ogdc ", " ogdcl ", "oil & gas development company",
                "oil and gas development company"
            ]
            # flag records which keyword list matched last:
            # 0 = irrelevant, 1 = general, 2 = PSO, 3 = HBL, 4 = OGDCL,
            # 5 = UBL, 6 = ENGRO.  A later match overwrites an earlier one.
            flag = 0
            # relevant = []
            for words in general:
                if words in all_clean:
                    flag = 1
                    # relevant.append(article_title)
            for words in keywords_pso:
                if words in all_clean:
                    flag = 2
                    # relevant.append(article_title)
            for words in keywords_hbl:
                if words in all_clean:
                    flag = 3
                    # relevant.append(article_title)
            for words in keywords_ogdcl:
                if words in all_clean:
                    flag = 4
                    # relevant.append(article_title)
            for words in keywords_ubl:
                if words in all_clean:
                    flag = 5
            for words in keywords_engro:
                if words in all_clean:
                    flag = 6
                    # relevant.append(article_title)
            # Score the relevant article and remember its polarity by title.
            if flag == 1:
                print "General Stock Market News"
                score = hiv4.get_score(all_clean.split())
                print "Cleaned News Below:"
                print all_clean
                print "Score Below:"
                print score
                scores[article_title] = score['Polarity']
            if flag == 2:
                print "News related to PSO"
                score = hiv4.get_score(all_clean.split())
                print "Cleaned News Below:"
                print all_clean
                print "Score Below:"
                print score
                scores[article_title] = score['Polarity']
            if flag == 3:
                print "News related to HBL"
                score = hiv4.get_score(all_clean.split())
                print "Cleaned News Below:"
                print all_clean
                print "Score Below:"
                print score
                scores[article_title] = score['Polarity']
            if flag == 4:
                print "News related to OGDCL"
                score = hiv4.get_score(all_clean.split())
                print "Cleaned News Below:"
                print all_clean
                print "Score Below:"
                print score
                scores[article_title] = score['Polarity']
            if flag == 5:
                print "News related to UBL"
                score = hiv4.get_score(all_clean.split())
                print "Cleaned News Below:"
                print all_clean
                print "Score Below:"
                print score
                scores[article_title] = score['Polarity']
            if flag == 6:
                print "News related to ENGRO"
                score = hiv4.get_score(all_clean.split())
                print "Cleaned and Lemmatized News Below:"
                print all_clean
                print "Score Dictionary Below:"
                print score
                scores[article_title] = score['Polarity']
            if flag == 0:
                print "News was irrelevant!"
            print titles
            print scores
            print "\n"
        else:
            print "Already Processed!\n"
    print "Exiting...\n"
# New Method

# In[16]:

import pysentiment as ps
from collections import Counter
import re
import random
from operator import truediv
import matplotlib.pyplot as plt
import operator
import pandas as pd
import datetime

#lm = ps.LM()
lm = ps.HIV4()


def generateDate(initialYear, initialMonth, initialDay, endY, endM, endD):
    """Return a list of datetimes for every day from the start date to the
    end date, inclusive of both endpoints.

    Fix: the start datetime was constructed twice and the list built with a
    manual append loop; behavior is unchanged.
    """
    initial = datetime.datetime(initialYear, initialMonth, initialDay)
    duration = (datetime.datetime(endY, endM, endD) - initial).days
    return [initial + datetime.timedelta(days=i) for i in range(duration + 1)]
def __init__(self):
    """Initialize empty lookup maps and the HIV4 sentiment dictionary."""
    self.stocks = {}               # per-stock data, filled elsewhere
    self.sectors = {}              # per-sector data, filled elsewhere
    self.user_classification = {}  # user-supplied classifications
    # NOTE(review): attribute name shadows the builtin `dict`; kept because
    # it is part of the instance's public interface.
    self.dict = ps.HIV4()
def sentiment(text):
    """Decode UTF-8 *text*, print its HIV4 sentiment score, and return it.

    Fix: the score was computed and printed but never returned; callers can
    now use the result (existing callers that ignored it are unaffected).
    """
    hiv4 = ps.HIV4()
    tokens = hiv4.tokenize(text.decode('utf-8'))
    score = hiv4.get_score(tokens)
    print (score)
    return score
def do_sentiment_scores(self, text):
    """Tokenize *text* with the HIV4 dictionary and return (tokens, score)."""
    analyzer = pysentiment.HIV4()
    tokens = analyzer.tokenize(text)
    return (tokens, analyzer.get_score(tokens))