def main(csv_file, txt_file, amount, name):
    with open(txt_file) as input_file:
        data = pd.read_csv(csv_file)
        rand_choices = random_sample(len(data), amount)
        new_df = data.loc[[i for i in rand_choices]]
        df = pd.DataFrame(new_df[new_df.columns[1:5]])
        df["vader_score_neg"] = ""
        df["vader_score_neu"] = ""
        df["vader_score_pos"] = ""
        df["vader_score_compound"] = ""
        start = [item for item in data[data.columns[2]]]
        end = [item for item in data[data.columns[3]]]
        articles = []
        container = list(input_file)
        for i, j in zip(start, end):
            articles.append(''.join(container[i:j]))
        for i in df.index.values:
            # score each sampled article once and reuse the result
            scores = vaderSentiment.sentiment(articles[i])
            df.loc[i, 'vader_score_neg'] = scores['neg']
            df.loc[i, 'vader_score_neu'] = scores['neu']
            df.loc[i, 'vader_score_pos'] = scores['pos']
            df.loc[i, 'vader_score_compound'] = scores['compound']
        df.to_csv(name + '_sentiment.csv')
def get_bigram_sentiment_distribution(bigrams):
    """
    Return the counts of (positive/neutral/negative, positive/neutral/negative)
    sentiment combinations over the given bigrams.
    """
    from vaderSentiment.vaderSentiment import sentiment
    from operator import itemgetter
    pos_pos = 0
    pos_neu = 0
    pos_neg = 0
    neu_pos = 0
    neu_neu = 0
    neu_neg = 0
    neg_pos = 0
    neg_neu = 0
    neg_neg = 0
    for tup in bigrams:
        word_one, word_two = tup
        vs_one = sentiment(word_one)
        del vs_one['compound']
        sent_one = sorted(vs_one.items(), key=itemgetter(1), reverse=True)[0][0]
        vs_two = sentiment(word_two)
        del vs_two['compound']
        sent_two = sorted(vs_two.items(), key=itemgetter(1), reverse=True)[0][0]
        if sent_one == 'pos':
            if sent_two == 'pos':
                pos_pos += 1
            elif sent_two == 'neu':
                pos_neu += 1
            else:
                pos_neg += 1
        elif sent_one == 'neu':
            if sent_two == 'pos':
                neu_pos += 1
            elif sent_two == 'neu':
                neu_neu += 1
            else:
                neu_neg += 1
        else:
            if sent_two == 'pos':
                neg_pos += 1
            elif sent_two == 'neu':
                neg_neu += 1
            else:
                neg_neg += 1
    return (pos_pos, pos_neu, pos_neg,
            neu_pos, neu_neu, neu_neg,
            neg_pos, neg_neu, neg_neg)
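# A minimal usage sketch for get_bigram_sentiment_distribution() above. The bigram
# list is made-up illustration data, not from the original project, and it assumes
# a vaderSentiment version that exposes a module-level sentiment() function, as the
# snippet's own import does.
if __name__ == '__main__':
    sample_bigrams = [('great', 'food'), ('terrible', 'service'), ('okay', 'place')]
    counts = get_bigram_sentiment_distribution(sample_bigrams)
    # counts unpacks in the same order the function returns them
    (pos_pos, pos_neu, pos_neg,
     neu_pos, neu_neu, neu_neg,
     neg_pos, neg_neu, neg_neg) = counts
    print counts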
def add_sentiment_comment_features():
    """
    Count positive/negative sentiment tokens and comments for each professor
    and store them as features '44'-'47' in the dataset collection.
    """
    from pymongo import MongoClient
    from bson.objectid import ObjectId
    from tqdm import tqdm
    from vaderSentiment.vaderSentiment import sentiment
    from RMP_words_funk import strip_punctuation
    from pdb import set_trace
    set_trace()
    rmpdb = MongoClient('mongodb://localhost:27017')['rmpdb']
    ds_profs_ten_over_cur = rmpdb['dataset_profs_five_over_less_ten'].find(
        {}, {'prof_id': 1}, no_cursor_timeout=True)
    for row in tqdm(ds_profs_ten_over_cur):
        id_string = row['prof_id']
        prof_comments = rmpdb['profs'].find_one(
            {'_id': ObjectId(id_string)},
            {'_id': 0, 'all comments.rComments': 1})
        tokens = list()
        for comment in prof_comments['all comments']:
            tokens.extend(strip_punctuation(comment['rComments']).split())
        insertion_dict = {'44': 0, '45': 0, '46': 0, '47': 0}
        for tok in tokens:
            vs = sentiment(tok.encode('utf-8'))
            if vs['pos'] > vs['neg']:
                insertion_dict['44'] += 1
            elif vs['neg'] > vs['pos']:
                insertion_dict['45'] += 1
            else:
                continue
        for comment in prof_comments['all comments']:
            vs = sentiment(comment['rComments'].encode('utf-8'))
            if vs['pos'] > vs['neg']:
                insertion_dict['46'] += 1
            elif vs['neg'] > vs['pos']:
                insertion_dict['47'] += 1
            else:
                continue
        result = rmpdb['dataset_profs_five_over_less_ten'].update_one(
            {'_id': row['_id']}, {'$set': insertion_dict})
def score():
    phrase = request.args.get('phrase')
    if phrase:
        return jsonify(sentiment(phrase))
    else:
        return jsonify({"error": "Phrase cannot be empty"})
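# A hedged client-side sketch for the score() handler above. It assumes the
# handler is registered on a Flask route such as '/score' and that the service
# runs on localhost:5000; neither detail appears in the snippet itself. The
# 'phrase' query parameter is the one the handler reads.
import requests

resp = requests.get('http://localhost:5000/score',
                    params={'phrase': 'VADER is smart, handsome, and funny.'})
print resp.json()  # e.g. {'neg': 0.0, 'neu': ..., 'pos': ..., 'compound': ...}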
def convert_to_rating(self, text):
    sentences = sent_tokenize(text)
    pos = 0
    neg = 0
    neu = 0
    # Finds number of positive, negative and neutral sentences
    for sentence in sentences:
        vs = sentiment(sentence)
        if vs['neg'] > 0.0 and vs['compound'] < 0.0:
            neg = neg + 1
        elif vs['neg'] > 0.0 and vs['compound'] > 0.0:
            if vs['pos'] > 0.0 and vs['neu'] > 0.0 and vs['compound'] > 0.5:
                pos = pos + 1
            elif vs['pos'] == 0.0 and vs['neu'] > 0.0:
                neu = neu + 1
        elif vs['neg'] == 0.0 and vs['compound'] > 0.0:
            if abs(vs['neu'] - vs['pos']) <= 0.3:
                pos = pos + 1
            else:
                neu = neu + 1
        else:
            neu = neu + 1
    # Obtains total number of positive and negative sentences
    total = pos + neg
    if total == 0:
        total = 1
    # Formula to convert text to rating on a scale of [1:5]
    rating = ((float(pos) / (total)) * 4.0) + 1
    return rating
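# Worked example of the rating formula in convert_to_rating(): with pos = 6
# positive and neg = 2 negative sentences, total = pos + neg = 8, so
# rating = (6 / 8.0) * 4.0 + 1 = 4.0; the result always lies in [1, 5].
assert ((float(6) / (6 + 2)) * 4.0) + 1 == 4.0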
def SaSentimentRSS(symbol):
    url = "http://seekingalpha.com/symbol/" + symbol + ".xml"
    url2 = "http://feeds.finance.yahoo.com/rss/2.0/headline?s=" + symbol + "&region=US&lang=en-US"
    url3 = "http://www.google.ca/finance/company_news?q=" + symbol + "&output=rss"
    # gets list of links from the RSS feeds above
    NewsURLs = getSaURL(url)
    NewsURLs += RSS_URL.getURLs2(url2)
    NewsURLs += RSS_URL.getURLs2(url3)
    # string to be written to file
    toBeWrittenToFile = ''
    for link in NewsURLs:
        try:
            # gets article portion of the html text
            a = Article(link)
            a.download()
            a.parse()
            # skip RSS title links, pages with no title, and pages that cannot be accessed
            if symbol in a.title and not 'Earnings Call Webcast' in a.title and not 'Stock Market Insights' in a.title and not '400 Bad Request' in a.title and not '403 Forbidden' in a.title and a.title != '':
                UnicodeArticle = a.text
                StringArticle = UnicodeArticle.encode('ascii', 'ignore')
                StrippedArticle = StringArticle.replace('\n', '')
                # skip articles shorter than 200 characters
                if len(StrippedArticle) > 200:
                    # remove ascii symbols
                    ArticleTitle = a.title.encode('ascii', 'ignore').replace(',', '')
                    # filters out irrelevant articles
                    if 'Transcript' not in ArticleTitle and 'Summary' not in ArticleTitle:
                        # vader sentiment dictionary for the article body
                        s = vaderSentiment.sentiment(StrippedArticle)
                        # skip articles with zero sentiment; collect a CSV row to be written to file
                        if s['compound'] != 0:
                            # print ArticleTitle
                            toBeWrittenToFile += (
                                str(symbol) + ',' + str(s['neg']) + ',' + str(s['neu']) + ',' +
                                str(s['pos']) + ',' + str(s['compound']) + ',' +
                                ArticleTitle + ',' + str(link) + '\n')
        except Exception as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print message
    # write the collected rows to file;
    # lock so that only one thread can write to the file at a time
    lock.acquire()
    try:
        myfile.write(toBeWrittenToFile)
    finally:
        lock.release()
def checkText(text):
    analyse = vaderSentiment.sentiment(text)
    compound = analyse["compound"]
    if compound < -0.5:
        return -1
    elif compound > 0.5:
        return 1
    else:
        return 0
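# Quick illustration of checkText()'s thresholds, assuming vaderSentiment is
# importable the way the snippet uses it: a compound score below -0.5 maps to -1,
# above 0.5 maps to 1, and anything in between maps to 0. The expected outputs
# are indicative, since the exact compound values depend on the VADER lexicon.
print checkText("I love this!")      # likely 1
print checkText("This is awful.")    # likely -1
print checkText("The sky is blue.")  # likely 0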
def topNTweets(tweets, n, posThresh):
    posScores = np.zeros(len(tweets) + 1)
    negScores = np.zeros(len(tweets) + 1)
    tweetTxt = [0] * (len(tweets) + 1)
    i = 0
    for t in tweets:
        posScores[i] = sentiment(tweets[i].id.encode("utf-8"))['pos']
        negScores[i] = sentiment(tweets[i].id.encode("utf-8"))['neg']
        tweetTxt[i] = tweets[i].id
        i += 1
    posScores = posScores[posScores > posThresh]
    negScores = negScores[negScores < 0.2]
    topNIndices = np.argsort(posScores)[::-1][:n]
    # topNIndices = np.argsort(negScores)[0:n-1]
    topN = np.array(tweetTxt)[topNIndices]
    return topN
def SentimentRSS(symbol):
    url = "http://feeds.finance.yahoo.com/rss/2.0/headline?s=" + symbol + "&region=US&lang=en-US"
    # gets list of links from above RSS feed
    NewsURLs = RSS_URL.getURLs2(url)
    # string to be written to file
    toBeWrittenToFile = ''
    for link in NewsURLs:
        try:
            # gets article portion of the html text
            a = Article(link)
            a.download()
            a.parse()
            # skip RSS title links, pages with no title, and pages that cannot be accessed
            if not 'Stock - Yahoo! Finance' in a.title and not '400 Bad Request' in a.title and not '403 Forbidden' in a.title and a.title != '':
                UnicodeArticle = a.text
                StringArticle = UnicodeArticle.encode('ascii', 'ignore')
                StrippedArticle = StringArticle.replace('\n', '')
                # remove ascii symbols
                ArticleTitle = a.title.encode('ascii', 'ignore').replace(',', '')
                # vader sentiment dictionary for the article body
                s = vaderSentiment.sentiment(StrippedArticle)
                # skip articles with zero sentiment; collect a CSV row to be written to file
                if s['compound'] != 0:
                    print ArticleTitle
                    toBeWrittenToFile += (str(symbol) + ',' + str(s['neg']) + ',' + str(s['neu']) + ',' +
                                          str(s['pos']) + ',' + str(s['compound']) + ',' +
                                          ArticleTitle + ',' + str(link) + '\n')
        except Exception as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print message
    # lock the file write so that only one thread can write at a time
    lock.acquire()
    try:
        myfile.write(toBeWrittenToFile)
    finally:
        lock.release()
def jsondata(sentence):
    # figure out if sentence contains 'but'
    sentlist = sentence.split('but')
    return {
        'sentence': sentence,
        'sentiment': processSentiment(sentiment(sentence)),
        'data': [{
            'rel': 'nmod',
            'phrase': [{
                'value': t,
                'sentiment': processSentiment(sentiment(t)),
                'index': [0, 10],
                'keyword': processKeyword(map(lambda x: x[0], rakeobj.run(t)))
            } for t in sentlist]
        }]
    }
def gen_output(data, json_data_dir):
    term, is_reply, tweets_needed = data
    dataset = []
    # get all user files
    files = glob.glob(os.path.join(json_data_dir, "*"))
    random.shuffle(files)
    for f in files:
        user = TwitterUser()
        user.populate_tweets_from_file(f, store_json=True,
                                       do_arabic_stemming=False, lemmatize=False)
        if 50 <= user.n_total_tweets <= 10000 and \
                user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:
            tweet_set = [t for t in user.tweets if t.retweeted is None and
                         len(t.urls) == 0 and 'http:' not in t.text and
                         len(t.tokens) > 5 and
                         t.created_at >= MIN_TWEET_DATE and
                         (term == '' or term in t.tokens) and
                         langid.classify(t.text)[0] == 'en' and
                         sentiment(t.text)['compound'] != 0]
            if is_reply:
                tweet_set = [t for t in tweet_set if t.reply_to]
            else:
                tweet_set = [t for t in tweet_set if not t.reply_to]
            if len(tweet_set) == 0:
                print 'size 0', term, tweets_needed, is_reply
                continue
            tweet = random.sample(tweet_set, 1)[0]
            print user.screen_name, term, tweets_needed, is_reply, ":::: ", tweet.text
            dataset.append(tweet)
            tweets_needed -= 1
            if tweets_needed == 0:
                name = term if term != '' else 'random'
                name += '_reply' if is_reply else '_non_reply'
                pickle.dump(dataset, open(name + ".p", 'wb'))
                print 'done with: ', name, is_reply
                return
        else:
            print 'failed user'
def scrape_tweets(api, term):
    uppers = set(string.ascii_uppercase)
    all_tweets = tweepy.Cursor(api.search, q=term, count=100, lang='en').items(17900)
    tweets_table = []
    entities_table = []
    for tweet in all_tweets:
        # don't let vaderSentiment see non-neutral words in titles
        cleaned_text = (tweet.text
                        .replace('tsuki no', 'tsuki').replace('TSUKI NO', 'TSUKI').replace('tsuki No', 'tsuki')
                        .replace('Death ', '').replace('death parade', 'parade').replace('DEATH PARADE', 'PARADE')
                        .replace('Assassination ', '').replace('assassination classroom', 'classroom')
                        .replace('ASSASSINATION CLASSROOM', 'CLASSROOM')
                        .replace('Cute ', '').replace('cute high', 'high').replace('CUTE HIGH', 'HIGH')
                        .replace('Club LOVE', 'Club').replace('Club Love', 'Club')
                        .replace('club love', 'club').replace('CLUB LOVE', 'CLUB'))
        tweet_info = (
            tweet.id,
            tweet.text,
            str(tweet.created_at),
            tweet.place.country if tweet.place else None,
            tweet.favorite_count,
            tweet.retweet_count,
            term,
            sentiment(cleaned_text.encode('utf-8', 'ignore'))['compound'],
            # Check For Jaden Smith-Style Capitalization
            int(all(x[0] in uppers for x in tweet.text.split() if x[0].isalpha() and '/' not in x)),
            # check if retweet
            int(tweet.text.startswith('RT @')))
        extra_info = (
            tweet.id,
            [x['text'] for x in tweet.entities['hashtags']],
            [x['screen_name'] for x in tweet.entities['user_mentions']],
            [x['expanded_url'] for x in tweet.entities['urls']])
        tweets_table.append(tweet_info)
        entities_table.append(extra_info)
    return tweets_table, entities_table
def on_data(self, data):
    try:
        data_clone = json.loads(data)
        save_data = {
            "created_at": data_clone["created_at"],
            "text": data_clone["text"],
            "user": data_clone["user"]["screen_name"],
            "followers": data_clone["user"]["followers_count"],
        }
        print "Created at: %s by @%s\n%s\n" % (save_data["created_at"],
                                               save_data["user"],
                                               save_data["text"])
        save_data["score"] = sentiment(save_data["text"].encode('utf-8'))
        for line in save_data["score"]:
            print line, save_data["score"][line]
        print ""
    except Exception, e:
        print "Error: %s.\n" % e
def get_reviews_by_page(review_link_href, page, reviews, sentiments):
    all_reviews = ''
    overall_sentiment = 0
    review_link_href = review_link_href + '&pageNumber=' + str(page)
    index = (page - 1) * 10
    review_page_data = BeautifulSoup(get_page_by_url(review_link_href), 'html.parser')
    review_container_el = review_page_data.find("div", {"id": "cm_cr-review_list"})
    if not (review_container_el is None):
        for reviews_el in review_container_el.find_all("div", {"class": "review"}):
            if not (reviews_el is None):
                reviews.append({})
                review_data_el = reviews_el.find("span", {"class": "review-text"})
                review_rating_el = reviews_el.find("i", {"class": "review-rating"}).find("span")
                if not (review_data_el is None) and not (review_rating_el is None):
                    reviews_text = get_text(review_data_el)
                    reviews[index]['text'] = reviews_text
                    all_reviews += reviews_text
                    review_sentiment = sentiment(reviews_text)
                    reviews[index]['sentiment'] = review_sentiment
                    review_rating = get_text(review_rating_el)
                    reviews[index]['review_rating'] = review_rating
                    overall_sentiment += review_sentiment['compound']
                    if '1.0' in review_rating:
                        sentiments[0]['data'].append([review_sentiment['compound'], 1.0])
                    if '2.0' in review_rating:
                        sentiments[1]['data'].append([review_sentiment['compound'], 2.0])
                    if '3.0' in review_rating:
                        sentiments[2]['data'].append([review_sentiment['compound'], 3.0])
                    if '4.0' in review_rating:
                        sentiments[3]['data'].append([review_sentiment['compound'], 4.0])
                    if '5.0' in review_rating:
                        sentiments[4]['data'].append([review_sentiment['compound'], 5.0])
                index += 1
    return all_reviews, overall_sentiment
def topNTweets(tweets, n=100, posThresh=0.7):
    posScores = np.zeros(len(tweets) + 1)
    negScores = np.zeros(len(tweets) + 1)
    tweetID = [0] * (len(tweets) + 1)
    i = 0
    for t in tweets:
        # if not t.in_reply_to_status_id:
        #     continue
        sent = sentiment(t.text.encode("utf-8"))
        if sent['neu'] == 1.0:
            continue
        posScores[i] = sent['pos'] / (sent['pos'] + sent['neg'])
        tweetID[i] = t.id
        i += 1
    posScores = posScores[posScores > posThresh]
    topNIndices = np.argsort(posScores)[::-1][:n]
    topN = np.array(tweetID)[topNIndices]
    return topN
def prepare_news_data_vader(news):
    '''
    Get sentiment scores for all 25 headlines on each day
    Input: raw news table
    Output: rows = # unique dates, cols = 25 (one for each headline)
    '''
    news = news.copy()
    # convert to lower case and do some trimming
    news['News'] = news['News'].apply(lambda x: x.lower().replace(',', '').replace('.', ''))
    # sort by date and score each headline
    news.sort('Date', ascending=True, inplace=True)
    news['sentiment_score'] = news['News'].apply(lambda x: sentiment(x)['compound'])
    # rank index for each day
    news_np = np.array(news)  # each row is [Date, news headline, score]
    news_np_score = []
    counter = 1
    current_date = -1
    for i in xrange(news_np.shape[0]):
        row = news_np[i]
        if row[0] != current_date:
            current_date = row[0]
            counter = 1
        else:
            counter += 1
        news_np_score.append(np.append(row, counter))
    news_score = pd.DataFrame(news_np_score,
                              columns=['Date', 'News', 'SentimentScore', 'Rank'])
    news_score = pd.pivot_table(news_score, values='SentimentScore',
                                index='Date', columns='Rank')
    news_score = news_score.iloc[:, 0:25]  # two days have 50 articles
    news_score.columns = ['col_vader_{}'.format(i) for i in news_score.columns]
    return news_score
def averageScore(listOfTweets):
    # Receives a list of tweets and returns the normalized average
    # positive/negative sentiment across the list
    averages = {'pos': 0, 'neu': 0, 'neg': 0}
    for tweet in listOfTweets:
        sent = sentiment(tweet.text.encode("utf-8"))
        averages['pos'] += sent['pos']
        averages['neu'] += sent['neu']
        averages['neg'] += sent['neg']
    averages['pos'] /= len(listOfTweets)
    averages['neu'] /= len(listOfTweets)
    averages['neg'] /= len(listOfTweets)
    pos_norm = averages['pos'] / (averages['pos'] + averages['neg'])
    neg_norm = averages['neg'] / (averages['pos'] + averages['neg'])
    result = [pos_norm, neg_norm]
    return result
def on_status(self, status):
    """
    Append sentiment values of each tweet to a size-limited array.
    """
    try:
        now = time()
        parsed_tweet = {}
        ds = set(dir(status))
        parsed_tweet["is_rt"] = "retweeted_status" in ds
        parsed_tweet["timestamp"] = mktime(status.created_at.timetuple())
        parsed_tweet["sentiment"] = sentiment(status.text.encode('utf-8'))["compound"]
        parsed_tweet["text"] = status.text.encode('utf-8')
        parsed_tweet["related_links"] = [
            lnk["url"].replace("\\", "") + "||" + lnk["display_url"].replace("\\", "")
            for lnk in status.entities["urls"]
        ]
        if len(status.entities["hashtags"]) == 0:
            # handle case of tweet with no hashtags
            parsed_tweet["related_hashtags"] = []
            parsed_tweet["hashtag"] = "(No Hashtag)"
            tweet_array.append((now, copy.deepcopy(parsed_tweet)))
        else:
            # separate stream entry for each hashtag
            for ht in status.entities["hashtags"]:
                parsed_tweet["related_hashtags"] = [
                    lnk["text"] for lnk in status.entities["hashtags"]
                    if lnk["text"] != ht["text"]
                ]
                parsed_tweet["hashtag"] = ht["text"]
                tweet_array.append((now, copy.deepcopy(parsed_tweet)))
    except KeyboardInterrupt:
        return False
    except Exception, e:
        print "Exception!", type(e).__name__, e
print datasets_to_collect
for f in files:
    user = TwitterUser(filename_for_tweets=f)
    if user.n_total_tweets < 10000 and user.n_total_tweets > 50 and \
            user.followers_count < 25000 and user.creation_date <= MIN_ACCOUNT_AGE:
        tweet_set = [t for t in user.tweets if t.retweeted is None and
                     len(t.urls) == 0 and
                     len(t.tokens) > 5 and
                     t.created_at <= MIN_TWEET_DATE and
                     curr_dataset[0] in t.tokens and
                     langid.classify(t.text)[0] == 'en' and
                     sentiment(t.text)['compound'] != 0]
        if len(tweet_set) == 0:
            continue
        tweet = random.sample(tweet_set, 1)[0]
        print user.screen_name, curr_dataset[0:2], ":::: ", tweet.text
        curr_dataset[2].append(tweet)
        curr_dataset[1] -= 1
        if curr_dataset[1] == 0:
            pickle.dump(curr_dataset[2], open(curr_dataset[0] + ".p", 'wb'))
            if len(datasets_to_collect) == 1:
                print 'DONE!!!'
                break
            datasets_to_collect = datasets_to_collect[1:]
            curr_dataset = datasets_to_collect[0]
def add_comments_history_correlation_features():
    """
    For each professor, correlate the running mean of past ratings/sentiments
    with each new rating and store the correlations as features '17'-'22'.
    """
    from pymongo import MongoClient
    from tqdm import tqdm
    from datetime import datetime
    from RMP_metadata import interest_lookup
    from vaderSentiment.vaderSentiment import sentiment
    from scipy.stats import pearsonr
    from pdb import set_trace
    set_trace()
    rmpdb = MongoClient('mongodb://localhost:27017')['rmpdb']
    ds_cur = rmpdb['dataset_profs'].find({}, {'prof_id': 1}, no_cursor_timeout=True)
    for row in tqdm(ds_cur):
        prof_id = row['prof_id']
        comments_cur = rmpdb['comments'].find(
            {'prof_id': prof_id},
            {'_id': 0, 'rClarity': 1, 'rEasy': 1, 'rHelpful': 1,
             'rInterest': 1, 'rComments': 1, 'rDate': 1})
        comments = list()
        if comments_cur.count() > 2:
            for comment in comments_cur:
                comments.append(comment)
            comments_by_date = sorted(
                comments, key=lambda x: datetime.strptime(x['rDate'], '%m/%d/%Y'))
            help_list = list()
            clar_list = list()
            ease_list = list()
            interest_list = list()
            comment_positivity_list = list()
            comment_negativity_list = list()
            for comment in comments_by_date:
                help_list.append(float(comment['rHelpful']))
                clar_list.append(float(comment['rClarity']))
                ease_list.append(float(comment['rEasy']))
                if comment['rInterest'] in interest_lookup:
                    interest_list.append(interest_lookup[comment['rInterest']])
                sentiments = sentiment(comment['rComments'].encode('utf-8'))
                comment_positivity_list.append(sentiments['pos'])
                comment_negativity_list.append(sentiments['neg'])
            pparam1 = list()
            pparam2 = list()
            for i in range(1, len(help_list)):
                pparam1.append(float(sum(help_list[:i])) / len(help_list[:i]))
                pparam2.append(help_list[i])
            help_history_corr, help_p_val = pearsonr(pparam1, pparam2)
            pparam1 = list()
            pparam2 = list()
            for i in range(1, len(ease_list)):
                pparam1.append(float(sum(ease_list[:i])) / len(ease_list[:i]))
                pparam2.append(ease_list[i])
            ease_history_corr, ease_p_val = pearsonr(pparam1, pparam2)
            pparam1 = list()
            pparam2 = list()
            for i in range(1, len(clar_list)):
                pparam1.append(float(sum(clar_list[:i])) / len(clar_list[:i]))
                pparam2.append(clar_list[i])
            clar_history_corr, clar_p_val = pearsonr(pparam1, pparam2)
            pparam1 = list()
            pparam2 = list()
            for i in range(1, len(comment_positivity_list)):
                pparam1.append(float(sum(comment_positivity_list[:i])) /
                               len(comment_positivity_list[:i]))
                pparam2.append(comment_positivity_list[i])
            comment_positivity_corr, comment_positivity_p_val = pearsonr(pparam1, pparam2)
            pparam1 = list()
            pparam2 = list()
            for i in range(1, len(comment_negativity_list)):
                pparam1.append(float(sum(comment_negativity_list[:i])) /
                               len(comment_negativity_list[:i]))
                pparam2.append(comment_negativity_list[i])
            comment_negativity_corr, comment_negativity_p_val = pearsonr(pparam1, pparam2)
            insertion_dict = {
                '17': clar_history_corr,
                '18': ease_history_corr,
                '19': help_history_corr,
                '20': comment_positivity_corr,
                '21': comment_negativity_corr
            }
            if len(interest_list) > 2:
                pparam1 = list()
                pparam2 = list()
                for i in range(1, len(interest_list)):
                    pparam1.append(float(sum(interest_list[:i])) / len(interest_list[:i]))
                    pparam2.append(interest_list[i])
                interest_history_corr, interest_p_val = pearsonr(pparam1, pparam2)
                insertion_dict['22'] = interest_history_corr
            rmpdb['dataset_profs'].update_one({'_id': row['_id']}, {'$set': insertion_dict})
def vaderSentiScore(self, doc):
    result = vs.sentiment(doc)
    return result
def vader_sentiment(text):
    text = text.decode("ascii", errors="ignore")
    return vader.sentiment(text)["compound"]
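# Minimal usage sketch for the vader_sentiment() wrapper above. It assumes
# 'vader' refers to an imported vaderSentiment module that exposes sentiment();
# that import is not shown in the snippet.
score = vader_sentiment("Not bad at all :)")
print score  # a single compound value in [-1, 1]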
def sentiment_negative(tweet_text, language):
    # use == for string comparison; 'is' checks identity and is unreliable here
    if language == "en":
        return sentiment(tweet_text)['neg']
    else:
        return float(0)
def sentiment_compound(tweet_text, language):
    if language == "en":
        return sentiment(tweet_text)['compound']
    else:
        return float(0)
    numUnique = len(set(words))
    numTotal = len(words)
    return ((1.0 * numUnique) / numTotal)


for status in statuses:
    corpus = []
    for w in status['text'].split():
        w = removeUnicode(w)
        if w in skips:
            continue
        if 'http' in w:
            continue
        if '&' in w:
            continue
        if '>' in w:
            continue
        if 'RT' in w:
            continue
        corpus.append(w)
    unique = set(corpus)
    senti = sentiment(status['text'].encode('utf-8'))
    overall_sentiment += float(senti['compound'])
    print "User:"******"Favorite Count:", str(status['favorite_count'])
    print "Tweet:", removeUnicode(status['text'])
    print "Lexical Diversity: ", getLexicalDiversity(removeUnicode(status['text']))
    print "Retweet Count: ", status['retweet_count']
    print "Compound Sentiment:", senti['compound']
    print "Corpus:"
    for w in corpus:
        print '\t' + w
    print "Unique tokens:"
    for w in unique:
        print '\t' + w
    print "-----"
print "Sentiment Summation: %f" % overall_sentiment
pklfile = open('generated_files/restaurants.pkl', 'rb')
restaurants = pickle.load(pklfile)
pklfile.close()

pklfile = open('generated_files/reviews_user.pkl', 'rb')
reviews_user = pickle.load(pklfile)
pklfile.close()

pklfile = open('generated_files/users.pkl', 'rb')
users = pickle.load(pklfile)
pklfile.close()

snooty = []
allwords = []
mytest = reviews.values()[0]
for i in range(len(mytest)):
    review = mytest[i]['text']
    # print review
    # ptreview = plaintext(review['text']).encode('utf-8')
    # word list
    words = review.split()
    # print words
    allwords += words
    snooty += [w for w in words if w in PROFANITY]
    # sentiment
    print review
    vs = sentiment(review)
    print str(vs), '\n', mytest[i]['stars'], '\n', mytest[i]['votes'], '\n'
def scrape_ebay_site(url, config):
    # url = 'http://www.ebay.com/itm/Nikon-AF-S-DX-NIKKOR-55-200mm-f-4-5-6G-ED-VR-II-Lens-Factory-Refurbished-/
    # 311498162380?_trkparms=%26rpp_cid%3D5702b40de4b0826387589b2e%26rpp_icid%3D5702cf3fe4b079ecf2fa287f'
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, 'html.parser')
    json_data = {}
    title_el = soup.find("h1", {"id": "itemTitle"})
    if not (title_el is None):
        json_data['title'] = title_el.text.strip('Details about')
    price_el = soup.find("span", {"id": "mm-saleDscPrc"}) or soup.find("span", {"id": "prcIsum"}) or \
        soup.find("span", {"id": "prcIsum_bidPrice"})
    if not (price_el is None):
        json_data['price'] = price_el.text.strip().lstrip('US ')
    savings_el = soup.find("span", {"id": "youSaveSTP"})
    if not (savings_el is None):
        json_data['savings'] = savings_el.text.strip()
    sold_quantity_el = soup.find("span", {"class": "w2b-sgl"})
    if not (sold_quantity_el is None):
        json_data['sold_quantity'] = sold_quantity_el.text.strip().rstrip(" sold")
    shipping_el = soup.find("span", {"id": "fshippingCost"})
    if not (shipping_el is None):
        json_data['shippingCost'] = shipping_el.text.strip()
    shipping_to_el = soup.find("div", {"class": "sh-sLoc"})
    if not (shipping_to_el is None):
        json_data['shippingTo'] = shipping_to_el.text.strip().lstrip("Shipping to: ")
    image_el = soup.find("img", {"id": "icImg"})
    if not (image_el is None):
        json_data['image'] = image_el.attrs["src"]
    brand_el = soup.find("h2", {"itemprop": "brand"})
    if not (brand_el is None):
        json_data['brand'] = brand_el.text
    json_data['merchant'] = {}
    merchant_el = soup.find("span", {"class": "mbg-nw"})
    if not (merchant_el is None):
        json_data['merchant']['name'] = merchant_el.text.title()
    merchant_sold_quantity = soup.find("span", {"class": "mbg-l"})
    if not (merchant_sold_quantity is None):
        json_data['merchant']['sold_quantity'] = merchant_sold_quantity.text.strip().lstrip('(').rstrip(')')
    merchant_feedback = soup.find("div", {"id": "si-fb"})
    if not (merchant_feedback is None):
        json_data['merchant']['feedback'] = merchant_feedback.text
    savings_el = soup.find("div", {"id": "mm-saleAmtSavedPrc"})
    if not (savings_el is None):
        json_data['savings'] = savings_el.text.strip()
    availability_el = soup.find("span", {"id": "qtySubTxt"})
    if not (availability_el is None):
        json_data['availability'] = availability_el.text.strip()
    return_in_el = soup.find("span", {"id": "vi-ret-accrd-txt"})
    if not (return_in_el is None):
        json_data['returnIn'] = return_in_el.text.strip()
    seller_fb_el = soup.find("div", {"id": "si-fb"})
    if not (seller_fb_el is None):
        json_data['sellerFeedback'] = seller_fb_el.text.strip()
    link_to_buy_el = soup.find("a", {"id": "binBtn_btn"})
    if not (link_to_buy_el is None):
        json_data['linkToBuy'] = link_to_buy_el.text.strip()
    payment_el_row = soup.find("div", {"id": "payDet1"})
    if not (payment_el_row is None):
        json_data['payment'] = {}
        index = 1
        for paymentEl in payment_el_row.find_all("img"):
            if not (paymentEl is None):
                json_data['payment'][index] = paymentEl.attrs['alt']
                index += 1
    json_data['rating'] = {}
    rating_el = soup.find("span", {"class": "num-of-rewiews"})
    if not (rating_el is None):
        review_count_el = rating_el.find("a")
        if not (review_count_el is None):
            json_data['rating']['review_count'] = review_count_el.text.strip().rstrip(' rating').rstrip('s')
    if json_data['rating'].get('review_count') is None:
        rating_el = soup.find("a", {"id": "_rvwlnk"})
        if not (rating_el is None):
            json_data['rating']['review_count'] = int(rating_el.text.strip().rstrip('s').rstrip(' rating'))
    average_rating_el = soup.find("span", {"class": "review--start--rating"}) or \
        soup.find("span", {"class": "ebay-review-start-rating"})
    if not (average_rating_el is None):
        json_data['rating']['average'] = average_rating_el.text.strip()
    review_summary_el = soup.find('ul', {'class': 'ebay-review-list'})
    if not (review_summary_el is None):
        json_data['rating']['stats'] = {}
        for review_row_el in review_summary_el.find_all('li', {'class': 'ebay-review-item'}):
            if not (review_row_el is None):
                review_name_el = review_row_el.find("p", {"class": "ebay-review-item-stars"})
                review_rating_el = review_row_el.find("div", {"class": "ebay-review-item-r"}).find("span")
                if review_name_el and review_rating_el:
                    json_data['rating']['stats'][review_name_el.text + ' star'] = review_rating_el.text
    reviews_el_block = soup.find("div", {"class": "reviews"})
    if not (reviews_el_block is None):
        json_data['reviews'] = {}
        index = 1
        overall_sentiment = 0
        all_reviews = ''
        for reviewsEl in reviews_el_block.find_all("div", {"class": "ebay-review-section"}):
            if not (reviewsEl is None):
                json_data['reviews'][index] = {}
                reviewed_by_el = reviewsEl.find("a", {"itemprop": "author"})
                if not (reviewed_by_el is None):
                    json_data['reviews'][index]['reviewed_by'] = reviewed_by_el.text.strip()
                review_rating_el = reviewsEl.find("div", {"class": "ebay-star-rating"})
                if not (review_rating_el is None):
                    json_data['reviews'][index]['review_rating'] = (review_rating_el.attrs.get("title") or
                                                                    review_rating_el.attrs.get("aria-label")).strip()
                review_name_el = reviewsEl.find("p", {"itemprop": "name"})
                if not (review_name_el is None):
                    json_data['reviews'][index]['title'] = review_name_el.text.strip()
                review_description_el = reviewsEl.find("p", {"itemprop": "reviewBody"})
                if not (review_description_el is None):
                    reviews_text = review_description_el.text.strip()
                    json_data['reviews'][index]['text'] = reviews_text
                    review_sentiment = sentiment(reviews_text)
                    all_reviews += reviews_text
                    overall_sentiment += review_sentiment['compound']
                reviewed_on_el = reviewsEl.find("span", {"itemprop": "datePublished"})
                if not (reviewed_on_el is None):
                    json_data['reviews'][index]['reviewed_on'] = reviewed_on_el.text.strip()
                review_attributes_el_block = reviewsEl.find("p", {"class": "review-attr"})
                if not (review_attributes_el_block is None):
                    attribute_index = 0
                    for reviewAttributesEl in review_attributes_el_block.find_all("span", {"class": "rvw-attr"}):
                        if not (reviewAttributesEl is None):
                            review_value_el = review_attributes_el_block.select("span.rvw-val")[attribute_index]
                            if not (review_value_el is None):
                                json_data['reviews'][index][reviewAttributesEl.text.strip()] = \
                                    review_value_el.text.strip()
                            attribute_index += 1
                index += 1
        if all_reviews != '':
            json_data['tones'] = get_tone(all_reviews, config)
        json_data['overall_sentiment'] = round(overall_sentiment / index, 2)
    data = {
        'scraped_data': json_data
    }
    # print(product_data)
    return data
def __init__(self, unit_id, date, sentiment_ids_map, identity_ids_map, gram_list,
             emoji_info, emoticon_to_eval_dim, hashtag_epa_data, vader_dict,
             dependency_parsed_conll=None, dependency_parsed_objects=None,
             raw_text=None, sent_values=None, verbose=False,
             node_must_be_identity=False, use_events=True, use_behaviors=True,
             use_isa=True, use_parent_child=True, use_own_full_sentence=True,
             use_clause_level=True, do_negation_on_full_sentence=True):
    """
    :param sentiment_ids_map: mapping from sentiment word we care about to its id
    :param identity_ids_map: mapping from identity we care about to its id
    :param dependency_parsed_conll:
    :param dependency_parsed_objects:
    :param raw_text:
    :return: a map from identity ids to sentiment constraints on that identity
    """
    if not raw_text and not dependency_parsed_conll and not dependency_parsed_objects:
        raise Exception("you didnt provide any data to the TextUnit constructor")
    self.unit_id = unit_id
    self.date = date

    # params for whether or not to use, e.g., behavioral constraints
    self.use_events = use_events
    self.use_behaviors = use_behaviors
    self.use_isa = use_isa
    self.use_parent_child = use_parent_child
    self.use_own_full_sentence = use_own_full_sentence
    self.use_clause_level = use_clause_level
    self.do_negation_on_full_sentence = do_negation_on_full_sentence
    # self.all_identity_words_to_epa = all_identity_words_to_epa

    # MEAN FROM UGA DATA
    sent_values[ZERO_IDENTITY_INDICATOR + 'e'] = 0.0
    sent_values[ZERO_IDENTITY_INDICATOR + 'p'] = 0.0
    sent_values[ZERO_IDENTITY_INDICATOR + 'a'] = 0.0
    sentiment_ids_map[ZERO_IDENTITY_INDICATOR] = ZERO_IDENTITY_INDICATOR

    self.node_must_be_identity = node_must_be_identity
    self.identity_ids_map = identity_ids_map
    self.sentiment_ids_map = sentiment_ids_map
    if emoji_info:
        self.emojis_to_eval_dim = emoji_info[0]
        self.emoji_regex = emoji_info[1]
    else:
        self.emoticon_to_eval_dim = self.emoji_regex = None
    self.emoticon_to_eval_dim = emoticon_to_eval_dim
    self.hashtag_to_epa = hashtag_epa_data
    self.gram_list = gram_list
    self.verbose = verbose

    # for sentence-level E, P, A constraints using emojis, emoticons, hashtags
    self.sentence_level_e = list()
    self.sentence_level_p = list()
    self.sentence_level_a = list()

    # for debugging purposes, a human-readable view of the constraints in this sentence
    self.constraint_string_list = []
    self.thot_words = []
    # list of all constraints
    self.all_constraints = []
    # the identities in this tweet (binary yes/no)
    self.identities = []
    # to ensure no multiple constraints are added
    self.sentence_ids_to_constraints = defaultdict(set)

    if raw_text:
        self.raw_text = raw_text
        constraints_map = self.get_constraints_from_raw_text(raw_text)
    elif dependency_parsed_conll:
        constraints_map = self.get_constraints_from_conll(dependency_parsed_conll, sent_values)
    else:
        constraints_map = self.get_constraints_from_dep_objs(dependency_parsed_objects, sent_values)

    # store the identity ids for easy retrieval, but only those in our identity set
    iden_set = set(self.identity_ids_map.values())
    # self.identities += constraints_map.keys()
    self.identities = list(set([x for x in self.identities if x in iden_set]))

    # construct sentence-level constraint
    if vader_dict:
        self.sentence_level_e.append(sentiment(self.raw_text, vader_dict, 2.)['compound'])
    sent_e_value = self.get_value_for_constraint_from_list(self.sentence_level_e)
    sent_p_value = self.get_value_for_constraint_from_list(self.sentence_level_p)
    sent_a_value = self.get_value_for_constraint_from_list(self.sentence_level_a)

    # e will always have at least 1 if there are any sentence level vars
    if sent_e_value:
        self.constraint_string_list.append(
            "SENTENCE LEVEL: E: {e} P: {p} A: {a}".format(e=sent_e_value, p=sent_p_value, a=sent_a_value))
        for identity in set(self.identities):
            sl = SentenceLevelConstraint(identity, sent_e_value, sent_p_value, sent_a_value)
            constraints_map[identity].append(sl)
            self.all_constraints.append(sl)

    # construct the full deflection equation
    self.full_deflection_string = " + ".join([c.get_constraint_string() for c in self.all_constraints])
    if len(self.full_deflection_string):
        self.full_deflection_string = SENT_REPLACE_REGEX.sub(
            lambda x: str(sent_values[x.group()]), self.full_deflection_string)
        self.full_deflection_string = str(sympify(self.full_deflection_string))
        self.full_deflection_string = ADD_UV_REGEX.sub(
            lambda x: "uv." + x.group(0), self.full_deflection_string)

    # store constraint strings for each identity
    self.identities_to_constraint_string_map = {}
    for identity, constraint_list in constraints_map.items():
        # if this isn't in the set we care about, keep going
        if identity not in iden_set:
            continue
        eq_constr = [constraint.get_constraint_string() for constraint in constraint_list]
        equation_str = "+".join(eq_constr)
        equation_str = SENT_REPLACE_REGEX.sub(lambda x: str(sent_values[x.group()]), equation_str)
        constraint = sympify(equation_str)
        for val in ['e', 'p', 'a']:
            p = poly(constraint, Symbol(identity + val)).all_coeffs()
            if len(p) != 3:
                if val == 'e':
                    print 'CONSTRAINT DIDNT WORK!!!!'
                    # print "\n".join(dependency_parsed_conll)
                continue
            p_0 = "+".join(["*".join([str(key), FLOAT_FORMAT.format(float(v))])
                            for key, v in p[0].as_coefficients_dict().items()])
            # try to make it a float, if it doesn't work, then it has variables in it
            try:
                p_0 = eval(p_0)
            except:
                p_0 = ADD_UV_REGEX.sub(lambda x: "uv." + x.group(0), p_0)
            p_1 = "+".join(["*".join([str(key), FLOAT_FORMAT.format(float(v))])
                            for key, v in p[1].as_coefficients_dict().items()])
            try:
                p_1 = eval(p_1)
            except:
                p_1 = ADD_UV_REGEX.sub(lambda x: "uv." + x.group(0), p_1)
            self.identities_to_constraint_string_map[identity + val] = [p_0, p_1]

    # release these things to save memory
    self.identity_ids_map = None
    # self.all_identity_words_to_epa = None
    self.sentiment_ids_map = None
    self.gram_list = None
    self.emojis_to_eval_dim = None
    self.emoji_regex = None
    self.emoticon_to_eval_dim = None
    self.hashtag_to_epa = None
    return sentence


D = {}
D["amazon"] = [0, 0, 0, 0]
D["walmart"] = [0, 0, 0, 0]
with open('final.csv', 'rb') as f:
    mycsv = csv.reader(f)
    lamazon = 0
    lwalmart = 0
    for row in mycsv:
        txt = row[2]
        pre = preprocess(txt)
        vs = vaderSentiment.sentiment(pre.encode("utf8"))
        if row[0] in A:
            D["amazon"][0] += vs["neg"]
            D["amazon"][1] += vs["neu"]
            D["amazon"][2] += vs["pos"]
            D["amazon"][3] += vs["compound"]
            lamazon += 1
        elif row[0] in W:
            D["walmart"][0] += vs["neg"]
            D["walmart"][1] += vs["neu"]
            D["walmart"][2] += vs["pos"]
            D["walmart"][3] += vs["compound"]
            lwalmart += 1
print D
D["amazon"][0] = D["amazon"][0] / lamazon
def sentiment_positive(tweet_text, language):
    if language == "en":
        return sentiment(tweet_text)['pos']
    else:
        return float(0)
def sentiment_neutral(tweet_text, language):
    if language == "en":
        return sentiment(tweet_text)['neu']
    else:
        return float(0)
    finalanswer2[location] = 0
    company_tweets = {}
    for c in company_keywords:
        company_tweets[c] = []
        for content in _location_tweets[location]:
            for kw in company_keywords[c]:
                if kw in content:
                    company_tweets[c].append(content)
                    break
    for company in company_tweets:
        sentences = company_tweets[company]
        neg = 0.0
        neu = 0.0
        pos = 0.0
        for sentence in sentences:
            vs = vaderSentiment.sentiment(sentence.encode("utf8"))
            neg += vs["neg"]
            neu += vs["neu"]
            pos += vs["pos"]
        finalanswer1[location].append([company, (neg + neu + pos)])
        finalanswer2[location] += neg + neu + pos

# finalanswer2 gives the happiness of states from most to least
finalanswer2 = sorted(finalanswer2.items(), key=lambda x: (-x[1], x[0]))

# top 5 -> each company
for stateL in finalanswer2[:5]:
    print stateL[0]
    print finalanswer1[stateL[0]]
    for line in f:
        users.append(line.strip())

with open("data/sandy_all.txt", 'rt') as f:
    for line in f:
        tweet = line.split('\t')
        # print(tweet)
        user = tweet[0]
        # is this user in the master list
        if user not in users:
            continue
        if user not in table.keys():
            table[user] = {'total': 0, 'neutral': 0}
        # print('---')
        # print(tweet)
        ret = vaderSentiment.sentiment(tweet[1])
        # print(ret)
        if ret['neu'] >= threshold:
            table[user]['neutral'] += 1
        table[user]['total'] += 1

with open('results.txt', 'w') as f:
    for user in sorted(table, key=lambda x: int(x)):
        # print(user, str(float(table[user]['neutral']) / float(table[user]['total'])))
        # f.write(user + ' ' + str(float(table[user]['neutral']) / float(table[user]['total'])) + '\n')
        f.write(str(float(table[user]['neutral']) / float(table[user]['total'])) + '\n')
def scrape_rakuten_site(prod_url, config):
    r = requests.get(prod_url)
    data = r.text
    soup = BeautifulSoup(data, 'html.parser')
    json_data = {}
    title_el = soup.find("h1", {"id": "product-title-heading"})
    if not (title_el is None):
        json_data['title'] = str(title_el.text).strip()
    price_el = soup.find("span", {"class": "price"})
    if not (price_el is None):
        # price_el = get_text(price_el)
        json_data['price'] = (price_el.text).strip()
    savings_el = soup.find("div", {"class": "text-muted"})
    if not (savings_el is None):
        json_data['savings'] = (savings_el.text).strip()
    shipping_el = soup.find(text="+ free shipping")
    if not (shipping_el is None):
        shipping_el = 'FREE'
        json_data['shippingCost'] = shipping_el
    else:
        shipping_el = 'EXTRA'
        json_data['shippingCost'] = shipping_el
    image_el = soup.find("img", {"id": "productmain"})
    if not (image_el is None):
        json_data['image'] = image_el.attrs["src"]
    brand_el = soup.find("td", {"class": "tab-table"})
    if not (brand_el is None):
        # was MfgPart_el.text, an undefined name; use the element found above
        json_data['brand'] = brand_el.text
    merchant_el = soup.find("div", {"class": "seller"})
    if not (merchant_el is None):
        json_data['merchant'] = str(merchant_el.text)
    availability_el = soup.find("strong", {"class": "text-success"})
    if not (availability_el is None):
        json_data['availability'] = get_text(availability_el)
    # return_in_el = soup.find("span", {"id": "vi-ret-accrd-txt"})
    # if not (return_in_el is None):
    #     json_data['returnIn'] = (return_in_el.text).strip()
    prod_description_el = soup.find("div", {"itemprop": "description"})
    if not (prod_description_el is None):
        json_data['ProdDescription'] = (prod_description_el.text).strip()
    link_to_buy_el = soup.find("a", {"a": "add-to-cart-main.add-to-cart.btn_btn"})
    if not (link_to_buy_el is None):
        json_data['linkToBuy'] = (link_to_buy_el.text).strip()
    json_data['reviews'] = {}
    json_data['rating'] = {}
    average_rating_block_el = soup.find("div", {"class": "rating-summary"})
    if not (average_rating_block_el is None):
        average_rating_el = average_rating_block_el.find("strong")
        if not (average_rating_el is None):
            json_data['rating']['average'] = get_text(average_rating_el)
            json_data['average'] = get_text(average_rating_el)
    review_count_el = soup.find("strong", {"class": "rating-indicator"})
    if not (review_count_el is None):
        json_data['rating']['review_count'] = int(str((review_count_el.text).strip()))
        json_data['review_count'] = int(str((review_count_el.text).strip()))
    # user_ratingEl_block = soup.findAll("div", {"class": "review-avg"})
    # json_data['rating']['stats'] = {}
    # if not (user_ratingEl_block is None):
    #     index = 1
    #     for user_ratingEl in soup.findAll("div", {"class": "review-avg"}):
    #         if not (user_ratingEl is None):
    #             review_rating_el = user_ratingEl.findAll("i", {"class": "fa fa-circle"})
    #             if not (review_rating_el is None):
    #                 rating_score = str(len(review_rating_el))
    #                 json_data['rating']['stats'][index + ' star'] = rating_score
    #                 index += 1
    statsblock_el = soup.findAll("div", {"id": "ratings"})
    json_data['rating']['stats'] = {}
    index3 = 5
    if not (statsblock_el is None):
        for stats_el in soup.findAll("div", {"class": "rating-indicator"}):
            rating_score = stats_el.contents[0]
            name = str(index3)
            json_data['rating']['stats'][name + ' star'] = str(rating_score)
            index3 -= 1
    reviews_el_block = soup.findAll("p", {"class": "dotdotdot"})
    if not (reviews_el_block is None):
        json_data['reviews'] = {}
        index2 = 1
        overall_sentiment = 0
        # accumulate all review text before computing tones
        all_reviews = ''
        for reviewsEl in soup.findAll("p", {"class": "dotdotdot"}):
            if not (reviewsEl is None):
                json_data['reviews'][index2] = {}
                # reviewed_by_el = reviewsEl.find("em", {"class": "review-avg"})
                # if not (reviewed_by_el is None):
                #     json_data['reviews'][index2]['reviewed_by'] = reviewed_by_el.text.strip()
                reviewDescription_el = reviewsEl.contents[0]
                if not (reviewDescription_el is None):
                    json_data['reviews'][index2]['text'] = str(reviewDescription_el)
                    all_reviews += str(reviewDescription_el)
                    review_sentiment = sentiment(str(reviewDescription_el))
                    overall_sentiment = review_sentiment
                index2 += 1
        if all_reviews != '':
            all_tones = get_tone(all_reviews, config)
            json_data['tones'] = str(all_tones)
        print overall_sentiment
        json_data['overall_sentiment'] = overall_sentiment
    # soup.find('div', class_='detail_date').find('dt', text='Date').find_next_sibling('dd').text
    value = soup.find(text="Overall Satisfaction").findNext('dt').contents[0]
    # if not (reviewDescription_el is None):
    #     get_value = get_float(value)
    #     get_value = str(get_value)
    ease = soup.find(text="Value").findNext('dt').contents[0]
    performance = soup.find(text="Ease of Use").findNext('dt').contents[0]
    over = soup.find(text="Overall Satisfaction").findPrevious('dt').contents[0]
    get_over = str(get_float(over))
    get_ease = str(get_float(ease))
    get_performance = str(get_float(performance))
    get_value = str(get_float(value))
    json_data['Over'] = get_over
    json_data['Value'] = get_value
    json_data['Ease of Use'] = get_ease
    json_data['Performance'] = get_performance
    data = {
        'scraped_data': json_data
    }
    # print(product_data)
    return data