# Needs at module level: from math import log; import collections;
# import re; from utility import TextProcess
def __init__(self, a_document_name):
    # term_counts: number of documents that contain a given term (key)
    self.term_counts = dict()
    self.reviews = dict()
    with open(a_document_name) as review_file:
        for review in review_file:
            # Parse each JSON line once instead of re-parsing it for
            # every field we need.
            review_json = TextProcess.read_line(review)
            review_id = review_json['review_id'].encode('utf8')
            review_content = review_json['text'].encode('utf8')
            self.reviews[review_id] = review_content
            tokens = TextProcess.stemming(
                TextProcess.stopword(TextProcess.tokenize(review_json)))
            # Count each term at most once per document (document frequency).
            for token in set(tokens):
                if token in self.term_counts:
                    self.term_counts[token] += 1
                else:
                    self.term_counts[token] = 1
    # Inverse document frequency: idf(t) = log(N / df(t)). float()
    # avoids Python 2 integer division, which would floor the ratio
    # and give idf = 0 for common terms.
    self.idf = dict()
    num_reviews = len(self.reviews)
    for term, term_count in self.term_counts.iteritems():
        self.idf[term] = log(float(num_reviews) / term_count)
    # tf-idf vector per review, one component per vocabulary term,
    # using the sublinear weight (1 + log(tf)) * idf. Dict iteration
    # order is stable while term_counts is unmodified, so all vectors
    # share the same term order.
    self.tfidf_dict = dict()
    for review_id, review in self.reviews.iteritems():
        tokens = TextProcess.stemming(
            TextProcess.stopword(re.findall(r'\w+', review.lower())))
        tf = collections.Counter(tokens)
        review_tfidfs = list()
        for term in self.term_counts:
            if term in tf:
                review_tfidfs.append((1 + log(tf[term])) * self.idf[term])
            else:
                review_tfidfs.append(0)
        self.tfidf_dict[review_id] = review_tfidfs
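# The vectors in self.tfidf_dict all share the term ordering of
# term_counts, so any two reviews can be compared component-wise. As
# an illustration only (this helper is not part of the original
# class), a cosine similarity over two such vectors might look like:
from math import sqrt

def cosine_similarity(vec_a, vec_b):
    # vec_a and vec_b are parallel lists from tfidf_dict, one weight
    # per vocabulary term.
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    norm_a = sqrt(sum(a * a for a in vec_a))
    norm_b = sqrt(sum(b * b for b in vec_b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)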
def text_preproc(text):
    # Normalization pipeline: each step strips one class of noise
    # before stopwords are removed.
    text = TextProcess.shrink_whitespace(text)
    text = TextProcess.tolower(text)
    text = TextProcess.remove_html(text)
    text = TextProcess.remove_url(text)
    text = TextProcess.remove_number(text)
    text = TextProcess.remove_punctuation(text)
    text = TextProcess.remove_stopword(text)
    text = TextProcess.shrink_empty_line(text)
    return text
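# Every step above maps a string to a string, so the same pipeline can
# also be expressed as a list of functions, which makes reordering or
# disabling a step a one-line change. A sketch assuming the same
# TextProcess helpers:
PREPROC_STEPS = [
    TextProcess.shrink_whitespace,
    TextProcess.tolower,
    TextProcess.remove_html,
    TextProcess.remove_url,
    TextProcess.remove_number,
    TextProcess.remove_punctuation,
    TextProcess.remove_stopword,
    TextProcess.shrink_empty_line,
]

def text_preproc_pipeline(text):
    # Apply each normalization step in order.
    for step in PREPROC_STEPS:
        text = step(text)
    return text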
def getRestaurants():
    restaurants = []
    inFileName = 'restaurants.json'
    # Iterate over the file directly instead of hard-coding the line
    # count (the original assumed exactly 14303 records).
    with open(inFileName, 'r') as f:
        for line in f:
            rest = TextProcess.read_line(line)
            restaurants.append(rest)
    return restaurants
# Needs at module level: import math; from utility import TextProcess
def calc_tweet_scores(self):
    '''Calculate the relevancy of each tweet from three log-scaled
    signals: favorite count (weight 0.25), retweet count (0.025), and
    the author's follower count (0.05). A fourth term for the user's
    total number of tweets is currently disabled. These weights may be
    adjusted later.'''
    for word in self.tweets:
        for t in self.tweets[word]:
            if word in t['text']:
                # log2(x + 1) keeps zero counts at zero and damps the
                # influence of very popular tweets.
                score = 0
                score += math.log(t['favorite_count'] + 1, 2) * 0.25
                score += math.log(t['retweet_count'] + 1, 2) * 0.025
                score += math.log(t['user']['followers_count'] + 1, 2) * 0.05
                # score += math.log(t['user']['statuses_count'] + 1, 2) * 0.05
                # Store a normalized, re-tokenized copy of the text.
                tokens = TextProcess.tokenize(t['text'])
                text = ' '.join(tokens).strip()
                self.scores.append({
                    'id': t['id'],
                    'text': unicode(text, errors='ignore'),
                    'score': score,
                    # Assumes every tweet carries geo coordinates.
                    'geo': t['geo']['coordinates']})
        # Keep the 100 highest-scoring tweets for this word, then order
        # them by the second geo coordinate for display.
        sortedList = sorted(self.scores, key=lambda k: k['score'],
                            reverse=True)[0:100]
        sortedList2 = sorted(sortedList, key=lambda k: k['geo'][1],
                             reverse=True)
        if word not in self.sortedTweets:
            self.sortedTweets[word] = sortedList2
        else:
            self.sortedTweets[word] += sortedList2
        self.scores = []
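# To make the weighting concrete, a worked example with made-up counts
# (15 favorites, 7 retweets, an author with 1023 followers); the
# counts are illustrative only.
import math

score = (math.log(15 + 1, 2) * 0.25       # 4 * 0.25  = 1.0
         + math.log(7 + 1, 2) * 0.025     # 3 * 0.025 = 0.075
         + math.log(1023 + 1, 2) * 0.05)  # 10 * 0.05 = 0.5
print score  # 1.575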
print "start training" classifier = NaiveBayesClassifier.train(train_features) reviewList = [] confList = [] reviewContent = {} print "starting sentiment" cutoff = 0 for line in open('../../data/yelp_academic_dataset_review.json', 'r'): #cutoff+=1 #if cutoff > 10: # break review_json = TextProcess.read_line(line) review_id = review_json['review_id'] chopPoint = review_json['text'][::-1].find(' ', 400) if chopPoint > 400: reviewContent[review_id] = review_json['text'][::-1][:chopPoint][::-1] posScore = classifier.posScore(word_features(TextProcess.tokenize(review_json, chopPoint))) negScore = classifier.negScore(word_features(TextProcess.tokenize(review_json, chopPoint))) else: reviewContent[review_id] = review_json['text'] posScore = classifier.posScore(word_features(TextProcess.tokenize(review_json, len(review_json['text'])))) negScore = classifier.negScore(word_features(TextProcess.tokenize(review_json, len(review_json['text'])))) reviewList.append((review_id,posScore)) confList.append((review_id, abs(posScore-negScore))) print "done with sentiment"
from __future__ import division

__author__ = "jtgoen"

import json

from utility import TextProcess

business_dict = dict()
review_dict = dict()
user_dict = dict()

for line in open("yelp_academic_dataset_business.json", "r"):
    business_json = TextProcess.read_line(line)
    business_dict[business_json["business_id"]] = business_json

for line in open("yelp_academic_dataset_review.json", "r"):
    review_json = TextProcess.read_line(line)
    review_dict[review_json["review_id"]] = review_json

sentiment_dict = json.loads(open("reviewSentimentStars.json").read())
confidence_dict = json.loads(open("reviewConfidence.json").read())

# Accumulate, per user, the total absolute gap between the stars they
# gave and the sentiment-predicted stars, plus their review count.
# Keys of review_dict are review IDs, so they index sentiment_dict
# directly.
for review in review_dict:
    user_id = review_dict[review]["user_id"]
    gap = abs(review_dict[review]["stars"] - sentiment_dict[review])
    if user_id not in user_dict:
        user_dict[user_id] = dict(value=gap, reviews=1.0)
    else:
        user_dict[user_id]["value"] += gap
        user_dict[user_id]["reviews"] += 1
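# The module above stops at accumulation. A natural next step, an
# assumption here (though the from __future__ division import points
# that way), is a per-user mean gap: lower means the user's ratings
# track the sentiment model's predictions more closely.
user_avg_gap = dict(
    (user_id, stats["value"] / stats["reviews"])
    for user_id, stats in user_dict.iteritems())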