# module-level imports assumed at the top of the file:
#   import collections, re
#   from math import log
def __init__(self, a_document_name):
    # term counts: number of documents that contain a given term (key)
    self.term_counts = dict()
    self.reviews = dict()
    with open(a_document_name) as review_file:
        for review in review_file:
            # parse each line once instead of re-parsing it per field
            review_json = TextProcess.read_line(review)
            review_id = review_json['review_id'].encode('utf8')
            review_content = review_json['text'].encode('utf8')
            self.reviews[review_id] = review_content
            tokens = TextProcess.stemming(
                TextProcess.stopword(TextProcess.tokenize(review_json)))
            # count each term at most once per document
            for token in set(tokens):
                if token in self.term_counts:
                    self.term_counts[token] += 1
                else:
                    self.term_counts[token] = 1
    # inverse document frequency: log(N / df); float() avoids Python 2
    # integer division silently truncating the ratio
    self.idf = dict()
    for term, term_count in self.term_counts.iteritems():
        self.idf[term] = log(float(len(self.reviews)) / term_count)
    # one tf-idf vector per review; components follow the iteration
    # order of term_counts, which is stable across reviews
    self.tfidf_dict = dict()
    for review_id, review in self.reviews.iteritems():
        tokens = TextProcess.stemming(
            TextProcess.stopword(re.findall(r'\w+', review.lower())))
        tf = collections.Counter(tokens)
        review_tfidfs = list()
        for term in self.term_counts:
            if term in tf:
                # sublinear term frequency: 1 + log(tf)
                review_tfidfs.append((1 + log(tf[term])) * self.idf[term])
            else:
                review_tfidfs.append(0)
        self.tfidf_dict[review_id] = review_tfidfs
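# A minimal sketch of how the tf-idf vectors built above might be
# compared. The class name TfIdf and the review ids in the usage note
# are hypothetical (the real class name is not shown here), and cosine
# similarity is an assumption about the intended distance measure.
from math import sqrt

def cosine_similarity(vec_a, vec_b):
    # dot(a, b) / (|a| * |b|); the vectors are comparable because every
    # review vector is built by iterating term_counts in the same order
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    norm_a = sqrt(sum(a * a for a in vec_a))
    norm_b = sqrt(sum(b * b for b in vec_b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)

# usage (hypothetical ids):
#   model = TfIdf('yelp_academic_dataset_review.json')
#   sim = cosine_similarity(model.tfidf_dict[id_a], model.tfidf_dict[id_b])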
def calc_tweet_scores(self):
    '''Calculate the relevancy of each tweet from log-scaled
    engagement signals:
      - favorite count          (weight 0.25)
      - retweet count           (weight 0.025)
      - user's follower count   (weight 0.05)
      - user's total tweet count is currently disabled
    (These weights may be adjusted later.)'''
    for word in self.tweets:
        for t in self.tweets[word]:
            # score only tweets that actually mention the keyword and
            # carry coordinates, since geo is used for sorting below
            if word in t['text'] and t['geo']:
                score = 0
                score += math.log(t['favorite_count'] + 1, 2) * 0.25
                score += math.log(t['retweet_count'] + 1, 2) * 0.025
                score += math.log(t['user']['followers_count'] + 1, 2) * 0.05
                #score += math.log(t['user']['statuses_count'] + 1, 2) * 0.05
                # normalize the text before storing it
                tokens = TextProcess.tokenize(t['text'])
                text = ' '.join(tokens).strip()
                self.scores.append({
                    'id': t['id'],
                    'text': unicode(text, errors='ignore'),
                    'score': score,
                    'geo': t['geo']['coordinates']})
        # keep the 100 highest-scoring tweets, then order that subset
        # by longitude so nearby tweets end up adjacent
        sortedList = sorted(self.scores,
                            key=lambda k: k['score'], reverse=True)[0:100]
        sortedList2 = sorted(sortedList,
                             key=lambda k: k['geo'][1], reverse=True)
        if word not in self.sortedTweets:
            self.sortedTweets[word] = sortedList2
        else:
            self.sortedTweets[word] += sortedList2
        self.scores = []
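# A small worked example of the scoring formula in calc_tweet_scores,
# with made-up engagement counts; the helper below only restates the
# weights used above and is not part of the original pipeline.
import math

def tweet_score(favorites, retweets, followers):
    # log2(x + 1) grows slowly, so a tweet with 10x the favorites is
    # only a few points higher before weighting, not 10x higher
    return (math.log(favorites + 1, 2) * 0.25
            + math.log(retweets + 1, 2) * 0.025
            + math.log(followers + 1, 2) * 0.05)

# e.g. 31 favorites, 7 retweets, 1023 followers:
#   5 * 0.25 + 3 * 0.025 + 10 * 0.05 = 1.825
print tweet_score(31, 7, 1023)  # -> 1.825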
reviewContent = {}
reviewList = []
confList = []
print "starting sentiment"
for line in open('../../data/yelp_academic_dataset_review.json', 'r'):
    review_json = TextProcess.read_line(line)
    review_id = review_json['review_id']
    # look for a word boundary at least 400 characters from the end of
    # the review, scanning the reversed text
    chopPoint = review_json['text'][::-1].find(' ', 400)
    if chopPoint > 400:
        # long review: keep only its last ~400 characters
        # (reverse, slice, reverse back)
        reviewContent[review_id] = review_json['text'][::-1][:chopPoint][::-1]
        features = word_features(TextProcess.tokenize(review_json, chopPoint))
    else:
        # short review: keep it whole
        reviewContent[review_id] = review_json['text']
        features = word_features(
            TextProcess.tokenize(review_json, len(review_json['text'])))
    # tokenize once and score both polarities from the same features
    posScore = classifier.posScore(features)
    negScore = classifier.negScore(features)
    reviewList.append((review_id, posScore))
    # confidence: how far apart the two polarity scores are
    confList.append((review_id, abs(posScore - negScore)))
print "done with sentiment"
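# The reverse-slice-reverse trick above is easy to misread, so here is
# the same idea isolated in a helper. The function name chop_tail is
# ours; the original inlines this logic. It keeps roughly the last
# min_chars characters of a review, cut at a word boundary, and passes
# shorter reviews (or ones with no space that far back) through whole.
def chop_tail(text, min_chars=400):
    reversed_text = text[::-1]
    # index of the first space at least min_chars from the original end
    chop_point = reversed_text.find(' ', min_chars)
    if chop_point > min_chars:
        return reversed_text[:chop_point][::-1]
    return text

# usage:
#   chop_tail(('word ' * 200).strip())  # -> last ~400 chars, word-aligned
#   chop_tail('too short to chop')      # -> unchanged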