Example #1
    # requires: import collections, re and from math import log,
    # plus the project's TextProcess helper module
    def __init__(self, a_document_name):
        # term_counts: number of documents that contain a given term (key)
        self.term_counts = dict()
        self.reviews = dict()

        file = open(a_document_name)

        for review in file:
            # parse each JSON line once instead of re-parsing it per field
            review_json = TextProcess.read_line(review)
            review_id = review_json['review_id'].encode('utf8')
            review_content = review_json['text'].encode('utf8')
            self.reviews[review_id] = review_content
            tokens = TextProcess.stemming(TextProcess.stopword(TextProcess.tokenize(review_json)))
            # count each term at most once per document
            for token in set(tokens):
                if token in self.term_counts:
                    self.term_counts[token] += 1
                else:
                    self.term_counts[token] = 1
        file.close()

        # inverse document frequency: log(N / df); float() avoids the
        # silent integer division of Python 2
        self.idf = dict()
        for term, term_count in self.term_counts.iteritems():
            self.idf[term] = log(float(len(self.reviews)) / term_count)

        # one tf-idf vector per review, with one component per known term
        self.tfidf_dict = dict()
        for review_id, review in self.reviews.iteritems():
            tokens = TextProcess.stemming(TextProcess.stopword(re.findall(r'\w+', review.lower())))
            tf = collections.Counter(tokens)
            review_tfidfs = list()
            for term in self.term_counts:
                if term in tf:
                    # sublinear term-frequency scaling: 1 + log(tf)
                    review_tfidfs.append((1 + log(tf[term])) * self.idf[term])
                else:
                    review_tfidfs.append(0)
            self.tfidf_dict[review_id] = review_tfidfs
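
These examples all depend on a TextProcess helper module that is not shown, and its call sites are not entirely consistent (tokenize is variously given a parsed JSON dict, a raw string, or an extra length argument). A minimal stand-in matching Example #1's usage, with guessed signatures, could look like the sketch below; it is an assumption inferred from the call sites, not the project's real module.

# Hypothetical stand-in for the unseen TextProcess module.
# Signatures are inferred from the call sites above, not from the real code.
import json
import re

from nltk.corpus import stopwords        # assumes the NLTK stopwords corpus
from nltk.stem.porter import PorterStemmer   # has been downloaded

_stopwords = set(stopwords.words('english'))
_stemmer = PorterStemmer()

def read_line(line):
    # each input line holds one JSON-encoded record
    return json.loads(line)

def tokenize(review_json):
    # lowercase word tokens pulled from the record's text field
    return re.findall(r'\w+', review_json['text'].lower())

def stopword(tokens):
    # drop common English stopwords
    return [t for t in tokens if t not in _stopwords]

def stemming(tokens):
    # reduce each token to its Porter stem
    return [_stemmer.stem(t) for t in tokens]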
Example #2
    def calc_tweet_scores(self):
        '''calculate relevancy of tweets
               - 20% from favorite count
               - 30% from retweet count
               - 30% from the author's followers_count
               - 20% from the user's total number of tweets
           (these amounts may be adjusted later)'''
        for word in self.tweets:
            totalscore = 0
            for t in self.tweets[word]:
                if word in t['text']:
                    # log-damped weighted components; note the weights in use
                    # differ from the docstring percentages, and the
                    # statuses_count term is currently disabled
                    score = 0
                    score += math.log(t['favorite_count'] + 1, 2) * 0.25
                    score += math.log(t['retweet_count'] + 1, 2) * 0.025
                    score += math.log(t['user']['followers_count'] + 1, 2) * 0.05
                    #score += math.log(t['user']['statuses_count'] + 1, 2) * 0.05
                    totalscore += score
                    tokens = TextProcess.tokenize(t['text'])
                    text = ' '.join(tokens).strip()
                    # this assumes t['geo'] is populated; tweets without
                    # coordinates would raise a TypeError here
                    self.scores.append({'id': t['id'],
                                        'text': unicode(text, errors='ignore'),
                                        'score': score,
                                        'geo': t['geo']['coordinates']})
            if totalscore >= 0:
                # keep the 100 highest-scoring tweets, then order them by the
                # second geo coordinate
                sortedList = sorted(self.scores, key=lambda k: k['score'], reverse=True)[0:100]
                sortedList2 = sorted(sortedList, key=lambda k: k['geo'][1], reverse=True)
                if word not in self.sortedTweets:
                    self.sortedTweets[word] = sortedList2
                else:
                    self.sortedTweets[word] += sortedList2
                self.scores = []
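
The scoring formula is easy to sanity-check in isolation. Below is a standalone version of the weighting used above, run on a made-up tweet; the dict layout simply mirrors the fields the method indexes.

import math

def tweet_score(t):
    # same log-damped weights as calc_tweet_scores above
    score = math.log(t['favorite_count'] + 1, 2) * 0.25
    score += math.log(t['retweet_count'] + 1, 2) * 0.025
    score += math.log(t['user']['followers_count'] + 1, 2) * 0.05
    return score

sample = {'favorite_count': 100, 'retweet_count': 50,
          'user': {'followers_count': 1000}}
print(tweet_score(sample))  # ~2.30; the favorite term dominates at these weights

The +1 inside each log keeps a zero count contributing 0 to that component instead of raising a math domain error.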
Example #3
reviewContent = {}
reviewList = []
confList = []
print "starting sentiment"

for line in open('../../data/yelp_academic_dataset_review.json', 'r'):
    review_json = TextProcess.read_line(line)
    review_id = review_json['review_id']

    # look for a word boundary at least 400 characters from the end of the
    # review; if one exists, keep only the last chopPoint characters so the
    # truncated text still starts cleanly on a word
    chopPoint = review_json['text'][::-1].find(' ', 400)
    if chopPoint > 400:
        reviewContent[review_id] = review_json['text'][::-1][:chopPoint][::-1]
        features = word_features(TextProcess.tokenize(review_json, chopPoint))
    else:
        reviewContent[review_id] = review_json['text']
        features = word_features(TextProcess.tokenize(review_json, len(review_json['text'])))

    # build the feature set once and score it with both label probabilities
    posScore = classifier.posScore(features)
    negScore = classifier.negScore(features)

    reviewList.append((review_id, posScore))
    confList.append((review_id, abs(posScore - negScore)))

print "done with sentiment"
#classifier.show_most_informative_features(200)  # optional: inspect the model
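
posScore and negScore are not standard NLTK classifier methods, so the classifier here is presumably a project-specific wrapper. A plausible sketch, assuming an NLTK NaiveBayesClassifier trained on 'pos'/'neg' labels and a simple bag-of-words word_features (the wrapper, the label names, and word_features are all guesses):

import nltk

def word_features(tokens):
    # bag-of-words feature dict, the shape NLTK's classifiers expect
    return dict((token, True) for token in tokens)

class SentimentClassifier(object):
    # hypothetical wrapper; the original classifier object is not shown
    def __init__(self, labeled_featuresets):
        # labeled_featuresets: list of (feature_dict, 'pos' or 'neg') pairs
        self.nb = nltk.NaiveBayesClassifier.train(labeled_featuresets)

    def posScore(self, features):
        return self.nb.prob_classify(features).prob('pos')

    def negScore(self, features):
        return self.nb.prob_classify(features).prob('neg')

    def show_most_informative_features(self, n):
        self.nb.show_most_informative_features(n)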