class FeaturesExtractor(object):
    """Build a flat numeric feature vector for a user.

    Vector layout (order matters — downstream models depend on it):
    user count statistics, token count after stop-word/URL removal,
    VADER sentiment scores in sorted-key order, then the MRC
    psycholinguistic vector derived from the tokens.
    """

    def __init__(self, user):
        self.user = user
        self.preprocessor = Preprocessor(self.user.tweets_text)

        # Fixed-order count statistics read directly from the user object.
        self.features = [
            user.followers_count,
            user.following_count,
            user.total_tweets,
            user.mentions_count,
            user.replies_count,
            user.hashtags_count,
            user.links_count,
            user.words_count,
            user.tweets_count,
        ]

        tokens = self.preprocessor.remove_stop_words_and_urls()
        self.features.append(len(tokens))

        # Iterating sorted keys keeps the sentiment features in a
        # deterministic order across runs.
        sentiment = self.preprocessor.vader()
        self.features.extend(sentiment[k] for k in sorted(sentiment))

        # os.path.join is safer than '+' string concatenation for paths.
        # NOTE(review): still resolves relative to the current working
        # directory, as the original did — confirm callers run from the
        # project root.
        mrc_location = os.path.join(os.getcwd(), 'resources', 'mrc', '1054')
        mrc = MrcService(mrc_location)
        self.features.extend(mrc.get_vector(tokens))

    def get_features(self):
        """Return the feature list computed at construction time."""
        return self.features
# Example n. 2
 def predict(self, username):
     """Return a personality prediction dict for *username*.

     Looks the user up in Solr first; on a miss, extracts tweets,
     computes features, predicts the five OCEAN scores, stores the
     record in Solr, and finally attaches similar users and split
     text fields.  Returns ``{username: {...}}``.
     """
     solr_user = self.get_solr_user(username)
     if solr_user:
         # Cache hit: reuse the record stored in Solr as-is.
         print('got data')
         prediction_dict = solr_user
     else:
         user = self.ie.extract(username, self.tweet_count)
         user_features = FeaturesExtractor(user).get_features()
         predicted = self.pp.predict(user_features)
         preprocessor = Preprocessor(user.tweets_text)
         profile = {
             'f': user_features,
             'o': predicted['o'],
             'c': predicted['c'],
             'e': predicted['e'],
             'a': predicted['a'],
             'n': predicted['n'],
             'tweets': user.tweets_text,
             'top_words': preprocessor.most_used_words(max_count=self.search_config['top_words']),
             'hashtags': preprocessor.most_used_hashtags(max_count=self.search_config['top_hashtags']),
             'bigrams': preprocessor.most_used_bigrams(max_count=self.search_config['top_bigrams']),
             }
         # Empty strings are replaced by the '$' placeholder before the
         # record is converted and indexed in Solr.
         for key in ('hashtags', 'bigrams', 'top_words', 'tweets'):
             if profile[key] == '':
                 profile[key] = '$'
         prediction_dict = {username: profile}
         solr_dict = FeaturesConverter.convert_features_to_solr(prediction_dict)
         self.solr.addUser(solr_dict[0])
     similar = self.solr.getSimilarUsers(username, tf=self.search_config['tf'], df=self.search_config['df'], count=self.search_config['similar'])
     prediction_dict[username].update({
         'similar': similar,
         'top_words': self.split_data(prediction_dict[username]['top_words']),
         'hashtags': self.split_data(prediction_dict[username]['hashtags']),
         'bigrams': self.split_bigrams(prediction_dict[username]['bigrams']),
         })
     return prediction_dict
# Example n. 3
    def load_users(self):
        """Populate ``self.data`` with features and text statistics.

        Stops after ``self.train_count`` users have been loaded.  Users
        already present in Solr are reused as-is; others are extracted
        from Twitter, featurized, and indexed in Solr.  Users whose
        extraction fails are removed from ``self.data`` afterwards.
        """
        loaded = 0
        to_remove = []
        for username in self.data:
            if loaded == self.train_count:
                break
            solr_data = self.get_solr_user(username)
            if solr_data:
                # Cache hit: reuse the record stored in Solr.
                self.data[username] = solr_data[username]
                loaded += 1
                continue

            try:
                user = self.ie.extract(username, self.tweet_count)
                self.data[username]['f'] = FeaturesExtractor(user).get_features()
                if user.tweets_text == '':
                    # No tweet text: keep the numeric features but skip the
                    # text-derived fields and do not count the user as loaded.
                    continue
                self.data[username]['tweets'] = user.tweets_text
                preprocessor = Preprocessor(user.tweets_text)
                self.data[username]['top_words'] = preprocessor.most_used_words(max_count=self.search_config['top_words'])
                self.data[username]['hashtags'] = preprocessor.most_used_hashtags(max_count=self.search_config['top_hashtags'])
                self.data[username]['bigrams'] = preprocessor.most_used_bigrams(max_count=self.search_config['top_bigrams'])
                # Empty strings are replaced by the '$' placeholder before
                # the record is converted and indexed in Solr.
                for key in ('hashtags', 'bigrams', 'top_words'):
                    if self.data[username][key] == '':
                        self.data[username][key] = '$'
                solr_dict = FeaturesConverter.convert_features_to_solr({username: self.data[username]})
                self.solr.addUser(solr_dict[0])
                loaded += 1
            except (ZeroDivisionError, TweepError):
                # Extraction/featurization failed for this user; defer the
                # deletion — the dict must not shrink while being iterated.
                to_remove.append(username)
        for uname in to_remove:
            del self.data[uname]