class FeaturesExtractor(object):
    """Assembles the numeric feature vector for a single Twitter user.

    The vector is built once in the constructor, in a fixed order:
    raw count attributes, token count, VADER sentiment scores
    (alphabetical by key), then the MRC psycholinguistic vector.
    """

    # User count attributes copied verbatim into the vector, in order.
    _COUNT_ATTRS = (
        'followers_count', 'following_count', 'total_tweets',
        'mentions_count', 'replies_count', 'hashtags_count',
        'links_count', 'words_count', 'tweets_count',
    )

    def __init__(self, user):
        self.user = user
        self.preprocessor = Preprocessor(self.user.tweets_text)

        # Raw per-user counters, in the fixed order above.
        self.features = [getattr(user, name) for name in self._COUNT_ATTRS]

        # Number of meaningful tokens after stop-word/URL removal.
        tokens = self.preprocessor.remove_stop_words_and_urls()
        self.features.append(len(tokens))

        # Sentiment scores; sorted keys keep the column order stable.
        sentiment = self.preprocessor.vader()
        self.features.extend(sentiment[key] for key in sorted(sentiment))

        # MRC database vector; path is relative to the process CWD
        # (NOTE(review): assumes the app is launched from the repo root).
        mrc_location = os.getcwd() + '/resources/mrc/1054'
        mrc = MrcService(mrc_location)
        self.features.extend(mrc.get_vector(tokens))

    def get_features(self):
        """Return the assembled feature list."""
        return self.features
def predict(self, username): prediction_dict = {} solr_user = self.get_solr_user(username) if solr_user: print 'got data' prediction_dict = solr_user else: user = self.ie.extract(username, self.tweet_count) user_fe = FeaturesExtractor(user) user_features = user_fe.get_features() predicted = self.pp.predict(user_features) preprocessor = Preprocessor(user.tweets_text) prediction_dict = { username: { 'f': user_features, 'o': predicted['o'], 'c': predicted['c'], 'e': predicted['e'], 'a': predicted['a'], 'n': predicted['n'], 'tweets': user.tweets_text, 'top_words': preprocessor.most_used_words(max_count=self.search_config['top_words']), 'hashtags': preprocessor.most_used_hashtags(max_count=self.search_config['top_hashtags']), 'bigrams': preprocessor.most_used_bigrams(max_count=self.search_config['top_bigrams']) } } if prediction_dict[username]['hashtags'] == '': prediction_dict[username]['hashtags'] = '$' if prediction_dict[username]['bigrams'] == '': prediction_dict[username]['bigrams'] = '$' if prediction_dict[username]['top_words'] == '': prediction_dict[username]['top_words'] = '$' if prediction_dict[username]['tweets'] == '': prediction_dict[username]['tweets'] = '$' solr_dict = FeaturesConverter.convert_features_to_solr(prediction_dict) self.solr.addUser(solr_dict[0]) similar = self.solr.getSimilarUsers(username, tf=self.search_config['tf'], df=self.search_config['df'], count=self.search_config['similar']) prediction_dict[username].update({ 'similar': similar, 'top_words': self.split_data(prediction_dict[username]['top_words']), 'hashtags': self.split_data(prediction_dict[username]['hashtags']), 'bigrams': self.split_bigrams(prediction_dict[username]['bigrams']), }) return prediction_dict
def load_users(self):
    """Populate self.data with features for up to self.train_count users.

    Users already indexed in Solr are loaded from there; otherwise the
    user's tweets are extracted, the feature vector computed, and the
    resulting document added to Solr. Users whose extraction or
    feature computation fails are removed from self.data afterwards.
    """
    loaded = 0
    to_remove = []
    for username in self.data:
        if loaded == self.train_count:
            break
        # Fast path: reuse the document already indexed in Solr.
        solr_data = self.get_solr_user(username)
        if solr_data:
            self.data[username] = solr_data[username]
            loaded += 1
            continue
        try:
            user = self.ie.extract(username, self.tweet_count)
            fe = FeaturesExtractor(user)
            self.data[username]['f'] = fe.get_features()
            if user.tweets_text == '':
                # No text at all: skip without counting this user.
                continue
            self.data[username]['tweets'] = user.tweets_text
            preprocessor = Preprocessor(user.tweets_text)
            self.data[username]['top_words'] = preprocessor.most_used_words(max_count=self.search_config['top_words'])
            self.data[username]['hashtags'] = preprocessor.most_used_hashtags(max_count=self.search_config['top_hashtags'])
            self.data[username]['bigrams'] = preprocessor.most_used_bigrams(max_count=self.search_config['top_bigrams'])
            # Solr cannot index empty fields; '$' is the project-wide
            # placeholder (replaces three duplicated if-blocks).
            for field in ('hashtags', 'bigrams', 'top_words'):
                if self.data[username][field] == '':
                    self.data[username][field] = '$'
            solr_dict = FeaturesConverter.convert_features_to_solr({username: self.data[username]})
            self.solr.addUser(solr_dict[0])
            loaded += 1
        except (ZeroDivisionError, TweepError):
            # Merged two identical handlers. Deletion is deferred:
            # removing keys while iterating the dict would break the loop.
            if username in self.data:
                to_remove.append(username)
    for uname in to_remove:
        if uname in self.data:
            del self.data[uname]